Search in sources :

Example 16 with MetaDataFormat

use of org.apache.sysml.runtime.matrix.MetaDataFormat in project incubator-systemml by apache.

the class DataPartitioner method createPartitionedMatrixObject.

/**
 * Creates a partitioned matrix object based on the given input matrix object,
 * according to the specified split format. The input matrix can be in-memory
 * or still on HDFS and the partitioned output matrix is written to HDFS. The
 * created matrix object can be used transparently for obtaining the full matrix
 * or reading 1 or multiple partitions based on given index ranges.
 *
 * <p>The input is returned unchanged if the partition format is
 * {@code NONE}, or if {@code force} is false and both dimensions are below
 * {@code Hop.CPThreshold}. When {@code force} is false, this method may also
 * change the partitioner's format from row-/column-wise to the corresponding
 * block-wise format.
 *
 * @param in input matrix object
 * @param out output matrix object
 * @param force if false, try to optimize
 * @return partitioned matrix object, or {@code in} if no partitioning was applied
 */
public MatrixObject createPartitionedMatrixObject(MatrixObject in, MatrixObject out, boolean force) {
    // check for naive partitioning
    if (_format == PDataPartitionFormat.NONE) {
        return in;
    }
    // analyze input matrix object
    MetaDataFormat meta = (MetaDataFormat) in.getMetaData();
    MatrixCharacteristics mc = meta.getMatrixCharacteristics();
    InputInfo ii = meta.getInputInfo();
    OutputInfo oi = meta.getOutputInfo();
    long rows = mc.getRows();
    long cols = mc.getCols();
    int brlen = mc.getRowsPerBlock();
    int bclen = mc.getColsPerBlock();
    long nonZeros = mc.getNonZeros();
    // the cell count is computed in double arithmetic because rows * cols
    // can overflow a long for very large matrices
    double sparsity = mc.dimsKnown(true) ? ((double) nonZeros) / ((double) rows * (double) cols) : 1.0;
    // try to optimize, if format not forced
    if (!force) {
        // check lower bound of useful data partitioning (or matrix already fits in mem)
        if (rows < Hop.CPThreshold && cols < Hop.CPThreshold) {
            return in;
        }
        // check for changing to blockwise representations
        if (_format == PDataPartitionFormat.ROW_WISE && cols < Hop.CPThreshold) {
            LOG.debug("Changing format from " + PDataPartitionFormat.ROW_WISE + " to " + PDataPartitionFormat.ROW_BLOCK_WISE + ".");
            _format = PDataPartitionFormat.ROW_BLOCK_WISE;
        }
        if (_format == PDataPartitionFormat.COLUMN_WISE && rows < Hop.CPThreshold) {
            // log the format that is actually assigned below
            LOG.debug("Changing format from " + PDataPartitionFormat.COLUMN_WISE + " to " + PDataPartitionFormat.COLUMN_BLOCK_WISE + ".");
            _format = PDataPartitionFormat.COLUMN_BLOCK_WISE;
        }
    }
    // check changing to binarycell in case of sparse cols (robustness)
    boolean convertBlock2Cell = false;
    if (ii == InputInfo.BinaryBlockInputInfo && _allowBinarycell && _format == PDataPartitionFormat.COLUMN_WISE && sparsity < SPARSITY_CELL_THRESHOLD) {
        LOG.debug("Changing partition outputinfo from binaryblock to binarycell due to sparsity=" + sparsity);
        oi = OutputInfo.BinaryCellOutputInfo;
        convertBlock2Cell = true;
    }
    // prepare filenames and cleanup if required
    String fnameNew = out.getFileName();
    try {
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    // core partitioning (depending on subclass); note that the input info
    // passed here is still the one of the original input
    partitionMatrix(in, fnameNew, ii, oi, rows, cols, brlen, bclen);
    // create output matrix object
    out.setPartitioned(_format, _n);
    MatrixCharacteristics mcNew = new MatrixCharacteristics(rows, cols, brlen, bclen);
    mcNew.setNonZeros(nonZeros);
    // the written partitions are binarycell, so the output must be read as such
    if (convertBlock2Cell) {
        ii = InputInfo.BinaryCellInputInfo;
    }
    MetaDataFormat metaNew = new MetaDataFormat(mcNew, oi, ii);
    out.setMetaData(metaNew);
    return out;
}
Also used : OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 17 with MetaDataFormat

use of org.apache.sysml.runtime.matrix.MetaDataFormat in project incubator-systemml by apache.

the class CSVReblockSPInstruction method processInstruction.

/**
 * Executes the CSV reblock for a matrix or frame input: validates that the
 * input is CSV, sets the output characteristics, and then either performs an
 * in-memory reblock or registers the reblocked RDD for the output variable.
 *
 * @param ec execution context, cast to a {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
    final SparkExecutionContext sparkCtx = (SparkExecutionContext) ec;
    final String inName = input1.getName();

    // only CSV input is supported by this instruction
    final CacheableData<?> inData = sparkCtx.getCacheableData(inName);
    final InputInfo inInfo = ((MetaDataFormat) inData.getMetaData()).getInputInfo();
    if (inInfo != InputInfo.CSVInputInfo) {
        throw new DMLRuntimeException("The given InputInfo is not implemented for " + "CSVReblockSPInstruction:" + inInfo);
    }

    // output takes the input dimensions with this instruction's block sizes
    final MatrixCharacteristics inChars = sparkCtx.getMatrixCharacteristics(inName);
    final String outName = output.getName();
    final MatrixCharacteristics outChars = sparkCtx.getMatrixCharacteristics(outName);
    outChars.set(inChars.getRows(), inChars.getCols(), _brlen, _bclen);

    // in-memory reblock (w/ lazy spark context, potential for latency reduction)
    if (Recompiler.checkCPReblock(sparkCtx, inName)) {
        if (input1.getDataType() == DataType.MATRIX) {
            Recompiler.executeInMemoryMatrixReblock(sparkCtx, inName, outName);
        } else if (input1.getDataType() == DataType.FRAME) {
            Recompiler.executeInMemoryFrameReblock(sparkCtx, inName, outName);
        }
        return;
    }

    // distributed csvreblock; the handle stays null for any other data type
    JavaPairRDD<?, ?> reblocked = null;
    if (input1.getDataType() == DataType.MATRIX) {
        reblocked = processMatrixCSVReblockInstruction(sparkCtx, outChars);
    } else if (input1.getDataType() == DataType.FRAME) {
        reblocked = processFrameCSVReblockInstruction(sparkCtx, outChars, ((FrameObject) inData).getSchema());
    }

    // bind the output RDD to the output variable and record its lineage
    sparkCtx.setRDDHandleForVariable(outName, reblocked);
    sparkCtx.addLineageRDD(outName, inName);
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 18 with MetaDataFormat

use of org.apache.sysml.runtime.matrix.MetaDataFormat in project incubator-systemml by apache.

the class InterProceduralAnalysis method createOutputMatrix.

/**
 * Builds a double-valued matrix object whose metadata holds the given
 * dimensions and number of non-zeros, using the configured block size for
 * both rows and columns per block.
 *
 * @param dim1 number of rows
 * @param dim2 number of columns
 * @param nnz number of non-zeros
 * @return new matrix object carrying the described metadata
 */
private static MatrixObject createOutputMatrix(long dim1, long dim2, long nnz) {
    MatrixCharacteristics dims = new MatrixCharacteristics(
        dim1, dim2,
        ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize(),
        nnz);
    MatrixObject result = new MatrixObject(ValueType.DOUBLE, null);
    // only the characteristics are set; both format arguments stay null
    result.setMetaData(new MetaDataFormat(dims, null, null));
    return result;
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 19 with MetaDataFormat

use of org.apache.sysml.runtime.matrix.MetaDataFormat in project incubator-systemml by apache.

the class Recompiler method checkCPReblock.

/**
 * Returns true iff (1) all instructions are reblock instructions and (2) all
 * individual reblock operations fit in the current memory budget.
 *
 * <p>In addition, the job is rejected if any reblock instruction string ends
 * with {@code "false"} (no output of empty blocks) and, outside local mode,
 * if a non-dirty text/cell/CSV input file on HDFS exceeds the
 * parallelism-scaled size threshold.
 *
 * @param inst instruction
 * @param inputs the inputs
 * @return true if and only if all instructions are reblock instructions and all
 * individual reblock operations fit in the current memory budget
 * @throws IOException if IOException occurs
 */
public static boolean checkCPReblock(MRJobInstruction inst, MatrixObject[] inputs) throws IOException {
    final boolean localMode = InfrastructureAnalyzer.isLocalMode();

    // check only shuffle inst: any other non-empty instruction string disqualifies
    final String[] nonShuffle = {
        inst.getIv_randInstructions(),
        inst.getIv_recordReaderInstructions(),
        inst.getIv_instructionsInMapper(),
        inst.getIv_aggInstructions(),
        inst.getIv_otherInstructions()
    };
    for (String other : nonShuffle) {
        if (other != null && other.length() > 0) {
            return false;
        }
    }

    // check only reblock inst
    for (String part : inst.getIv_shuffleInstructions().split(Lop.INSTRUCTION_DELIMITOR)) {
        final String opcode = InstructionUtils.getOpCode(part);
        if (!(opcode.equals(ReBlock.OPCODE) || opcode.equals(CSVReBlock.OPCODE))) {
            return false;
        }
    }

    // reblocks without output of empty blocks (trailing "false") are rejected:
    // per the original note, this would be counter-productive because any
    // export from CP would reintroduce the empty blocks
    for (String part : inst.getIv_shuffleInstructions().split(Lop.INSTRUCTION_DELIMITOR)) {
        if (InstructionUtils.getOpCode(part).equals(ReBlock.OPCODE) && part.endsWith("false")) {
            return false;
        }
    }

    // check recompile memory budget
    for (MatrixObject input : inputs) {
        final long rows = input.getNumRows();
        final long cols = input.getNumColumns();

        if (rows == -1 || cols == -1) {
            // unknown dimensions: conservative check with the file size instead
            final long size = MapReduceTool.getFilesizeOnHDFS(new Path(input.getFileName()));
            if (size > CP_CSV_REBLOCK_UNKNOWN_THRESHOLD_SIZE || CP_CSV_REBLOCK_UNKNOWN_THRESHOLD_SIZE > OptimizerUtils.getLocalMemBudget()) {
                return false;
            }
            continue;
        }

        // default case (known dimensions): estimate in-memory size from sparsity
        final long nnz = input.getNnz();
        final double sp = OptimizerUtils.getSparsity(rows, cols, nnz);
        final double mem = MatrixBlock.estimateSizeInMemory(rows, cols, sp);
        if (!(OptimizerUtils.isValidCPDimensions(rows, cols) && OptimizerUtils.isValidCPMatrixSize(rows, cols, sp)) || mem >= OptimizerUtils.getLocalMemBudget()) {
            return false;
        }
    }

    // file-size threshold for text-like inputs
    // NOTE: this does not apply to local mode because there text is read single-threaded as well
    if (!localMode) {
        for (MatrixObject input : inputs) {
            final InputInfo info = ((MetaDataFormat) input.getMetaData()).getInputInfo();
            final boolean cellOrTextFormat = info == InputInfo.TextCellInputInfo
                || info == InputInfo.MatrixMarketInputInfo
                || info == InputInfo.CSVInputInfo
                || info == InputInfo.BinaryCellInputInfo;
            if (cellOrTextFormat && !input.isDirty()) {
                // get file size on hdfs (as indicator for estimated read time)
                final long fileSize = MapReduceTool.getFilesizeOnHDFS(new Path(input.getFileName()));
                // compute cp reblock size threshold based on available parallelism
                final long cpThreshold = CP_REBLOCK_THRESHOLD_SIZE * OptimizerUtils.getParallelTextReadParallelism();
                if (fileSize > cpThreshold) {
                    return false;
                }
            }
        }
    }
    return true;
}
Also used : Path(org.apache.hadoop.fs.Path) MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject)

Example 20 with MetaDataFormat

use of org.apache.sysml.runtime.matrix.MetaDataFormat in project incubator-systemml by apache.

the class CacheableData method toString.

/**
 * Returns a single-line description of this object of the form
 * {@code <SimpleClassName>: <hdfs file name>, <metadata>, <dirty|not-dirty>}.
 *
 * <p>For format metadata, the metadata part consists of the matrix
 * characteristics followed by the input info (or {@code null} if absent);
 * if no metadata is set it is {@code null, null}. Any exception raised while
 * rendering the metadata is logged and the metadata part is left incomplete.
 *
 * @return string representation of this object
 */
@Override
public String toString() {
    StringBuilder str = new StringBuilder();
    str.append(getClass().getSimpleName());
    str.append(": ");
    str.append(_hdfsFileName).append(", ");
    if (_metaData instanceof MetaDataNumItemsByEachReducer) {
        str.append("NumItemsByEachReducerMetaData");
    } else {
        try {
            MetaDataFormat md = (MetaDataFormat) _metaData;
            if (md != null) {
                MatrixCharacteristics mc = _metaData.getMatrixCharacteristics();
                str.append(mc.toString());
                // always separate the characteristics from the input info,
                // also when the input info is missing
                str.append(", ");
                InputInfo ii = md.getInputInfo();
                if (ii == null) {
                    str.append("null");
                } else {
                    str.append(InputInfo.inputInfoToString(ii));
                }
            } else {
                str.append("null, null");
            }
        } catch (Exception ex) {
            // best effort: toString must not fail, so the problem is only logged
            LOG.error(ex);
        }
    }
    str.append(", ");
    str.append(isDirty() ? "dirty" : "not-dirty");
    return str.toString();
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) MetaDataNumItemsByEachReducer(org.apache.sysml.runtime.matrix.MetaDataNumItemsByEachReducer) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Aggregations

MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat): 54, MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 47, DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 28, MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 26, OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo): 17, IOException (java.io.IOException): 12, ValueType (org.apache.sysml.parser.Expression.ValueType): 10, InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 10, RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 9, MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 9, FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject): 7, FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 5, Path (org.apache.hadoop.fs.Path): 4, LongWritable (org.apache.hadoop.io.LongWritable): 4, Text (org.apache.hadoop.io.Text): 4, Data (org.apache.sysml.runtime.instructions.cp.Data): 4, ScalarObject (org.apache.sysml.runtime.instructions.cp.ScalarObject): 4, ConvertStringToLongTextPair (org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair): 4, CopyTextInputFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction): 4, DataOp (org.apache.sysml.hops.DataOp): 3