Search in sources :

Example 26 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

The method singleBlockIndexing of the class MatrixIndexingSPInstruction.

/**
 * Resolves an indexing request whose result lies inside one input block.
 *
 * The block is fetched with a key lookup rather than a scan: on partitioned
 * inputs this touches a single partition only, which matters most for
 * out-of-core datasets where entire partitions are read, not just keys as
 * in the in-memory setting.
 *
 * @param in1     binary-block input RDD
 * @param mcIn    characteristics of the input (supplies the block sizes)
 * @param mcOut   characteristics of the output (supplies the expected dimensions)
 * @param ixrange requested row/column range
 * @return the looked-up block itself when its dimensions equal the output
 *         dimensions, otherwise the result of slicing it
 * @throws DMLRuntimeException if the lookup does not return exactly one block
 */
private static MatrixBlock singleBlockIndexing(JavaPairRDD<MatrixIndexes, MatrixBlock> in1, MatrixCharacteristics mcIn, MatrixCharacteristics mcOut, IndexRange ixrange) throws DMLRuntimeException {
    final int rowsPerBlock = mcIn.getRowsPerBlock();
    final int colsPerBlock = mcIn.getColsPerBlock();

    //block coordinates derived from the upper-left corner of the range
    final long rowBlockIx = UtilFunctions.computeBlockIndex(ixrange.rowStart, rowsPerBlock);
    final long colBlockIx = UtilFunctions.computeBlockIndex(ixrange.colStart, colsPerBlock);

    final List<MatrixBlock> hits = in1.lookup(new MatrixIndexes(rowBlockIx, colBlockIx));
    if (hits.size() != 1) {
        throw new DMLRuntimeException("Block lookup returned " + hits.size() + " blocks (expected 1).");
    }

    final MatrixBlock block = hits.get(0);
    final boolean coversWholeBlock = block.getNumRows() == mcOut.getRows()
        && block.getNumColumns() == mcOut.getCols();
    if (coversWholeBlock) {
        //reference the full block, no copy
        return block;
    }

    //slice out the sub-block using block-local cell positions
    return block.sliceOperations(
        UtilFunctions.computeCellInBlock(ixrange.rowStart, rowsPerBlock),
        UtilFunctions.computeCellInBlock(ixrange.rowEnd, rowsPerBlock),
        UtilFunctions.computeCellInBlock(ixrange.colStart, colsPerBlock),
        UtilFunctions.computeCellInBlock(ixrange.colEnd, colsPerBlock),
        new MatrixBlock());
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 27 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

The method processInstruction of the class MapmmSPInstruction.

/**
 * Runs the mapmm operation: one input is read as a binary-block RDD, the
 * other as a partitioned broadcast, and the result is registered under the
 * output variable name.
 *
 * The cache type decides the roles: if {@code type.isRight()}, input1 is the
 * RDD and input2 the broadcast; otherwise the roles are reversed. The roles
 * may be flipped up front when the flipped configuration yields a larger
 * repartitioning count.
 *
 * Depending on {@code _aggtype}, the result is either a single aggregated
 * block stored directly as matrix output, or an RDD handle registered with
 * lineage to both inputs.
 *
 * @param ec execution context; cast to {@link SparkExecutionContext}
 * @throws DMLRuntimeException declared by this method; no explicit throw is visible here
 */
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    //local copy of the cache type, because it may be flipped below
    CacheType type = _type;
    String rddVar = type.isRight() ? input1.getName() : input2.getName();
    String bcastVar = type.isRight() ? input2.getName() : input1.getName();
    MatrixCharacteristics mcRdd = sec.getMatrixCharacteristics(rddVar);
    MatrixCharacteristics mcBc = sec.getMatrixCharacteristics(bcastVar);
    //get input rdd
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(rddVar);
    //check whether repartitioning - including a potential flip of the rdd and broadcast
    //inputs - is required to ensure moderately sized output partitions (2GB limitation)
    if (requiresFlatMapFunction(type, mcBc) && requiresRepartitioning(type, mcRdd, mcBc, in1.getNumPartitions())) {
        int numParts = getNumRepartitioning(type, mcRdd, mcBc);
        int numParts2 = getNumRepartitioning(type.getFlipped(), mcBc, mcRdd);
        if (numParts2 > numParts) {
            //flip required: re-derive every role-dependent local from the flipped type
            type = type.getFlipped();
            rddVar = type.isRight() ? input1.getName() : input2.getName();
            bcastVar = type.isRight() ? input2.getName() : input1.getName();
            mcRdd = sec.getMatrixCharacteristics(rddVar);
            mcBc = sec.getMatrixCharacteristics(bcastVar);
            in1 = sec.getBinaryBlockRDDHandleForVariable(rddVar);
            //note: after the flip, bcastVar names the former rdd input and rddVar the former broadcast input
            LOG.warn("Mapmm: Switching rdd ('" + bcastVar + "') and broadcast ('" + rddVar + "') inputs " + "for repartitioning because this allows better control of output partition " + "sizes (" + numParts + " < " + numParts2 + ").");
        }
    }
    //get broadcast input (uses the possibly flipped variable name)
    PartitionedBroadcast<MatrixBlock> in2 = sec.getBroadcastForVariable(bcastVar);
    //empty input block filter (skipped when empty output blocks are requested)
    if (!_outputEmpty)
        in1 = in1.filter(new FilterNonEmptyBlocksFunction());
    //execute mapmm and aggregation if necessary and put output into symbol table
    if (_aggtype == SparkAggType.SINGLE_BLOCK) {
        //map every input block to an output block and sum them all into one block
        JavaRDD<MatrixBlock> out = in1.map(new RDDMapMMFunction2(type, in2));
        MatrixBlock out2 = RDDAggregateUtils.sumStable(out);
        //put output block into symbol table (no lineage because single block)
        //this also includes implicit maintenance of matrix characteristics
        sec.setMatrixOutput(output.getName(), out2);
    } else //MULTI_BLOCK or NONE
    {
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
        if (requiresFlatMapFunction(type, mcBc)) {
            //flatmap path: optionally repartition the rdd input first
            if (requiresRepartitioning(type, mcRdd, mcBc, in1.getNumPartitions())) {
                int numParts = getNumRepartitioning(type, mcRdd, mcBc);
                LOG.warn("Mapmm: Repartition input rdd '" + rddVar + "' from " + in1.getNumPartitions() + " to " + numParts + " partitions to satisfy size restrictions of output partitions.");
                in1 = in1.repartition(numParts);
            }
            out = in1.flatMapToPair(new RDDFlatMapMMFunction(type, in2));
        } else if (preservesPartitioning(mcRdd, type))
            //partition-wise map; 'true' tells Spark the partitioning is preserved
            out = in1.mapPartitionsToPair(new RDDMapMMPartitionFunction(type, in2), true);
        else
            //plain block-wise map
            out = in1.mapToPair(new RDDMapMMFunction(type, in2));
        //empty output block filter
        if (!_outputEmpty)
            out = out.filter(new FilterNonEmptyBlocksFunction());
        //aggregate by key only for MULTI_BLOCK; NONE leaves the mapped blocks as they are
        if (_aggtype == SparkAggType.MULTI_BLOCK)
            out = RDDAggregateUtils.sumByKeyStable(out, false);
        //put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), rddVar);
        sec.addLineageBroadcast(output.getName(), bcastVar);
        //update output statistics if not inferred
        updateBinaryMMOutputMatrixCharacteristics(sec, true);
    }
}
Also used : FilterNonEmptyBlocksFunction(org.apache.sysml.runtime.instructions.spark.functions.FilterNonEmptyBlocksFunction) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) CacheType(org.apache.sysml.lops.MapMult.CacheType) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)

Example 28 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

The method processInstruction of the class CumulativeOffsetSPInstruction.

/**
 * Applies a cumulative operation with per-block offsets: the aggregates in
 * input2 are split into offsets, joined with the data blocks of input1 by
 * block index, and combined through the configured unary/binary operators.
 * The resulting RDD is registered under the output variable with lineage
 * to both inputs.
 *
 * @param ec execution context; cast to {@link SparkExecutionContext}
 * @throws DMLRuntimeException declared by this method; no explicit throw is visible here
 */
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    final SparkExecutionContext sparkCtx = (SparkExecutionContext) ec;

    //dimensions are read from the characteristics of the aggregate input (input2)
    final MatrixCharacteristics aggStats = sparkCtx.getMatrixCharacteristics(input2.getName());
    final long numRows = aggStats.getRows();
    final int rowsPerBlock = aggStats.getRowsPerBlock();

    //fetch both inputs as binary-block RDDs
    final String dataVar = input1.getName();
    final String aggVar = input2.getName();
    final JavaPairRDD<MatrixIndexes, MatrixBlock> dataBlocks = sparkCtx.getBinaryBlockRDDHandleForVariable(dataVar);
    final JavaPairRDD<MatrixIndexes, MatrixBlock> aggBlocks = sparkCtx.getBinaryBlockRDDHandleForVariable(aggVar);

    //prepare aggregates (cumsplit of offsets)
    final JavaPairRDD<MatrixIndexes, MatrixBlock> offsets =
        aggBlocks.flatMapToPair(new RDDCumSplitFunction(_initValue, numRows, rowsPerBlock));

    //execute cumulative offset (apply cumulative op w/ offsets)
    final JavaPairRDD<MatrixIndexes, MatrixBlock> result =
        dataBlocks.join(offsets).mapValues(new RDDCumOffsetFunction(_uop, _bop));
    updateUnaryOutputMatrixCharacteristics(sparkCtx);

    //register the output handle and its lineage in the symbol table
    sparkCtx.setRDDHandleForVariable(output.getName(), result);
    sparkCtx.addLineageRDD(output.getName(), dataVar);
    sparkCtx.addLineageRDD(output.getName(), aggVar);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 29 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

The method processInstruction of the class CumulativeAggregateSPInstruction.

/**
 * Computes the per-block aggregates for a cumulative operation: every input
 * block is mapped through the configured unary aggregate operator, the
 * mapped blocks are merged by key, and the merged RDD is registered under
 * the output variable with lineage to input1.
 *
 * @param ec execution context; cast to {@link SparkExecutionContext}
 * @throws DMLRuntimeException declared by this method; no explicit throw is visible here
 */
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    final SparkExecutionContext sparkCtx = (SparkExecutionContext) ec;
    final String inputVar = input1.getName();

    //dimensions and block sizes of the input matrix
    final MatrixCharacteristics inputStats = sparkCtx.getMatrixCharacteristics(inputVar);
    final long numRows = inputStats.getRows();
    final int rowsPerBlock = inputStats.getRowsPerBlock();
    final int colsPerBlock = inputStats.getColsPerBlock();

    //fetch the input as binary-block RDD
    final JavaPairRDD<MatrixIndexes, MatrixBlock> blocks = sparkCtx.getBinaryBlockRDDHandleForVariable(inputVar);

    //execute unary aggregate (w/ implicit drop correction), then merge by key
    final AggregateUnaryOperator aggOp = (AggregateUnaryOperator) _optr;
    final JavaPairRDD<MatrixIndexes, MatrixBlock> partials =
        blocks.mapToPair(new RDDCumAggFunction(aggOp, numRows, rowsPerBlock, colsPerBlock));
    final JavaPairRDD<MatrixIndexes, MatrixBlock> merged = RDDAggregateUtils.mergeByKey(partials, false);

    //register the output handle and its lineage in the symbol table
    sparkCtx.setRDDHandleForVariable(output.getName(), merged);
    sparkCtx.addLineageRDD(output.getName(), inputVar);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) AggregateUnaryOperator(org.apache.sysml.runtime.matrix.operators.AggregateUnaryOperator) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 30 with MatrixIndexes

use of org.apache.sysml.runtime.matrix.data.MatrixIndexes in project incubator-systemml by apache.

The method configure of the class IndexSortStitchupMapper.

/**
 * Initializes the mapper state from the job configuration: the parsed sort
 * offsets, the row count and rows-per-block read for index 0, and the
 * reusable index/block buffers.
 *
 * @param job job configuration to read the settings from
 */
@Override
public void configure(JobConf job) {
    super.configure(job);

    //offsets arrive as a string stored under the SortMR key
    final String offsetSpec = job.get(SortMR.SORT_INDEXES_OFFSETS);
    _offsets = parseOffsets(offsetSpec);

    //index 0 - presumably the first (only) input matrix of the job; confirm against MRJobConfiguration
    final byte inputIndex = (byte) 0;
    _rlen = MRJobConfiguration.getNumRows(job, inputIndex);
    _brlen = MRJobConfiguration.getNumRowsPerBlock(job, inputIndex);

    //buffers allocated once here; block is sized from _brlen
    _tmpIx = new MatrixIndexes();
    _tmpBlk = new MatrixBlock((int) _brlen, 1, false);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes)

Aggregations

MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)144 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)121 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)57 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)44 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)38 Path (org.apache.hadoop.fs.Path)21 SequenceFile (org.apache.hadoop.io.SequenceFile)21 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)20 IOException (java.io.IOException)19 ArrayList (java.util.ArrayList)18 FileSystem (org.apache.hadoop.fs.FileSystem)18 MatrixCell (org.apache.sysml.runtime.matrix.data.MatrixCell)18 Tuple2 (scala.Tuple2)17 IndexedMatrixValue (org.apache.sysml.runtime.matrix.mapred.IndexedMatrixValue)15 JobConf (org.apache.hadoop.mapred.JobConf)11 MatrixValue (org.apache.sysml.runtime.matrix.data.MatrixValue)10 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)9 File (java.io.File)7 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)7 RecordReader (org.apache.hadoop.mapred.RecordReader)6