Search in sources :

Example 1 with PartitionedBlock

use of org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock in project incubator-systemml by apache.

the class PMapmmSPInstruction method processInstruction.

/**
 * Runs the partitioned map-side matrix multiplication. The left-hand input is processed in
 * slices of {@code NUM_ROWBLOCKS} row blocks: each slice is collected, broadcast, and
 * multiplied against the cached right-hand input, and the per-slice results are unioned
 * into the output RDD.
 *
 * @param ec execution context; cast to {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sparkCtx = (SparkExecutionContext) ec;
    // fetch both operands as binary-block RDDs, plus the dimensions of the left-hand one
    JavaPairRDD<MatrixIndexes, MatrixBlock> lhs = sparkCtx.getBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> rhs = sparkCtx.getBinaryBlockRDDHandleForVariable(input2.getName());
    MatrixCharacteristics lhsMc = sparkCtx.getMatrixCharacteristics(input1.getName());
    // One storage level is shared by every persist call below. This avoids errors such as
    // java.lang.UnsupportedOperationException: Cannot change storage level of an RDD after it
    // was already assigned a level. Ideally, we should ensure that we do not redundantly call
    // persist on the same RDD.
    StorageLevel level = StorageLevel.MEMORY_AND_DISK();
    // the right-hand side is accessed once per slice, so repartition and cache it up front
    rhs = rhs.repartition(sparkCtx.getSparkContext().defaultParallelism()).persist(level);
    JavaPairRDD<MatrixIndexes, MatrixBlock> result = null;
    for (int rowOffset = 0; rowOffset < lhsMc.getRows(); rowOffset += NUM_ROWBLOCKS * lhsMc.getRowsPerBlock()) {
        // restrict the left-hand side to the current row slice and rebase its block indexes
        JavaPairRDD<MatrixIndexes, MatrixBlock> slice = lhs
            .filter(new IsBlockInRange(rowOffset + 1, rowOffset + NUM_ROWBLOCKS * lhsMc.getRowsPerBlock(), 1, lhsMc.getCols(), lhsMc))
            .mapToPair(new PMapMMRebaseBlocksFunction(rowOffset / lhsMc.getRowsPerBlock()));
        // the final slice may contain fewer rows than a full one
        int sliceRows = (int) Math.min(lhsMc.getRows() - rowOffset, NUM_ROWBLOCKS * lhsMc.getRowsPerBlock());
        // collect the slice and broadcast it
        PartitionedBlock<MatrixBlock> sliceBlocks = SparkExecutionContext.toPartitionedMatrixBlock(slice, sliceRows, (int) lhsMc.getCols(), lhsMc.getRowsPerBlock(), lhsMc.getColsPerBlock(), -1L);
        Broadcast<PartitionedBlock<MatrixBlock>> sliceBc = sparkCtx.getSparkContext().broadcast(sliceBlocks);
        // matrix multiplication against the broadcast slice, then sum the blocks per key
        JavaPairRDD<MatrixIndexes, MatrixBlock> partial = RDDAggregateUtils.sumByKeyStable(
            rhs.flatMapToPair(new PMapMMFunction(sliceBc, rowOffset / lhsMc.getRowsPerBlock())), false);
        // force evaluation of the partial result before the broadcast is unpersisted
        partial.persist(level).count();
        sliceBc.unpersist(false);
        // accumulate the per-slice results
        result = (result == null) ? partial : result.union(partial);
    }
    // cache final result and trigger its computation
    result = result.persist(level);
    result.count();
    // register the output RDD and its lineage to both inputs in the symbol table
    sparkCtx.setRDDHandleForVariable(output.getName(), result);
    sparkCtx.addLineageRDD(output.getName(), input1.getName());
    sparkCtx.addLineageRDD(output.getName(), input2.getName());
    // update output statistics if not inferred
    updateBinaryMMOutputMatrixCharacteristics(sparkCtx, true);
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) IsBlockInRange(org.apache.sysml.runtime.instructions.spark.functions.IsBlockInRange) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) PartitionedBlock(org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) StorageLevel(org.apache.spark.storage.StorageLevel)

Example 2 with PartitionedBlock

use of org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock in project incubator-systemml by apache.

the class Tsmm2SPInstruction method processInstruction.

/**
 * Executes the tsmm2 instruction in two passes over the input: the first pass filters,
 * collects and broadcasts the excess blocks, the second pass computes tsmm/mapmm per block
 * and aggregates the result blocks.
 *
 * @param ec execution context; cast to {@code SparkExecutionContext}
 */
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sparkCtx = (SparkExecutionContext) ec;
    // look up the input RDD and its dimensions
    JavaPairRDD<MatrixIndexes, MatrixBlock> x = sparkCtx.getBinaryBlockRDDHandleForVariable(input1.getName());
    MatrixCharacteristics xMc = sparkCtx.getMatrixCharacteristics(input1.getName());
    boolean left = _type.isLeft();

    // step 1: first pass of X, filter-collect-broadcast excess blocks
    // (left: every block past the first column block; otherwise: every block past the first row block)
    IsBlockInRange excessRange = new IsBlockInRange(
        left ? 1 : xMc.getRowsPerBlock() + 1, xMc.getRows(),
        left ? xMc.getColsPerBlock() + 1 : 1, xMc.getCols(), xMc);
    JavaPairRDD<MatrixIndexes, MatrixBlock> excess = x.filter(excessRange).mapToPair(new ShiftTSMMIndexesFunction(_type));
    int excessRows = (int) (left ? xMc.getRows() : xMc.getRows() - xMc.getRowsPerBlock());
    int excessCols = (int) (left ? xMc.getCols() - xMc.getColsPerBlock() : xMc.getCols());
    PartitionedBlock<MatrixBlock> excessBlocks = SparkExecutionContext.toPartitionedMatrixBlock(excess, excessRows, excessCols, xMc.getRowsPerBlock(), xMc.getColsPerBlock(), -1L);
    Broadcast<PartitionedBlock<MatrixBlock>> excessBc = sparkCtx.getSparkContext().broadcast(excessBlocks);

    // step 2: second pass of X, compute tsmm/mapmm and aggregate result blocks
    int outputDim = (int) (left ? xMc.getCols() : xMc.getRows());
    // default path applies when the estimated size of the square output is <=32MB
    boolean singleBlockOutput = OptimizerUtils.estimateSize(outputDim, outputDim) <= 32 * 1024 * 1024;
    if (singleBlockOutput) {
        // output large blocks and reduceAll to avoid skew on combineByKey
        JavaRDD<MatrixBlock> perBlock = x.map(new RDDTSMM2ExtFunction(excessBc, _type, outputDim, (int) xMc.getRowsPerBlock()));
        MatrixBlock result = RDDAggregateUtils.sumStable(perBlock);
        // put output block into symbol table (no lineage because single block);
        // this also includes implicit maintenance of matrix characteristics
        sparkCtx.setMatrixOutput(output.getName(), result, getExtendedOpcode());
    } else {
        // output individual output blocks and aggregate by key (no action)
        JavaPairRDD<MatrixIndexes, MatrixBlock> perBlock = x.flatMapToPair(new RDDTSMM2Function(excessBc, _type));
        JavaPairRDD<MatrixIndexes, MatrixBlock> result = RDDAggregateUtils.sumByKeyStable(perBlock, false);
        // set the output dimensions, then put the output RDD handle and its lineage into the symbol table
        sparkCtx.getMatrixCharacteristics(output.getName()).set(outputDim, outputDim, xMc.getRowsPerBlock(), xMc.getColsPerBlock());
        sparkCtx.setRDDHandleForVariable(output.getName(), result);
        sparkCtx.addLineageRDD(output.getName(), input1.getName());
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) IsBlockInRange(org.apache.sysml.runtime.instructions.spark.functions.IsBlockInRange) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) PartitionedBlock(org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)

Example 3 with PartitionedBlock

use of org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock in project incubator-systemml by apache.

the class SparkExecutionContext method toPartitionedMatrixBlock.

/**
 * Collects all blocks of the given RDD and places them into a newly created
 * {@code PartitionedBlock}. When statistics are enabled, the elapsed time and one collect
 * operation are recorded.
 *
 * @param rdd binary-block RDD to collect
 * @param rlen number of rows passed to the partitioned block
 * @param clen number of columns passed to the partitioned block
 * @param brlen number of rows in a block
 * @param bclen number of columns in a block
 * @param nnz number of non-zeros (not referenced by this method)
 * @return partitioned block holding every collected block at its block index
 */
public static PartitionedBlock<MatrixBlock> toPartitionedMatrixBlock(JavaPairRDD<MatrixIndexes, MatrixBlock> rdd, int rlen, int clen, int brlen, int bclen, long nnz) {
    // only take a timestamp when statistics are being gathered
    long startNanos = 0;
    if (DMLScript.STATISTICS) {
        startNanos = System.nanoTime();
    }
    PartitionedBlock<MatrixBlock> result = new PartitionedBlock<>(rlen, clen, brlen, bclen);
    // collect the RDD and copy the blocks one at a time into the output
    for (Tuple2<MatrixIndexes, MatrixBlock> entry : rdd.collect()) {
        MatrixIndexes blockIx = entry._1();
        result.setBlock((int) blockIx.getRowIndex(), (int) blockIx.getColumnIndex(), entry._2());
    }
    if (DMLScript.STATISTICS) {
        Statistics.accSparkCollectTime(System.nanoTime() - startNanos);
        Statistics.incSparkCollectCount(1);
    }
    return result;
}
Also used : PartitionedBlock(org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CompressedMatrixBlock(org.apache.sysml.runtime.compress.CompressedMatrixBlock) Tuple2(scala.Tuple2) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes)

Example 4 with PartitionedBlock

use of org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock in project incubator-systemml by apache.

the class SparkExecutionContext method rCleanupLineageObject.

/**
 * Cleans up the given lineage object and then recurses into its lineage children,
 * decrementing each child's reference count before visiting it. Nothing is done while the
 * object still has references or a back reference.
 *
 * @param lob lineage object to clean up
 * @throws IOException declared for the cleanup and file-removal calls made here
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
private void rCleanupLineageObject(LineageObject lob) throws IOException {
    // abort the recursive cleanup if there are still consumers, or if the object has a back
    // reference (for robustness in function calls and to prevent repeated scans of the symbol table)
    if (lob.getNumReferences() > 0 || lob.hasBackReference()) {
        return;
    }
    // clean up the current lineage object,
    // incl deferred hdfs file removal (only if metadata set by cleanup call)
    if (lob instanceof RDDObject) {
        RDDObject rddObj = (RDDObject) lob;
        // remember the id before the RDD is cleaned up
        int id = rddObj.getRDD().id();
        cleanupRDDVariable(rddObj.getRDD());
        if (rddObj.getHDFSFilename() != null) {
            // deferred file removal
            MapReduceTool.deleteFileWithMTDIfExistOnHDFS(rddObj.getHDFSFilename());
        }
        if (rddObj.isParallelizedRDD()) {
            _parRDDs.deregisterRDD(id);
        }
    } else if (lob instanceof BroadcastObject) {
        BroadcastObject bcObj = (BroadcastObject) lob;
        PartitionedBroadcast parts = bcObj.getBroadcast();
        // null check for robustness for evictions
        if (parts != null) {
            for (Broadcast<PartitionedBlock> bc : parts.getBroadcasts()) {
                cleanupBroadcastVariable(bc);
            }
        }
        // subtract this broadcast's size from the tracked total
        CacheableData.addBroadcastSize(-bcObj.getSize());
    }
    // recursively process lineage children
    for (LineageObject child : lob.getLineageChilds()) {
        child.decrementNumReferences();
        rCleanupLineageObject(child);
    }
}
Also used : PartitionedBlock(org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock) PartitionedBroadcast(org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) LineageObject(org.apache.sysml.runtime.instructions.spark.data.LineageObject) Checkpoint(org.apache.sysml.lops.Checkpoint) BroadcastObject(org.apache.sysml.runtime.instructions.spark.data.BroadcastObject)

Example 5 with PartitionedBlock

use of org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock in project incubator-systemml by apache.

the class RDDSortUtils method sortDataByValMemSort.

/**
 * Sorts the data by collecting the value column, sorting it in memory, and broadcasting the
 * resulting index vector, which is then used to shuffle the rows of the data.
 *
 * @param val value as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param data data as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 * @param asc if true, sort ascending
 * @param rlen number of rows
 * @param clen number of columns (not referenced by this method)
 * @param brlen number of rows in a block
 * @param bclen number of columns in a block
 * @param sec spark execution context
 * @param r_op reorg operator (not referenced by this method)
 * @return data as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
 */
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortDataByValMemSort(JavaPairRDD<MatrixIndexes, MatrixBlock> val, JavaPairRDD<MatrixIndexes, MatrixBlock> data, boolean asc, long rlen, long clen, int brlen, int bclen, SparkExecutionContext sec, ReorgOperator r_op) {
    // collect the order-by column into a single block for in-memory sorting
    MatrixBlock valColumn = SparkExecutionContext.toMatrixBlock(val, (int) rlen, 1, brlen, bclen, -1);
    // in-memory sort operation (w/ index return: source index in target position)
    ReorgOperator sortOp = new ReorgOperator(new SortIndex(1, !asc, true));
    MatrixBlock srcIxByTarget = (MatrixBlock) valColumn.reorgOperations(sortOp, new MatrixBlock(), -1, -1, -1);
    // flip sort indices from <source ix in target pos> to <target ix in source pos>
    MatrixBlock targetIxBySrc = new MatrixBlock(srcIxByTarget.getNumRows(), 1, false);
    for (int target = 0; target < srcIxByTarget.getNumRows(); target++) {
        // stored indices are 1-based, block positions are 0-based
        int src = (int) srcIxByTarget.quickGetValue(target, 0) - 1;
        targetIxBySrc.quickSetValue(src, 0, target + 1);
    }
    // broadcast index vector
    PartitionedBlock<MatrixBlock> ixBlocks = new PartitionedBlock<>(targetIxBySrc, brlen, bclen);
    Broadcast<PartitionedBlock<MatrixBlock>> ixBroadcast = sec.getSparkContext().broadcast(ixBlocks);
    // sort data with broadcast index vector, then merge the shuffled rows by key
    JavaPairRDD<MatrixIndexes, RowMatrixBlock> shuffledRows = data.mapPartitionsToPair(new ShuffleMatrixBlockRowsInMemFunction(rlen, brlen, ixBroadcast));
    return RDDAggregateUtils.mergeRowsByKey(shuffledRows);
}
Also used : PartitionedBlock(org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) RowMatrixBlock(org.apache.sysml.runtime.instructions.spark.data.RowMatrixBlock) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) SortIndex(org.apache.sysml.runtime.functionobjects.SortIndex) ReorgOperator(org.apache.sysml.runtime.matrix.operators.ReorgOperator) RowMatrixBlock(org.apache.sysml.runtime.instructions.spark.data.RowMatrixBlock)

Aggregations

PartitionedBlock (org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock)7 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)5 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)4 Checkpoint (org.apache.sysml.lops.Checkpoint)3 BroadcastObject (org.apache.sysml.runtime.instructions.spark.data.BroadcastObject)3 PartitionedBroadcast (org.apache.sysml.runtime.instructions.spark.data.PartitionedBroadcast)3 Broadcast (org.apache.spark.broadcast.Broadcast)2 CompressedMatrixBlock (org.apache.sysml.runtime.compress.CompressedMatrixBlock)2 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)2 IsBlockInRange (org.apache.sysml.runtime.instructions.spark.functions.IsBlockInRange)2 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)2 StorageLevel (org.apache.spark.storage.StorageLevel)1 FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject)1 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)1 SortIndex (org.apache.sysml.runtime.functionobjects.SortIndex)1 LineageObject (org.apache.sysml.runtime.instructions.spark.data.LineageObject)1 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)1 RowMatrixBlock (org.apache.sysml.runtime.instructions.spark.data.RowMatrixBlock)1 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)1 ReorgOperator (org.apache.sysml.runtime.matrix.operators.ReorgOperator)1