Use of org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock in project incubator-systemml by apache.
The class PMapmmSPInstruction, method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // get inputs
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    JavaPairRDD<MatrixIndexes, MatrixBlock> in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
    MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(input1.getName());
    // This avoids errors such as java.lang.UnsupportedOperationException:
    // Cannot change storage level of an RDD after it was already assigned a level.
    // Ideally, we should ensure that we do not redundantly call persist on the same RDD.
    StorageLevel pmapmmStorageLevel = StorageLevel.MEMORY_AND_DISK();
    // cache the right-hand side because it is accessed many times
    in2 = in2.repartition(sec.getSparkContext().defaultParallelism()).persist(pmapmmStorageLevel);
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
    for (int i = 0; i < mc1.getRows(); i += NUM_ROWBLOCKS * mc1.getRowsPerBlock()) {
        // create broadcast for rdd partition
        JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = in1
            .filter(new IsBlockInRange(i + 1, i + NUM_ROWBLOCKS * mc1.getRowsPerBlock(), 1, mc1.getCols(), mc1))
            .mapToPair(new PMapMMRebaseBlocksFunction(i / mc1.getRowsPerBlock()));
        int rlen = (int) Math.min(mc1.getRows() - i, NUM_ROWBLOCKS * mc1.getRowsPerBlock());
        PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(
            rdd, rlen, (int) mc1.getCols(), mc1.getRowsPerBlock(), mc1.getColsPerBlock(), -1L);
        Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);
        // matrix multiplication
        JavaPairRDD<MatrixIndexes, MatrixBlock> rdd2 = in2.flatMapToPair(new PMapMMFunction(bpmb, i / mc1.getRowsPerBlock()));
        rdd2 = RDDAggregateUtils.sumByKeyStable(rdd2, false);
        rdd2.persist(pmapmmStorageLevel).count();
        bpmb.unpersist(false);
        if (out == null)
            out = rdd2;
        else
            out = out.union(rdd2);
    }
    // cache final result
    out = out.persist(pmapmmStorageLevel);
    out.count();
    // put output RDD handle into symbol table
    sec.setRDDHandleForVariable(output.getName(), out);
    sec.addLineageRDD(output.getName(), input1.getName());
    sec.addLineageRDD(output.getName(), input2.getName());
    // update output statistics if not inferred
    updateBinaryMMOutputMatrixCharacteristics(sec, true);
}
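The loop above streams the left input through the broadcast in chunks of NUM_ROWBLOCKS row blocks, so only one chunk of in1 has to be collected and broadcast at a time. A minimal, self-contained sketch of that chunking arithmetic follows; the row count, block size, and NUM_ROWBLOCKS values are made-up assumptions for illustration.

public class ChunkingSketch {
    public static void main(String[] args) {
        final long rows = 10000;        // hypothetical number of rows in in1
        final int rowsPerBlock = 1000;  // hypothetical block size (brlen)
        final int NUM_ROWBLOCKS = 4;    // chunk size in row blocks, as in the loop above
        // iterate over chunks of NUM_ROWBLOCKS row blocks
        for (long i = 0; i < rows; i += (long) NUM_ROWBLOCKS * rowsPerBlock) {
            // rows covered by this chunk (the last chunk may be smaller)
            long rlen = Math.min(rows - i, (long) NUM_ROWBLOCKS * rowsPerBlock);
            // 1-based row range filtered out of in1 for this chunk
            long fromRow = i + 1;
            long toRow = i + (long) NUM_ROWBLOCKS * rowsPerBlock;
            // offset used to rebase block indexes so each chunk starts at block row 1
            long blockOffset = i / rowsPerBlock;
            System.out.printf("chunk: rows [%d,%d], rlen=%d, rebase offset=%d blocks%n",
                fromRow, toRow, rlen, blockOffset);
        }
    }
}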
Use of org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock in project incubator-systemml by apache.
The class Tsmm2SPInstruction, method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // get input
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    // execute tsmm2 instruction
    // step 1: first pass of X, filter-collect-broadcast excess blocks
    JavaPairRDD<MatrixIndexes, MatrixBlock> tmp1 = in
        .filter(new IsBlockInRange(_type.isLeft() ? 1 : mc.getRowsPerBlock() + 1, mc.getRows(),
            _type.isLeft() ? mc.getColsPerBlock() + 1 : 1, mc.getCols(), mc))
        .mapToPair(new ShiftTSMMIndexesFunction(_type));
    PartitionedBlock<MatrixBlock> pmb = SparkExecutionContext.toPartitionedMatrixBlock(tmp1,
        (int) (_type.isLeft() ? mc.getRows() : mc.getRows() - mc.getRowsPerBlock()),
        (int) (_type.isLeft() ? mc.getCols() - mc.getColsPerBlock() : mc.getCols()),
        mc.getRowsPerBlock(), mc.getColsPerBlock(), -1L);
    Broadcast<PartitionedBlock<MatrixBlock>> bpmb = sec.getSparkContext().broadcast(pmb);
    // step 2: second pass of X, compute tsmm/mapmm and aggregate result blocks
    int outputDim = (int) (_type.isLeft() ? mc.getCols() : mc.getRows());
    if (OptimizerUtils.estimateSize(outputDim, outputDim) <= 32 * 1024 * 1024) {
        // default: <= 32MB
        // output large blocks and reduceAll to avoid skew on combineByKey
        JavaRDD<MatrixBlock> tmp2 = in.map(new RDDTSMM2ExtFunction(bpmb, _type, outputDim, (int) mc.getRowsPerBlock()));
        MatrixBlock out = RDDAggregateUtils.sumStable(tmp2);
        // put output block into symbol table (no lineage because single block)
        // this also includes implicit maintenance of matrix characteristics
        sec.setMatrixOutput(output.getName(), out, getExtendedOpcode());
    } else {
        // output individual output blocks and aggregate by key (no action)
        JavaPairRDD<MatrixIndexes, MatrixBlock> tmp2 = in.flatMapToPair(new RDDTSMM2Function(bpmb, _type));
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDAggregateUtils.sumByKeyStable(tmp2, false);
        // put output RDD handle into symbol table
        sec.getMatrixCharacteristics(output.getName()).set(outputDim, outputDim, mc.getRowsPerBlock(), mc.getColsPerBlock());
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    }
}
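The branch above keys off an estimate of the output size: a tsmm output is outputDim x outputDim, and when it is small enough (here, at most 32MB) it is cheaper to sum all partial blocks into a single driver-side block than to aggregate by key. A rough sketch of that decision is below; estimateDenseSize is a hypothetical stand-in for OptimizerUtils.estimateSize, assuming a dense worst case of 8 bytes per cell.

public class OutputSizeSketch {
    // hypothetical stand-in for OptimizerUtils.estimateSize: dense worst case,
    // 8 bytes per double cell (ignores headers and sparse representations)
    static long estimateDenseSize(long rows, long cols) {
        return rows * cols * 8L;
    }

    public static void main(String[] args) {
        long threshold = 32L * 1024 * 1024; // 32MB, as in the branch above
        for (int dim : new int[] { 1000, 2000, 3000 }) {
            boolean singleBlock = estimateDenseSize(dim, dim) <= threshold;
            System.out.printf("%dx%d output -> %s path%n", dim, dim,
                singleBlock ? "single-block (driver-side sum)" : "distributed (sumByKey)");
        }
    }
}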
Use of org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock in project incubator-systemml by apache.
The class SparkExecutionContext, method toPartitionedMatrixBlock.
public static PartitionedBlock<MatrixBlock> toPartitionedMatrixBlock(JavaPairRDD<MatrixIndexes, MatrixBlock> rdd,
        int rlen, int clen, int brlen, int bclen, long nnz) {
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    PartitionedBlock<MatrixBlock> out = new PartitionedBlock<>(rlen, clen, brlen, bclen);
    List<Tuple2<MatrixIndexes, MatrixBlock>> list = rdd.collect();
    // copy blocks one-at-a-time into output matrix block
    for (Tuple2<MatrixIndexes, MatrixBlock> keyval : list) {
        // unpack index-block pair
        MatrixIndexes ix = keyval._1();
        MatrixBlock block = keyval._2();
        out.setBlock((int) ix.getRowIndex(), (int) ix.getColumnIndex(), block);
    }
    if (DMLScript.STATISTICS) {
        Statistics.accSparkCollectTime(System.nanoTime() - t0);
        Statistics.incSparkCollectCount(1);
    }
    return out;
}
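Note that toPartitionedMatrixBlock calls rdd.collect(), so the entire RDD is pulled to the driver; callers such as PMapmmSPInstruction and Tsmm2SPInstruction only hand it a filtered slice that is expected to fit in driver memory. A minimal sketch of the same collect-and-index pattern without Spark follows; the string payloads and block indexes are made up for illustration.

public class PartitionedCollectSketch {
    public static void main(String[] args) {
        // dimensions and block sizes, as passed to toPartitionedMatrixBlock
        int rlen = 5, clen = 4, brlen = 2, bclen = 2;
        // number of block rows/columns: ceil(rlen/brlen) and ceil(clen/bclen)
        int nbr = (rlen + brlen - 1) / brlen;
        int nbc = (clen + bclen - 1) / bclen;
        String[][] out = new String[nbr][nbc];
        // stand-in for rdd.collect(): 1-based (row block, col block) indexes and payloads
        int[][] indexes = { { 1, 1 }, { 3, 2 } };
        String[] blocks = { "B11", "B32" };
        // copy blocks one at a time into the partitioned output, like setBlock
        for (int k = 0; k < blocks.length; k++)
            out[indexes[k][0] - 1][indexes[k][1] - 1] = blocks[k];
        System.out.println(out[0][0] + " " + out[2][1]); // prints: B11 B32
    }
}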
Use of org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock in project incubator-systemml by apache.
The class SparkExecutionContext, method rCleanupLineageObject.
@SuppressWarnings({ "rawtypes", "unchecked" })
private void rCleanupLineageObject(LineageObject lob) throws IOException {
    // abort recursive cleanup if still consumers
    if (lob.getNumReferences() > 0)
        return;
    // abort if still reachable through matrix object (via back references for
    // robustness in function calls and to prevent repeated scans of the symbol table)
    if (lob.hasBackReference())
        return;
    // cleanup current lineage object,
    // incl deferred hdfs file removal (only if metadata set by cleanup call)
    if (lob instanceof RDDObject) {
        RDDObject rdd = (RDDObject) lob;
        int rddID = rdd.getRDD().id();
        cleanupRDDVariable(rdd.getRDD());
        if (rdd.getHDFSFilename() != null) {
            // deferred file removal
            MapReduceTool.deleteFileWithMTDIfExistOnHDFS(rdd.getHDFSFilename());
        }
        if (rdd.isParallelizedRDD())
            _parRDDs.deregisterRDD(rddID);
    } else if (lob instanceof BroadcastObject) {
        PartitionedBroadcast pbm = ((BroadcastObject) lob).getBroadcast();
        if (pbm != null) // robustness for evictions
            for (Broadcast<PartitionedBlock> bc : pbm.getBroadcasts())
                cleanupBroadcastVariable(bc);
        CacheableData.addBroadcastSize(-((BroadcastObject) lob).getSize());
    }
    // recursively process lineage children
    for (LineageObject c : lob.getLineageChilds()) {
        c.decrementNumReferences();
        rCleanupLineageObject(c);
    }
}
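The method implements reference-counted cleanup over the lineage DAG: an object is freed only when no consumer references remain, and freeing it releases one reference on each child, which may cascade. A self-contained sketch of that pattern, leaving out the back-reference check and the RDD/broadcast specifics (Node is a hypothetical stand-in for LineageObject):

import java.util.ArrayList;
import java.util.List;

public class LineageCleanupSketch {
    // hypothetical stand-in for LineageObject: a node with a reference count and children
    static class Node {
        final String name;
        int refs;
        final List<Node> children = new ArrayList<>();
        Node(String name) { this.name = name; }
    }

    // mirrors rCleanupLineageObject: free a node only once nothing references it,
    // then release each child's reference and recurse
    static void cleanup(Node n) {
        if (n.refs > 0)
            return; // still consumers, abort
        System.out.println("freeing " + n.name);
        for (Node c : n.children) {
            c.refs--;
            cleanup(c);
        }
    }

    public static void main(String[] args) {
        Node shared = new Node("shared");
        shared.refs = 2; // referenced by both a and b
        Node a = new Node("a"), b = new Node("b");
        a.children.add(shared);
        b.children.add(shared);
        cleanup(a); // frees a, decrements shared to 1 (shared survives)
        cleanup(b); // frees b, decrements shared to 0 and frees it
    }
}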
Use of org.apache.sysml.runtime.instructions.spark.data.PartitionedBlock in project incubator-systemml by apache.
The class RDDSortUtils, method sortDataByValMemSort.
/**
* This function collects and sorts the value column in memory and then broadcasts the resulting sort indexes.
*
* @param val value as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
* @param data data as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
* @param asc if true, sort ascending
* @param rlen number of rows
* @param clen number of columns
* @param brlen number of rows in a block
* @param bclen number of columns in a block
* @param sec spark execution context
* @param r_op reorg operator
* @return data as {@code JavaPairRDD<MatrixIndexes, MatrixBlock>}
*/
public static JavaPairRDD<MatrixIndexes, MatrixBlock> sortDataByValMemSort(JavaPairRDD<MatrixIndexes, MatrixBlock> val,
        JavaPairRDD<MatrixIndexes, MatrixBlock> data, boolean asc, long rlen, long clen,
        int brlen, int bclen, SparkExecutionContext sec, ReorgOperator r_op) {
    // collect orderby column for in-memory sorting
    MatrixBlock inMatBlock = SparkExecutionContext.toMatrixBlock(val, (int) rlen, 1, brlen, bclen, -1);
    // in-memory sort operation (w/ index return: source index in target position)
    ReorgOperator lrop = new ReorgOperator(new SortIndex(1, !asc, true));
    MatrixBlock sortedIx = (MatrixBlock) inMatBlock.reorgOperations(lrop, new MatrixBlock(), -1, -1, -1);
    // flip sort indices from <source ix in target pos> to <target ix in source pos>
    MatrixBlock sortedIxSrc = new MatrixBlock(sortedIx.getNumRows(), 1, false);
    for (int i = 0; i < sortedIx.getNumRows(); i++)
        sortedIxSrc.quickSetValue((int) sortedIx.quickGetValue(i, 0) - 1, 0, i + 1);
    // broadcast index vector
    PartitionedBlock<MatrixBlock> pmb = new PartitionedBlock<>(sortedIxSrc, brlen, bclen);
    Broadcast<PartitionedBlock<MatrixBlock>> _pmb = sec.getSparkContext().broadcast(pmb);
    // sort data with broadcast index vector
    JavaPairRDD<MatrixIndexes, RowMatrixBlock> ret = data.mapPartitionsToPair(
        new ShuffleMatrixBlockRowsInMemFunction(rlen, brlen, _pmb));
    return RDDAggregateUtils.mergeRowsByKey(ret);
}
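The flip in the middle of the method inverts a 1-based permutation: sortedIx holds the source row for each target position, while the shuffle function needs the target position for each source row. A minimal sketch of that inversion with a made-up permutation:

public class PermutationFlipSketch {
    public static void main(String[] args) {
        // sortedIx[t] = 1-based source row that ends up at target position t+1
        int[] sortedIx = { 3, 1, 2 };
        // flip to sortedIxSrc[s] = 1-based target position of source row s+1,
        // exactly like the quickSetValue loop above
        int[] sortedIxSrc = new int[sortedIx.length];
        for (int i = 0; i < sortedIx.length; i++)
            sortedIxSrc[sortedIx[i] - 1] = i + 1;
        // row 1 moves to position 2, row 2 to 3, row 3 to 1 -> [2, 3, 1]
        System.out.println(java.util.Arrays.toString(sortedIxSrc));
    }
}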