Use of org.apache.sysml.runtime.instructions.spark.functions.CreateSparseBlockFunction in project incubator-systemml by apache.
The class CheckpointSPInstruction, method processInstruction.
@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
    SparkExecutionContext sec = (SparkExecutionContext) ec;

    // Step 1: early abort on non-existing or dummy inputs
    // -------
    // (the input variable may not exist, e.g., if it is only created in a branch that is
    // never entered; in that case skipping the checkpoint is valid)
    if (sec.getVariable(input1.getName()) == null || sec.getVariable(input1.getName()) instanceof BooleanObject) {
        //add a dummy entry to the input, which will be immediately overwritten by the null output.
        sec.setVariable(input1.getName(), new BooleanObject(false));
        sec.setVariable(output.getName(), new BooleanObject(false));
        return;
    }
    //get input rdd handle (for matrix or frame)
    JavaPairRDD<?, ?> in = sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
    MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());

    // Step 2: Checkpoint given rdd (only if currently in a different storage level, to prevent redundancy)
    // -------
    // Note that persist is lazy, i.e., it is only triggered on demand by a subsequent rdd action.
    // This prevents unnecessary overhead if the dataset is only consumed by cp operations.
    JavaPairRDD<?, ?> out = null;
    if (!in.getStorageLevel().equals(_level)) {
        //(trigger coalesce if the intended number of partitions is exceeded by 20%
        //and the rdd is not hash partitioned, to avoid losing the existing partitioner)
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcIn, in);
        boolean coalesce = (1.2 * numPartitions < in.getNumPartitions() && !SparkUtils.isHashPartitioned(in));

        //checkpoint pre-processing rdd operations
        if (coalesce) {
            //merge partitions without shuffle if there are too many partitions
            out = in.coalesce(numPartitions);
        } else {
            //apply a narrow shallow copy to allow for short-circuit collects
            if (input1.getDataType() == DataType.MATRIX)
                out = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) in, false);
            else if (input1.getDataType() == DataType.FRAME)
                out = ((JavaPairRDD<Long, FrameBlock>) in).mapValues(new CopyFrameBlockFunction(false));
        }

        //convert mcsr into memory-efficient csr if potentially sparse
        if (input1.getDataType() == DataType.MATRIX && OptimizerUtils.checkSparseBlockCSRConversion(mcIn)
            && !_level.equals(Checkpoint.SER_STORAGE_LEVEL)) {
            out = ((JavaPairRDD<MatrixIndexes, MatrixBlock>) out).mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
        }

        //actual checkpoint into given storage level
        out = out.persist(_level);
    } else {
        //pass-through
        out = in;
    }
    // Step 3: In-place update of input matrix/frame rdd handle and set as output
    // -------
    // We use this in-place approach for two reasons. First, it is correct because our checkpoint
    // injection rewrites guarantee that after checkpoint instructions there are no consumers on the
    // given input. Second, it is beneficial because otherwise we need to pass in-memory objects and
    // filenames to the new matrix object in order to prevent repeated reads from hdfs and unnecessary
    // caching and subsequent collects. Note that in-place update requires us to explicitly handle
    // lineage information in order to prevent cycles on cleanup.
    CacheableData<?> cd = sec.getCacheableData(input1.getName());
    if (out != in) {                                            //prevent unnecessary lineage info
        RDDObject inro = cd.getRDDHandle();                     //guaranteed to exist (see above)
        RDDObject outro = new RDDObject(out, output.getName()); //create new rdd object
        outro.setCheckpointRDD(true);                           //mark as checkpointed
        outro.addLineageChild(inro);                            //keep lineage to prevent cycles on cleanup
        cd.setRDDHandle(outro);
    }
    sec.setVariable(output.getName(), cd);
}
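The essence of this usage is the optional conversion of each MatrixBlock from the default MCSR sparse format into the more memory-efficient CSR format, followed by a lazy persist. The following condensed sketch isolates that pattern; the class name CheckpointCsrSketch and the method checkpointWithCsr are hypothetical names introduced here for illustration, and the import paths are assumptions based on the incubator-systemml package layout.

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.storage.StorageLevel;
import org.apache.sysml.hops.OptimizerUtils;
import org.apache.sysml.runtime.instructions.spark.functions.CreateSparseBlockFunction;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import org.apache.sysml.runtime.matrix.data.SparseBlock;

public class CheckpointCsrSketch {
    //converts per-block sparse data to CSR if the optimizer considers the matrix
    //potentially sparse, then marks the rdd for caching in the given storage level
    public static JavaPairRDD<MatrixIndexes, MatrixBlock> checkpointWithCsr(
        JavaPairRDD<MatrixIndexes, MatrixBlock> in, MatrixCharacteristics mc, StorageLevel level)
    {
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = in;
        if (OptimizerUtils.checkSparseBlockCSRConversion(mc))
            out = out.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
        //persist is lazy; the blocks are materialized by the next rdd action
        return out.persist(level);
    }
}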
Use of org.apache.sysml.runtime.instructions.spark.functions.CreateSparseBlockFunction in project incubator-systemml by apache.
The class SparkExecutionContext, method repartitionAndCacheMatrixObject.
@SuppressWarnings("unchecked")
public void repartitionAndCacheMatrixObject(String var) throws DMLRuntimeException {
    MatrixObject mo = getMatrixObject(var);
    MatrixCharacteristics mcIn = mo.getMatrixCharacteristics();

    //double check size to avoid unnecessary spark context creation
    if (!OptimizerUtils.exceedsCachingThreshold(mo.getNumColumns(), (double) OptimizerUtils.estimateSizeExactSparsity(mcIn)))
        return;

    //get input rdd and default storage level
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
        getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);

    //avoid unnecessary caching of input in order to reduce memory pressure
    if (mo.getRDDHandle().allowsShortCircuitRead() && isRDDMarkedForCaching(in.id()) && !isRDDCached(in.id())) {
        in = (JavaPairRDD<MatrixIndexes, MatrixBlock>) ((RDDObject) mo.getRDDHandle().getLineageChilds().get(0)).getRDD();

        //investigate issue of unnecessarily large number of partitions
        int numPartitions = SparkUtils.getNumPreferredPartitions(mcIn, in);
        if (numPartitions < in.getNumPartitions())
            in = in.coalesce(numPartitions);
    }
    //repartition rdd (force creation of a shuffled rdd via merge); note: no deep copy is needed
    //even though this is executed on the original data, because there are no key duplicates
    //and hence no actual merge of blocks
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = RDDAggregateUtils.mergeByKey(in, false);

    //convert mcsr into memory-efficient csr if potentially sparse
    if (OptimizerUtils.checkSparseBlockCSRConversion(mcIn)) {
        out = out.mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
    }

    //persist rdd in default storage level and force materialization via count
    out.persist(Checkpoint.DEFAULT_STORAGE_LEVEL).count();

    //create new rdd handle, in-place of the current matrix object
    RDDObject inro = mo.getRDDHandle();        //guaranteed to exist (see above)
    RDDObject outro = new RDDObject(out, var); //create new rdd object
    outro.setCheckpointRDD(true);              //mark as checkpointed
    outro.addLineageChild(inro);               //keep lineage to prevent cycles on cleanup
    mo.setRDDHandle(outro);
}
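Both usages finish by swapping the rdd handle of the cacheable data object in place while preserving lineage, as motivated in the Step 3 comment of the first snippet. A minimal sketch of that handle swap follows; the class RddHandleSketch and the helper swapRddHandle are hypothetical names for illustration, and the import paths are again assumptions based on the incubator-systemml package layout.

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.sysml.runtime.controlprogram.caching.CacheableData;
import org.apache.sysml.runtime.instructions.spark.data.RDDObject;

public class RddHandleSketch {
    //replaces the rdd handle of a cacheable data object with a handle to the
    //checkpointed rdd, keeping the old handle as a lineage child so that cleanup
    //does not run into cycles
    public static void swapRddHandle(CacheableData<?> cd, JavaPairRDD<?, ?> out, String varName) {
        RDDObject inro = cd.getRDDHandle();            //existing handle, assumed to exist
        RDDObject outro = new RDDObject(out, varName); //new handle for the checkpointed rdd
        outro.setCheckpointRDD(true);                  //mark as checkpointed
        outro.addLineageChild(inro);                   //keep lineage to prevent cycles on cleanup
        cd.setRDDHandle(outro);
    }
}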