Use of org.apache.sysml.lops.Checkpoint in project systemml by apache.
The class CheckpointSPInstruction, method processInstruction:
@Override
@SuppressWarnings("unchecked")
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
// Step 1: early abort on non-existing or in-memory (cached) inputs
// -------
// (checkpoints are generated for all read-only variables in loops; due to conditional control
// flow they do not necessarily exist in the symbol table at runtime - this is valid if the
// relevant branches are never entered)
if (sec.getVariable(input1.getName()) == null || sec.getVariable(input1.getName()) instanceof BooleanObject) {
// add a dummy entry to the input, which will be immediately overwritten by the null output.
sec.setVariable(input1.getName(), new BooleanObject(false));
sec.setVariable(output.getName(), new BooleanObject(false));
return;
}
// -------
// (for csv input files with unknown dimensions, we might have generated a checkpoint after
// csvreblock even though it was not necessary because the csvreblock was subject to in-memory reblock)
CacheableData<?> obj = sec.getCacheableData(input1.getName());
if (obj.isCached(true)) {
// available in memory
sec.setVariable(output.getName(), obj);
return;
}
// get input rdd handle (for matrix or frame)
JavaPairRDD<?, ?> in = sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
MatrixCharacteristics mcIn = sec.getMatrixCharacteristics(input1.getName());
// Step 2: Checkpoint given rdd (only if currently in different storage level to prevent redundancy)
// -------
// Note that persist is lazy and will only be triggered on-demand by subsequent rdd operations.
// This prevents unnecessary overhead if the dataset is only consumed by cp operations.
JavaPairRDD<?, ?> out = null;
if (!in.getStorageLevel().equals(_level)) {
// (trigger coalesce if the intended number of partitions is exceeded by more than 20%
// and the rdd is not hash-partitioned, to avoid losing an existing partitioner)
int numPartitions = SparkUtils.getNumPreferredPartitions(mcIn, in);
boolean coalesce = (1.2 * numPartitions < in.getNumPartitions() && !SparkUtils.isHashPartitioned(in) && in.getNumPartitions() > SparkExecutionContext.getDefaultParallelism(true));
// checkpoint pre-processing rdd operations
if (coalesce) {
// merge partitions without shuffle if too many partitions
out = in.coalesce(numPartitions);
} else {
// apply a narrow shallow copy to allow for short-circuit collects
if (input1.getDataType() == DataType.MATRIX)
out = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) in, false);
else if (input1.getDataType() == DataType.FRAME)
out = ((JavaPairRDD<Long, FrameBlock>) in).mapValues(new CopyFrameBlockFunction(false));
}
// convert mcsr into memory-efficient csr if potentially sparse
if (input1.getDataType() == DataType.MATRIX && OptimizerUtils.checkSparseBlockCSRConversion(mcIn) && !_level.equals(Checkpoint.SER_STORAGE_LEVEL)) {
out = ((JavaPairRDD<MatrixIndexes, MatrixBlock>) out).mapValues(new CreateSparseBlockFunction(SparseBlock.Type.CSR));
}
// actual checkpoint into given storage level
out = out.persist(_level);
// explicitly trigger the nnz computation for spark-only datasets with unknown nnz,
// otherwise their nnz would never be evaluated due to lazy evaluation in spark
if (input1.isMatrix() && mcIn.dimsKnown() && !mcIn.dimsKnown(true) && !OptimizerUtils.isValidCPDimensions(mcIn)) {
mcIn.setNonZeros(SparkUtils.getNonZeros((JavaPairRDD<MatrixIndexes, MatrixBlock>) out));
}
} else {
// pass-through
out = in;
}
// Step 3: In-place update of input matrix/frame rdd handle and set as output
// -------
// We use this in-place approach for two reasons. First, it is correct because our checkpoint
// injection rewrites guarantee that after checkpoint instructions there are no consumers on the
// given input. Second, it is beneficial because otherwise we need to pass in-memory objects and
// filenames to the new matrix object in order to prevent repeated reads from hdfs and unnecessary
// caching and subsequent collects. Note that in-place update requires us to explicitly handle
// lineage information in order to prevent cycles on cleanup.
CacheableData<?> cd = sec.getCacheableData(input1.getName());
if (out != in) {
// prevent unnecessary lineage info
// guaranteed to exist (see above)
RDDObject inro = cd.getRDDHandle();
// create new rdd object
RDDObject outro = new RDDObject(out);
// mark as checkpointed
outro.setCheckpointRDD(true);
// keep lineage to prevent cycles on cleanup
outro.addLineageChild(inro);
cd.setRDDHandle(outro);
}
sec.setVariable(output.getName(), cd);
}
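The core of Step 2 above is a general Spark pattern: persist the rdd at the target storage level only if it is not already stored at that level, and coalesce first when it carries far more partitions than preferred. Below is a minimal, self-contained sketch of that pattern, independent of SystemML's block types; the class name, helper signature, and the 20% threshold constant are illustrative assumptions, not part of the SystemML or Spark API.

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.storage.StorageLevel;

public class CheckpointSketch {
    // coalesce only if the actual partition count exceeds the preferred count by more than 20%
    private static final double COALESCE_FACTOR = 1.2;

    // Persists 'in' at the requested storage level; returns it unchanged if it is already
    // persisted at that level. Coalesces first (without shuffle) if the rdd is heavily
    // over-partitioned and has no partitioner that merging partitions would discard.
    public static <K, V> JavaPairRDD<K, V> checkpoint(JavaPairRDD<K, V> in,
            StorageLevel level, int preferredPartitions) {
        if (in.getStorageLevel().equals(level))
            return in; // pass-through, no redundant persist
        JavaPairRDD<K, V> out = in;
        if (COALESCE_FACTOR * preferredPartitions < in.getNumPartitions()
            && !in.partitioner().isPresent()) {
            out = out.coalesce(preferredPartitions);
        }
        // persist is lazy: the data is actually cached with the next action on 'out'
        return out.persist(level);
    }
}

In the instruction above, the preferred partition count comes from SparkUtils.getNumPreferredPartitions, the partitioner check is SparkUtils.isHashPartitioned, and an additional comparison against SparkExecutionContext.getDefaultParallelism prevents coalescing below the cluster's degree of parallelism.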
Use of org.apache.sysml.lops.Checkpoint in project systemml by apache.
The class Hop, method constructAndSetCheckpointLopIfRequired:
private void constructAndSetCheckpointLopIfRequired() {
// determine execution type
ExecType et = ExecType.CP;
if (OptimizerUtils.isSparkExecutionMode() && getDataType() != DataType.SCALAR) {
// conditional checkpoint based on the memory estimate, in order to
// (1) avoid unnecessary persist and unpersist calls, and
// (2) avoid unnecessary creation of the spark context (incl. executors)
if ((OptimizerUtils.isHybridExecutionMode() && hasValidCPDimsAndSize() && !OptimizerUtils.exceedsCachingThreshold(getDim2(), _outputMemEstimate)) || _etypeForced == ExecType.CP) {
et = ExecType.CP;
} else {
// default case
et = ExecType.SPARK;
}
}
// add checkpoint lop to output if required
if (_requiresCheckpoint && et != ExecType.CP) {
try {
// investigate need for serialized storage of large sparse matrices
// (compile- instead of runtime-level for better debugging)
boolean serializedStorage = false;
if (getDataType() == DataType.MATRIX && dimsKnown(true)) {
double matrixPSize = OptimizerUtils.estimatePartitionedSizeExactSparsity(_dim1, _dim2, _rows_in_block, _cols_in_block, _nnz);
double dataCache = SparkExecutionContext.getDataMemoryBudget(true, true);
serializedStorage = MatrixBlock.evalSparseFormatInMemory(_dim1, _dim2, _nnz)
&& matrixPSize > dataCache // sparse in-memory does not fit in agg mem
&& (OptimizerUtils.getSparsity(_dim1, _dim2, _nnz) < MatrixBlock.ULTRA_SPARSITY_TURN_POINT
|| !Checkpoint.CHECKPOINT_SPARSE_CSR); // ultra-sparse or sparse w/o csr
} else if (!dimsKnown(true)) {
setRequiresRecompile();
}
// construct checkpoint w/ right storage level
Lop input = getLops();
Lop chkpoint = new Checkpoint(input, getDataType(), getValueType(), serializedStorage ? Checkpoint.getSerializeStorageLevelString() : Checkpoint.getDefaultStorageLevelString());
setOutputDimensions(chkpoint);
setLineNumbers(chkpoint);
setLops(chkpoint);
} catch (LopsException ex) {
throw new HopsException(ex);
}
}
}
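The serializedStorage decision above reduces to a three-part predicate: the matrix would be sparse in memory, its partitioned size exceeds the aggregate Spark data-cache budget, and it is either ultra-sparse or CSR checkpointing is disabled. The following is a hedged, self-contained sketch of that predicate and of the resulting storage-level choice; the class and method names and the caller-supplied parameters are illustrative, whereas in SystemML the inputs are computed by OptimizerUtils, MatrixBlock, and SparkExecutionContext as shown above.

import org.apache.spark.storage.StorageLevel;

public class CheckpointStorageDecision {
    // Returns true if a checkpointed sparse matrix should be persisted in serialized form:
    // it is sparse in memory, does not fit into the aggregate storage-memory budget, and is
    // either ultra-sparse or not converted to CSR before persisting.
    public static boolean useSerializedStorage(boolean sparseInMemory, double partitionedSizeBytes,
            double dataCacheBudgetBytes, double sparsity, double ultraSparsityTurnPoint,
            boolean checkpointSparseCsr) {
        return sparseInMemory
            && partitionedSizeBytes > dataCacheBudgetBytes
            && (sparsity < ultraSparsityTurnPoint || !checkpointSparseCsr);
    }

    // Maps the decision to a Spark storage level: default caching vs serialized caching.
    public static StorageLevel storageLevel(boolean serializedStorage) {
        return serializedStorage ? StorageLevel.MEMORY_AND_DISK_SER() : StorageLevel.MEMORY_AND_DISK();
    }
}

Serialized caching stores blocks as byte arrays, which is typically denser for large sparse matrices at the cost of extra serialization and deserialization work on access.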