Use of org.apache.sysml.runtime.instructions.spark.functions.CopyBinaryCellFunction in project incubator-systemml by apache.
From the class SparkExecutionContext, method getRDDHandleForMatrixObject. Two illustrative sketches (a hypothetical call site and a minimal version of the copy function itself) follow the method below.
/**
* This call returns an RDD handle for a given matrix object. This includes
* the creation of RDDs for in-memory or binary-block HDFS data.
*
* @param mo matrix object
* @param inputInfo input info
* @return JavaPairRDD handle for a matrix object
* @throws DMLRuntimeException if the matrix data cannot be exported, read, or parallelized
*/
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForMatrixObject(MatrixObject mo, InputInfo inputInfo) throws DMLRuntimeException {
    //NOTE: MB this logic should be integrated into MatrixObject.
    //However, for now we cannot assume that spark libraries are
    //always available and hence only store generic references in
    //the matrix object while all the logic is in the SparkExecutionContext.
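    //the method distinguishes three cases:
    //  CASE 1: an rdd handle already exists        -> reuse it as-is
    //  CASE 2: dirty or cached in-memory data      -> parallelize (or export and re-read if too large)
    //  CASE 3: clean data on HDFS                  -> create an rdd via hadoopFile for the given input format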
    JavaSparkContext sc = getSparkContext();
    JavaPairRDD<?, ?> rdd = null;

    //CASE 1: rdd already existing (reuse if checkpoint, or trigger
    //pending rdd operations if not yet cached but pinned)
    if (mo.getRDDHandle() != null && (mo.getRDDHandle().isCheckpointRDD() || !mo.isCached(false))) {
        //return existing rdd handle (w/o input format change)
        rdd = mo.getRDDHandle().getRDD();
    }
    //CASE 2: dirty in-memory data or cached result of rdd operations
    else if (mo.isDirty() || mo.isCached(false)) {
        //get in-memory matrix block and parallelize it
        //w/ guarded parallelize (fallback to export, rdd from file if too large)
        MatrixCharacteristics mc = mo.getMatrixCharacteristics();
        boolean fromFile = false;
        if (!OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0)
            || !_parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc))) {
            //write if necessary
            if (mo.isDirty() || !mo.isHDFSFileExists())
                mo.exportData();
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //cp is workaround for read bug
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
            fromFile = true;
        }
        else { //default case
            //pin matrix in memory
            MatrixBlock mb = mo.acquireRead();
            rdd = toMatrixJavaPairRDD(sc, mb, (int) mo.getNumRowsPerBlock(), (int) mo.getNumColumnsPerBlock());
            //unpin matrix
            mo.release();
            _parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
        }
        //keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd, mo.getVarName());
        rddhandle.setHDFSFile(fromFile);
        mo.setRDDHandle(rddhandle);
    }
    //CASE 3: non-dirty (file exists on HDFS)
    else {
        //for binary block, these are: SequenceFileInputFormat.class, MatrixIndexes.class, MatrixBlock.class
        if (inputInfo == InputInfo.BinaryBlockInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //note: this copy is still required in Spark 1.4 because spark hands out whatever the inputformat
            //recordreader returns; the javadoc explicitly recommends copying all key/value pairs
            //cp is workaround for read bug
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
        }
        else if (inputInfo == InputInfo.TextCellInputInfo || inputInfo == InputInfo.CSVInputInfo || inputInfo == InputInfo.MatrixMarketInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
        }
        else if (inputInfo == InputInfo.BinaryCellInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //cp is workaround for read bug
            rdd = ((JavaPairRDD<MatrixIndexes, MatrixCell>) rdd).mapToPair(new CopyBinaryCellFunction());
        }
        else {
            throw new DMLRuntimeException("Incorrect input format in getRDDHandleForMatrixObject");
        }
        //keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd, mo.getVarName());
        rddhandle.setHDFSFile(true);
        mo.setRDDHandle(rddhandle);
    }

    return rdd;
}
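A minimal call-site sketch for this method. The class and helper names below (RDDHandleExample, getBinaryBlockRDD) are hypothetical, and the sketch assumes the variable is already bound in the execution context's symbol table:

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;

//hypothetical helper: fetch a binary-block rdd for a variable that is
//already bound in the symbol table of a SparkExecutionContext
public class RDDHandleExample {
    @SuppressWarnings("unchecked")
    public static JavaPairRDD<MatrixIndexes, MatrixBlock> getBinaryBlockRDD(
            SparkExecutionContext sec, String varName) throws DMLRuntimeException {
        MatrixObject mo = sec.getMatrixObject(varName);
        //request binary-block format; the returned handle is cached on the
        //MatrixObject, so subsequent calls reuse the same rdd
        return (JavaPairRDD<MatrixIndexes, MatrixBlock>)
            sec.getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);
    }
}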
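For reference, a deep-copy pair function of the kind used in CASE 3 can be sketched as follows. This is a minimal illustration of the copy technique, not the project's actual CopyBinaryCellFunction source, whose details may differ:

import org.apache.spark.api.java.function.PairFunction;
import org.apache.sysml.runtime.matrix.data.MatrixCell;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import scala.Tuple2;

//sketch of a deep-copy PairFunction for binary-cell records: Hadoop record
//readers reuse Writable objects, so each key/value pair must be copied
//before it is retained in an rdd
public class CopyBinaryCellFunctionSketch
    implements PairFunction<Tuple2<MatrixIndexes, MatrixCell>, MatrixIndexes, MatrixCell>
{
    private static final long serialVersionUID = 1L;

    @Override
    public Tuple2<MatrixIndexes, MatrixCell> call(Tuple2<MatrixIndexes, MatrixCell> arg0)
        throws Exception
    {
        MatrixIndexes ix = new MatrixIndexes(arg0._1()); //copy the key
        MatrixCell cell = new MatrixCell();
        cell.copy(arg0._2()); //deep copy of the cell value
        return new Tuple2<>(ix, cell);
    }
}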