Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in the Apache SystemML project.
From the class SparkExecutionContext, method setRDDHandleForVariable.
/**
 * Attaches the given RDD to the matrix/frame object bound to a variable, so
 * that the output of a Spark RDD operation is kept as meta data of the
 * corresponding symbol table entry.
 *
 * @param varname name of the variable whose cacheable data receives the handle
 * @param rdd JavaPairRDD to wrap in a new {@code RDDObject} handle
 */
public void setRDDHandleForVariable(String varname, JavaPairRDD<?, ?> rdd) {
    // the variable is resolved first, then the rdd is wrapped and attached
    getCacheableData(varname).setRDDHandle(new RDDObject(rdd));
}
Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in the Apache SystemML project.
From the class SparkExecutionContext, method getRDDHandleForMatrixObject.
/**
 * Returns an RDD for the given matrix object, creating one if necessary and
 * recording it as the RDD handle of the matrix object.
 *
 * <p>Three cases are distinguished:
 * <ol>
 * <li>An RDD handle already exists and is either a checkpoint RDD or the
 * matrix is not cached: the existing RDD is returned unchanged.</li>
 * <li>The matrix is dirty or cached: the in-memory block is parallelized,
 * or, if the memory budget check or the reservation of parallelized-RDD
 * space fails, the matrix is exported (when dirty or missing on HDFS) and
 * read back from file.</li>
 * <li>Otherwise the matrix is read from its file according to
 * {@code inputInfo}.</li>
 * </ol>
 *
 * @param mo matrix object for which an RDD is requested
 * @param inputInfo input format description used when reading from file
 * @param numParts number of partitions passed on when parallelizing the
 *        in-memory matrix block
 * @return the existing or newly created RDD of the matrix object
 * @throws DMLRuntimeException if the matrix has to be read from file (case 3)
 *         and {@code inputInfo} is none of binary block, text cell, CSV,
 *         matrix market, or binary cell
 */
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForMatrixObject(MatrixObject mo, InputInfo inputInfo, int numParts) {
    // NOTE: MB this logic should be integrated into MatrixObject
    // However, for now we cannot assume that spark libraries are
    // always available and hence only store generic references in
    // matrix object while all the logic is in the SparkExecContext
    JavaSparkContext sc = getSparkContext();
    JavaPairRDD<?, ?> rdd = null;

    // CASE 1: rdd handle already exists; reuse it if it is a checkpoint rdd
    // or if the matrix is not cached, which avoids re-evaluating
    // rdd operations if already executed and cached
    if (mo.getRDDHandle() != null && (mo.getRDDHandle().isCheckpointRDD() || !mo.isCached(false))) {
        // return existing rdd handling (w/o input format change)
        rdd = mo.getRDDHandle().getRDD();
    }
    // CASE 2: dirty in memory data or cached result of rdd operations
    else if (mo.isDirty() || mo.isCached(false)) {
        // get in-memory matrix block and parallelize it
        // w/ guarded parallelize (fallback to export, rdd from file if too large)
        MatrixCharacteristics mc = mo.getMatrixCharacteristics();
        boolean fromFile = false;
        if (!OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0)
                || !_parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc))) {
            // write if necessary
            if (mo.isDirty() || !mo.isHDFSFileExists()) {
                mo.exportData();
            }
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // cp is workaround for read bug
            // NOTE(review): this path always casts to binary block key/value
            // types regardless of inputInfo - confirm callers only reach it
            // with binary block input info
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
            fromFile = true;
        } else {
            // default case
            // pin matrix in memory
            MatrixBlock mb = mo.acquireRead();
            try {
                rdd = toMatrixJavaPairRDD(sc, mb, (int) mo.getNumRowsPerBlock(), (int) mo.getNumColumnsPerBlock(), numParts);
            } finally {
                // unpin matrix, also if parallelization failed, so that the
                // matrix object is not left pinned on the error path
                mo.release();
            }
            _parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
        }
        // keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd);
        rddhandle.setHDFSFile(fromFile);
        rddhandle.setParallelizedRDD(!fromFile);
        mo.setRDDHandle(rddhandle);
    }
    // CASE 3: non-dirty (file exists on HDFS)
    else {
        // For binary block, these are: SequenceFileInputFormat.class, MatrixIndexes.class, MatrixBlock.class
        if (inputInfo == InputInfo.BinaryBlockInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // note: this copy is still required in Spark 1.4 because spark hands out whatever the inputformat
            // recordreader returns; the javadoc explicitly recommend to copy all key/value pairs
            // cp is workaround for read bug
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
        } else if (inputInfo == InputInfo.TextCellInputInfo || inputInfo == InputInfo.CSVInputInfo || inputInfo == InputInfo.MatrixMarketInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
        } else if (inputInfo == InputInfo.BinaryCellInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // cp is workaround for read bug
            rdd = ((JavaPairRDD<MatrixIndexes, MatrixCell>) rdd).mapToPair(new CopyBinaryCellFunction());
        } else {
            throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
        }
        // keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd);
        rddhandle.setHDFSFile(true);
        mo.setRDDHandle(rddhandle);
    }
    return rdd;
}
Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in the Apache SystemML project.
From the class MLContextConversionUtil, method javaRDDStringIJVToMatrixObject.
/**
 * Wraps a {@code JavaRDD<String>} holding IJV (text cell) data into a
 * {@code MatrixObject} whose RDD handle refers to the converted pair RDD.
 * The matrix object is created with text cell input/output info and a
 * unique temporary file name. If no metadata is supplied, default
 * {@code MatrixCharacteristics} are used.
 *
 * @param javaRDD
 *            the Java RDD of strings
 * @param matrixMetadata
 *            matrix metadata, may be {@code null}
 * @return the {@code JavaRDD<String>} converted to a {@code MatrixObject}
 */
public static MatrixObject javaRDDStringIJVToMatrixObject(JavaRDD<String> javaRDD, MatrixMetadata matrixMetadata) {
    // strings -> (LongWritable, Text) pairs
    JavaPairRDD<LongWritable, Text> textPairs = javaRDD.mapToPair(new ConvertStringToLongTextPair());

    // derive matrix characteristics from the metadata, if given
    MatrixCharacteristics characteristics;
    if (matrixMetadata == null) {
        characteristics = new MatrixCharacteristics();
    } else {
        characteristics = matrixMetadata.asMatrixCharacteristics();
    }

    // text-cell matrix object backed by a fresh temporary file name
    String tempFileName = OptimizerUtils.getUniqueTempFileName();
    MetaDataFormat format = new MetaDataFormat(characteristics, OutputInfo.TextCellOutputInfo, InputInfo.TextCellInputInfo);
    MatrixObject result = new MatrixObject(ValueType.DOUBLE, tempFileName, format);

    // copy the pairs and register them as the rdd handle of the matrix object
    JavaPairRDD<LongWritable, Text> copiedPairs = textPairs.mapToPair(new CopyTextInputFunction());
    result.setRDDHandle(new RDDObject(copiedPairs));
    return result;
}
Aggregations