Search in sources :

Example 1 with CopyBinaryCellFunction

Use of org.apache.sysml.runtime.instructions.spark.functions.CopyBinaryCellFunction in the Apache project incubator-systemml.

From the class SparkExecutionContext, method getRDDHandleForMatrixObject.

/**
 * Returns an RDD handle for the given matrix object, creating one if necessary.
 * <p>
 * Three cases are distinguished:
 * <ol>
 *   <li>The matrix object already carries an RDD handle that is either a checkpoint RDD
 *       or whose data is not cached: the existing RDD is returned unchanged.</li>
 *   <li>The matrix object is dirty or cached: the in-memory block is parallelized; if the
 *       collect memory budget check or the reservation in the parallelized-RDD registry
 *       fails, the data is exported (when dirty or missing on HDFS) and read from file.</li>
 *   <li>Otherwise the matrix is read from its file according to {@code inputInfo}.</li>
 * </ol>
 * In cases 2 and 3 the newly created RDD is wrapped in an {@link RDDObject} and stored on
 * {@code mo} via {@code setRDDHandle} so that later calls can reuse it.
 *
 * @param mo matrix object to obtain an RDD for
 * @param inputInfo input info supplying the input format, key, and value classes used
 *        when the RDD is created from a file
 * @return JavaPairRDD handle for the matrix object
 * @throws DMLRuntimeException if, in case 3, {@code inputInfo} is none of binary block,
 *         text cell, CSV, matrix market, or binary cell; may also be propagated from
 *         the runtime calls made by this method
 */
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForMatrixObject(MatrixObject mo, InputInfo inputInfo) throws DMLRuntimeException {
    //NOTE: MB this logic should be integrated into MatrixObject
    //However, for now we cannot assume that spark libraries are
    //always available and hence only store generic references in
    //matrix object while all the logic is in the SparkExecContext
    JavaSparkContext sc = getSparkContext();
    JavaPairRDD<?, ?> rdd = null;

    if (mo.getRDDHandle() != null && (mo.getRDDHandle().isCheckpointRDD() || !mo.isCached(false))) {
        //CASE 1: rdd operations if already executed and cached
        //return existing rdd handling (w/o input format change)
        rdd = mo.getRDDHandle().getRDD();
    } else if (mo.isDirty() || mo.isCached(false)) {
        //CASE 2: dirty in memory data or cached result of rdd operations
        //get in-memory matrix block and parallelize it
        //w/ guarded parallelize (fallback to export, rdd from file if too large)
        MatrixCharacteristics mc = mo.getMatrixCharacteristics();
        boolean fromFile = false;
        if (!OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0) || !_parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc))) {
            //write if necessary
            if (mo.isDirty() || !mo.isHDFSFileExists()) {
                mo.exportData();
            }
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //cp is workaround for read bug
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
            fromFile = true;
        } else {
            //default case
            //pin matrix in memory
            MatrixBlock mb = mo.acquireRead();
            try {
                rdd = toMatrixJavaPairRDD(sc, mb, (int) mo.getNumRowsPerBlock(), (int) mo.getNumColumnsPerBlock());
            } finally {
                //unpin matrix even if the conversion fails, so that the
                //matrix object is not left pinned after an exception
                mo.release();
            }
            _parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
        }
        //keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd, mo.getVarName());
        rddhandle.setHDFSFile(fromFile);
        mo.setRDDHandle(rddhandle);
    } else {
        //CASE 3: non-dirty (file exists on HDFS)
        // For binary block, these are: SequenceFileInputFormat.class, MatrixIndexes.class, MatrixBlock.class
        if (inputInfo == InputInfo.BinaryBlockInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //note: this copy is still required in Spark 1.4 because spark hands out whatever the inputformat
            //recordreader returns; the javadoc explicitly recommend to copy all key/value pairs
            //cp is workaround for read bug
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
        } else if (inputInfo == InputInfo.TextCellInputInfo || inputInfo == InputInfo.CSVInputInfo || inputInfo == InputInfo.MatrixMarketInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
        } else if (inputInfo == InputInfo.BinaryCellInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //cp is workaround for read bug
            rdd = ((JavaPairRDD<MatrixIndexes, MatrixCell>) rdd).mapToPair(new CopyBinaryCellFunction());
        } else {
            throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
        }
        //keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd, mo.getVarName());
        rddhandle.setHDFSFile(true);
        mo.setRDDHandle(rddhandle);
    }
    return rdd;
}
Also used : CopyBinaryCellFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyBinaryCellFunction) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Text(org.apache.hadoop.io.Text) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Aggregations

LongWritable (org.apache.hadoop.io.LongWritable)1 Text (org.apache.hadoop.io.Text)1 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)1 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)1 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)1 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)1 CopyBinaryCellFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyBinaryCellFunction)1 CopyTextInputFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction)1 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)1 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)1 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)1