Search in sources :

Example 51 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project systemml by apache.

The method setRDDHandleForVariable of the class SparkExecutionContext.

/**
 * Attaches the given RDD to the matrix/frame object that is registered under
 * {@code varname}, so the output of a Spark RDD operation is kept as meta
 * data of that object.
 *
 * @param varname name of the variable whose cacheable data receives the handle
 * @param rdd JavaPairRDD to wrap in a new {@code RDDObject} handle
 */
public void setRDDHandleForVariable(String varname, JavaPairRDD<?, ?> rdd) {
    // Resolve the target first so a failed lookup happens before the handle is built.
    final CacheableData<?> target = getCacheableData(varname);
    target.setRDDHandle(new RDDObject(rdd));
}
Also used : RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject)

Example 52 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project systemml by apache.

The method getRDDHandleForMatrixObject of the class SparkExecutionContext.

/**
 * Returns an RDD for the given matrix object, creating a new RDD and
 * registering it as the object's RDD handle when no reusable handle exists.
 *
 * <p>Three cases are distinguished:
 * <ol>
 * <li>An RDD handle already exists and it is either a checkpoint RDD or
 * {@code mo.isCached(false)} is false: the existing RDD is returned as is
 * (without input format change).</li>
 * <li>The matrix is dirty or cached: the in-memory matrix block is turned
 * into an RDD via {@code toMatrixJavaPairRDD}. If the collect memory budget
 * check or the reservation on {@code _parRDDs} fails, the matrix is exported
 * (when dirty or when its file does not exist) and read from file instead.</li>
 * <li>Otherwise: the RDD is read from the matrix file with
 * {@code hadoopFile}, followed by a copy step that depends on
 * {@code inputInfo}.</li>
 * </ol>
 *
 * @param mo matrix object to obtain the RDD for
 * @param inputInfo input format description; supplies the input format, key
 *            and value classes passed to {@code hadoopFile}
 * @param numParts forwarded to {@code toMatrixJavaPairRDD} when the in-memory
 *            block is parallelized (presumably the number of partitions)
 * @return the existing or newly created RDD
 * @throws DMLRuntimeException if the RDD has to be read from file (case 3)
 *             and {@code inputInfo} is none of the binary block, text cell,
 *             CSV, matrix market or binary cell input infos
 */
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForMatrixObject(MatrixObject mo, InputInfo inputInfo, int numParts) {
    // NOTE: MB this logic should be integrated into MatrixObject
    // However, for now we cannot assume that spark libraries are
    // always available and hence only store generic references in
    // matrix object while all the logic is in the SparkExecContext
    JavaSparkContext sc = getSparkContext();
    JavaPairRDD<?, ?> rdd = null;
    // CASE 1: rdd handle already exists; reuse it if it is a checkpoint rdd
    // or if the matrix is not cached
    if (mo.getRDDHandle() != null && (mo.getRDDHandle().isCheckpointRDD() || !mo.isCached(false))) {
        // return existing rdd handling (w/o input format change)
        rdd = mo.getRDDHandle().getRDD();
    }
    // CASE 2: dirty in memory data or cached result of rdd operations
    else if (mo.isDirty() || mo.isCached(false)) {
        // get in-memory matrix block and parallelize it
        // w/ guarded parallelize (fallback to export, rdd from file if too large)
        MatrixCharacteristics mc = mo.getMatrixCharacteristics();
        boolean fromFile = false;
        if (!OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0) || !_parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc))) {
            // write if necessary
            if (mo.isDirty() || !mo.isHDFSFileExists()) {
                mo.exportData();
            }
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // cp is workaround for read bug
            // NOTE(review): this branch always casts to binary block pairs,
            // independent of inputInfo - confirm callers only reach it with
            // binary block input info
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
            fromFile = true;
        } else {
            // default case
            // pin matrix in memory
            MatrixBlock mb = mo.acquireRead();
            try {
                rdd = toMatrixJavaPairRDD(sc, mb, (int) mo.getNumRowsPerBlock(), (int) mo.getNumColumnsPerBlock(), numParts);
            } finally {
                // unpin matrix, also when the rdd creation fails, so the
                // matrix object is never left pinned by this method
                mo.release();
            }
            _parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
        }
        // keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd);
        rddhandle.setHDFSFile(fromFile);
        rddhandle.setParallelizedRDD(!fromFile);
        mo.setRDDHandle(rddhandle);
    }
    // CASE 3: non-dirty (file exists on HDFS)
    else {
        // For binary block, these are: SequenceFileInputFormat.class, MatrixIndexes.class, MatrixBlock.class
        if (inputInfo == InputInfo.BinaryBlockInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // note: this copy is still required in Spark 1.4 because spark hands out whatever the inputformat
            // recordreader returns; the javadoc explicitly recommend to copy all key/value pairs
            // cp is workaround for read bug
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
        } else if (inputInfo == InputInfo.TextCellInputInfo || inputInfo == InputInfo.CSVInputInfo || inputInfo == InputInfo.MatrixMarketInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
        } else if (inputInfo == InputInfo.BinaryCellInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            // cp is workaround for read bug
            rdd = ((JavaPairRDD<MatrixIndexes, MatrixCell>) rdd).mapToPair(new CopyBinaryCellFunction());
        } else {
            throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
        }
        // keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd);
        rddhandle.setHDFSFile(true);
        mo.setRDDHandle(rddhandle);
    }
    return rdd;
}
Also used : CopyBinaryCellFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyBinaryCellFunction) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CompressedMatrixBlock(org.apache.sysml.runtime.compress.CompressedMatrixBlock) CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Text(org.apache.hadoop.io.Text) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 53 with RDDObject

use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project systemml by apache.

The method javaRDDStringIJVToMatrixObject of the class MLContextConversionUtil.

/**
 * Wraps a {@code JavaRDD<String>} holding IJV-formatted lines in a new
 * text-cell {@code MatrixObject} whose RDD handle refers to the converted
 * data.
 *
 * @param javaRDD
 *            the Java RDD of strings in IJV format
 * @param matrixMetadata
 *            matrix metadata supplying the matrix characteristics; when
 *            {@code null}, default-constructed characteristics are used
 * @return a {@code MatrixObject} backed by the converted RDD
 */
public static MatrixObject javaRDDStringIJVToMatrixObject(JavaRDD<String> javaRDD, MatrixMetadata matrixMetadata) {
    JavaPairRDD<LongWritable, Text> textCells = javaRDD.mapToPair(new ConvertStringToLongTextPair());

    MatrixCharacteristics characteristics;
    if (matrixMetadata == null) {
        characteristics = new MatrixCharacteristics();
    } else {
        characteristics = matrixMetadata.asMatrixCharacteristics();
    }

    // The temp file name is obtained before the format meta data is built,
    // matching the original argument evaluation order.
    String tempFileName = OptimizerUtils.getUniqueTempFileName();
    MetaDataFormat format = new MetaDataFormat(characteristics, OutputInfo.TextCellOutputInfo, InputInfo.TextCellInputInfo);
    MatrixObject result = new MatrixObject(ValueType.DOUBLE, tempFileName, format);

    // Hand the object a copied version of the pair RDD as its handle.
    result.setRDDHandle(new RDDObject(textCells.mapToPair(new CopyTextInputFunction())));
    return result;
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ConvertStringToLongTextPair(org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Aggregations

RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)53 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)37 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)22 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)18 MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat)18 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)18 LongWritable (org.apache.hadoop.io.LongWritable)17 Text (org.apache.hadoop.io.Text)17 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)16 CopyTextInputFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction)16 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)16 InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo)13 ConvertStringToLongTextPair (org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair)12 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)12 ValueType (org.apache.sysml.parser.Expression.ValueType)11 FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject)9 IOException (java.io.IOException)7 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)6 Checkpoint (org.apache.sysml.lops.Checkpoint)6 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)6