
Example 16 with RDDObject

Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

From the class SparkExecutionContext, method addLineageRDD.

///////////////////////////////////////////
// Cleanup of RDDs and Broadcast variables
///////////////////////////////////////////
/**
 * Adds a child rdd object to the lineage of a parent rdd.
 *
 * @param varParent parent variable
 * @param varChild child variable
 */
public void addLineageRDD(String varParent, String varChild) {
    RDDObject parent = getCacheableData(varParent).getRDDHandle();
    RDDObject child = getCacheableData(varChild).getRDDHandle();
    parent.addLineageChild(child);
}
Also used : RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject)
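
For context, here is a rough usage sketch of how a Spark instruction typically registers this lineage after binding its output. The variable names, the placeholder transformation, and the helper method are assumptions for illustration, not code from the project:

public static void bindDerivedOutput(SparkExecutionContext sec) {
    // read the input RDD handle for variable "A" (assumed to be bound in the symbol table)
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable("A");
    // placeholder for the instruction's actual transformation
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = in.mapToPair(t -> t);
    // bind the result to output variable "B" and record that B depends on A,
    // so A's RDD is not cleaned up while B still references it
    sec.setRDDHandleForVariable("B", out);
    sec.addLineageRDD("B", "A");
}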

Example 17 with RDDObject

Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

From the class MLContextConversionUtil, method javaRDDStringIJVToFrameObject.

/**
 * Convert a {@code JavaRDD<String>} in IJV format to a {@code FrameObject}.
 * Note that metadata is required for IJV format.
 *
 * @param javaRDD
 *            the Java RDD of strings
 * @param frameMetadata
 *            frame metadata
 * @return the {@code JavaRDD<String>} converted to a {@code FrameObject}
 */
public static FrameObject javaRDDStringIJVToFrameObject(JavaRDD<String> javaRDD, FrameMetadata frameMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (frameMetadata != null) ? frameMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    JavaPairRDD<LongWritable, Text> javaPairRDDText = javaPairRDD.mapToPair(new CopyTextInputFunction());
    FrameObject frameObject = new FrameObject(OptimizerUtils.getUniqueTempFileName(), new MetaDataFormat(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo), frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]));
    JavaPairRDD<Long, FrameBlock> rdd;
    try {
        // IJV text cells carry no type information, so convert with an all-string schema
        ValueType[] lschema = UtilFunctions.nCopies((int) mc.getCols(), ValueType.STRING);
        rdd = FrameRDDConverterUtils.textCellToBinaryBlock(jsc(), javaPairRDDText, mc, lschema);
    } catch (DMLRuntimeException e) {
        e.printStackTrace();
        return null;
    }
    frameObject.setRDDHandle(new RDDObject(rdd));
    return frameObject;
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) ValueType(org.apache.sysml.parser.Expression.ValueType) Text(org.apache.hadoop.io.Text) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) ConvertStringToLongTextPair(org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) LongWritable(org.apache.hadoop.io.LongWritable)
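
As a rough usage sketch (not from the project): IJV text encodes one cell per line as "row column value", and the metadata supplies the dimensions and schema that the text itself does not carry. The Spark setup, sample data, and the exact FrameMetadata/FrameSchema construction below are assumptions and may vary between SystemML versions:

// assumed setup: a local Spark session and an active MLContext, which the
// MLContext conversion utilities use to obtain their Spark context
SparkSession spark = SparkSession.builder().master("local[*]").appName("ijv-to-frame").getOrCreate();
MLContext ml = new MLContext(spark);
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());

// a 2 x 2 frame in IJV format: "row column value", 1-based indices
JavaRDD<String> ijv = jsc.parallelize(Arrays.asList("1 1 7.0", "1 2 8.0", "2 1 9.0", "2 2 10.0"));

// metadata is required for IJV input; the schema here is an assumption (two string columns)
FrameSchema schema = new FrameSchema(Arrays.asList(ValueType.STRING, ValueType.STRING));
FrameMetadata meta = new FrameMetadata(schema);
meta.setNumRows(2L);
meta.setNumColumns(2L);

FrameObject fo = MLContextConversionUtil.javaRDDStringIJVToFrameObject(ijv, meta);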

Example 18 with RDDObject

Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

From the class FrameObject, method readBlobFromRDD.

@Override
protected FrameBlock readBlobFromRDD(RDDObject rdd, MutableBoolean status) throws IOException {
    // note: the read of a frame block from an RDD might trigger
    // lazy evaluation of pending transformations.
    RDDObject lrdd = rdd;
    // prepare return status (by default only collect)
    status.setValue(false);
    MetaDataFormat iimd = (MetaDataFormat) _metaData;
    MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
    int rlen = (int) mc.getRows();
    int clen = (int) mc.getCols();
    // handle missing schema if necessary
    ValueType[] lschema = (_schema != null) ? _schema : UtilFunctions.nCopies(clen >= 1 ? clen : 1, ValueType.STRING);
    FrameBlock fb = null;
    try {
        // prevent unnecessary collect through rdd checkpoint
        if (rdd.allowsShortCircuitCollect()) {
            lrdd = (RDDObject) rdd.getLineageChilds().get(0);
        }
        // collect frame block from binary block RDD
        fb = SparkExecutionContext.toFrameBlock(lrdd, lschema, rlen, clen);
    } catch (DMLRuntimeException ex) {
        throw new IOException(ex);
    }
    // sanity check correct output
    if (fb == null)
        throw new IOException("Unable to load frame from rdd.");
    return fb;
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) ValueType(org.apache.sysml.parser.Expression.ValueType) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
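
As a rough sketch of the calling pattern (the helper and its name are assumptions, not project code): readBlobFromRDD is reached indirectly when a frame whose only representation is an RDD handle is pinned into driver memory through the caching layer:

// hypothetical helper: force a distributed frame into a local FrameBlock
public static FrameBlock collectFrame(FrameObject fo) {
    // acquireRead pins the data; if only an RDD handle exists, this triggers readBlobFromRDD
    FrameBlock fb = fo.acquireRead();
    // unpin; the returned block remains usable by the caller
    fo.release();
    return fb;
}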

Example 19 with RDDObject

Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

From the class MatrixObject, method readBlobFromRDD.

@Override
protected MatrixBlock readBlobFromRDD(RDDObject rdd, MutableBoolean writeStatus) throws IOException {
    // note: the read of a matrix block from an RDD might trigger
    // lazy evaluation of pending transformations.
    RDDObject lrdd = rdd;
    // prepare return status (by default only collect)
    writeStatus.setValue(false);
    MetaDataFormat iimd = (MetaDataFormat) _metaData;
    MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
    InputInfo ii = iimd.getInputInfo();
    MatrixBlock mb = null;
    try {
        // prevent unnecessary collect through rdd checkpoint
        if (rdd.allowsShortCircuitCollect()) {
            lrdd = (RDDObject) rdd.getLineageChilds().get(0);
        }
        // obtain matrix block from RDD
        int rlen = (int) mc.getRows();
        int clen = (int) mc.getCols();
        int brlen = (int) mc.getRowsPerBlock();
        int bclen = (int) mc.getColsPerBlock();
        long nnz = mc.getNonZerosBound();
        // guarded rdd collect (not applied for binary cell input)
        if (ii == InputInfo.BinaryBlockInputInfo && !OptimizerUtils.checkSparkCollectMemoryBudget(mc, getPinnedSize() + getBroadcastSize(), true)) {
            // write the rdd to hdfs and read the file back instead of collecting over budget
            // note: lazy, partition-at-a-time collect (toLocalIterator) was significantly slower
            if (!MapReduceTool.existsFileOnHDFS(_hdfsFileName)) { // prevent overwrite of an existing file
                long newnnz = SparkExecutionContext.writeRDDtoHDFS(lrdd, _hdfsFileName, iimd.getOutputInfo());
                _metaData.getMatrixCharacteristics().setNonZeros(newnnz);
                // mark rdd as non-pending (for export)
                ((RDDObject) rdd).setPending(false);
                // mark rdd as hdfs file (for restore)
                ((RDDObject) rdd).setHDFSFile(true);
                // mark for no cache-write on read
                writeStatus.setValue(true);
                // note: the hdfsFile flag is not entirely accurate because we still hold an rdd
                // reference to the input rather than to an rdd of the hdfs file, but the resulting behavior is correct
            }
            mb = readBlobFromHDFS(_hdfsFileName);
        } else if (ii == InputInfo.BinaryCellInputInfo) {
            // collect matrix block from binary cell RDD
            mb = SparkExecutionContext.toMatrixBlock(lrdd, rlen, clen, nnz);
        } else {
            // collect matrix block from binary block RDD
            mb = SparkExecutionContext.toMatrixBlock(lrdd, rlen, clen, brlen, bclen, nnz);
        }
    } catch (DMLRuntimeException ex) {
        throw new IOException(ex);
    }
    // sanity check correct output
    if (mb == null)
        throw new IOException("Unable to load matrix from rdd.");
    return mb;
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) IOException(java.io.IOException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
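
The guarded collect above is essentially a size check: collect the RDD directly only if its estimated in-memory size, together with memory already pinned or held by broadcasts, fits the driver budget; otherwise write it to HDFS and re-read the file. Below is a simplified, hypothetical illustration of that decision; the real logic lives in OptimizerUtils.checkSparkCollectMemoryBudget and accounts for more factors:

// hypothetical sketch of a collect-vs-spill decision, not the actual OptimizerUtils logic
public static boolean fitsCollectBudget(MatrixCharacteristics mc, long pinnedAndBroadcastBytes, long budgetBytes) {
    // worst-case dense estimate: 8 bytes per double-precision cell
    long denseBytes = 8L * mc.getRows() * mc.getCols();
    // crude sparse estimate (~16 bytes per non-zero) based on the known non-zero bound
    long sparseBytes = 16L * mc.getNonZerosBound();
    long estBytes = Math.min(denseBytes, sparseBytes);
    return estBytes + pinnedAndBroadcastBytes <= budgetBytes;
}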

Example 20 with RDDObject

Use of org.apache.sysml.runtime.instructions.spark.data.RDDObject in project incubator-systemml by apache.

From the class SparkExecutionContext, method addLineageBroadcast.

/**
 * Adds a child broadcast object to the lineage of a parent rdd.
 *
 * @param varParent parent variable
 * @param varChild child variable
 */
public void addLineageBroadcast(String varParent, String varChild) {
    RDDObject parent = getCacheableData(varParent).getRDDHandle();
    BroadcastObject<?> child = getCacheableData(varChild).getBroadcastHandle();
    parent.addLineageChild(child);
}
Also used : RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject)
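
Analogous to Example 16, here is a rough sketch of the typical call site (variable names and the placeholder result are assumptions, not project code): a binary Spark instruction that broadcasts its small second input registers both RDD and broadcast lineage on its output, so neither input is cleaned up prematurely:

public static void bindOutputWithBroadcast(SparkExecutionContext sec) {
    // large input as a block RDD, small input as a partitioned broadcast
    JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable("X");
    PartitionedBroadcast<MatrixBlock> in2 = sec.getBroadcastForVariable("y");
    // placeholder result; a real instruction would combine in1 and in2 here
    JavaPairRDD<MatrixIndexes, MatrixBlock> out = in1.mapToPair(t -> t);
    sec.setRDDHandleForVariable("Z", out);
    // keep X's RDD and y's broadcast alive while Z still references them
    sec.addLineageRDD("Z", "X");
    sec.addLineageBroadcast("Z", "y");
}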

Aggregations (types co-occurring with RDDObject, with usage counts)

RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 31
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 22
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 13
LongWritable (org.apache.hadoop.io.LongWritable): 11
Text (org.apache.hadoop.io.Text): 11
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 11
CopyTextInputFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction): 10
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 10
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 9
MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat): 9
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 9
ConvertStringToLongTextPair (org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair): 8
ValueType (org.apache.sysml.parser.Expression.ValueType): 7
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 7
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 7
FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject): 6
MatrixFormatMetaData (org.apache.sysml.runtime.matrix.MatrixFormatMetaData): 6
IOException (java.io.IOException): 4
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 4
Path (org.apache.hadoop.fs.Path): 3