Search in sources :

Example 1 with CopyTextInputFunction

use of org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction in project incubator-systemml by apache.

the class MLContextConversionUtil method javaRDDStringCSVToFrameObject.

/**
	 * Convert a {@code JavaRDD<String>} in CSV format to a {@code FrameObject}
	 * 
	 * @param variableName
	 *            name of the variable associated with the frame
	 * @param javaRDD
	 *            the Java RDD of strings
	 * @param frameMetadata
	 *            frame metadata
	 * @return the {@code JavaRDD<String>} converted to a {@code FrameObject}
	 */
public static FrameObject javaRDDStringCSVToFrameObject(String variableName, JavaRDD<String> javaRDD, FrameMetadata frameMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (frameMetadata != null) ? frameMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    JavaPairRDD<LongWritable, Text> javaPairRDDText = javaPairRDD.mapToPair(new CopyTextInputFunction());
    FrameObject frameObject = new FrameObject(OptimizerUtils.getUniqueTempFileName(), new MatrixFormatMetaData(mc, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo), frameMetadata.getFrameSchema().getSchema().toArray(new ValueType[0]));
    JavaPairRDD<Long, FrameBlock> rdd;
    try {
        rdd = FrameRDDConverterUtils.csvToBinaryBlock(jsc(), javaPairRDDText, mc, frameObject.getSchema(), false, ",", false, -1);
    } catch (DMLRuntimeException e) {
        e.printStackTrace();
        return null;
    }
    frameObject.setRDDHandle(new RDDObject(rdd, variableName));
    return frameObject;
}
Also used : ValueType(org.apache.sysml.parser.Expression.ValueType) Text(org.apache.hadoop.io.Text) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) MatrixFormatMetaData(org.apache.sysml.runtime.matrix.MatrixFormatMetaData) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) ConvertStringToLongTextPair(org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) LongWritable(org.apache.hadoop.io.LongWritable)

Example 2 with CopyTextInputFunction

use of org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction in project incubator-systemml by apache.

the class SparkExecutionContext method getRDDHandleForMatrixObject.

/**
	 * This call returns an RDD handle for a given matrix object. This includes
	 * the creation of RDDs for in-memory or binary-block HDFS data.
	 *
	 * @param mo matrix object
	 * @param inputInfo input info
	 * @return JavaPairRDD handle for a matrix object
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForMatrixObject(MatrixObject mo, InputInfo inputInfo) throws DMLRuntimeException {
    //NOTE: MB this logic should be integrated into MatrixObject
    //However, for now we cannot assume that spark libraries are
    //always available and hence only store generic references in
    //matrix object while all the logic is in the SparkExecContext
    JavaSparkContext sc = getSparkContext();
    JavaPairRDD<?, ?> rdd = null;
    //rdd operations if already executed and cached
    if (mo.getRDDHandle() != null && (mo.getRDDHandle().isCheckpointRDD() || !mo.isCached(false))) {
        //return existing rdd handling (w/o input format change)
        rdd = mo.getRDDHandle().getRDD();
    } else //CASE 2: dirty in memory data or cached result of rdd operations
    if (mo.isDirty() || mo.isCached(false)) {
        //get in-memory matrix block and parallelize it
        //w/ guarded parallelize (fallback to export, rdd from file if too large)
        MatrixCharacteristics mc = mo.getMatrixCharacteristics();
        boolean fromFile = false;
        if (!OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0) || !_parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc))) {
            if (//write if necessary
            mo.isDirty() || !mo.isHDFSFileExists())
                mo.exportData();
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //cp is workaround for read bug
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
            fromFile = true;
        } else {
            //default case
            //pin matrix in memory
            MatrixBlock mb = mo.acquireRead();
            rdd = toMatrixJavaPairRDD(sc, mb, (int) mo.getNumRowsPerBlock(), (int) mo.getNumColumnsPerBlock());
            //unpin matrix
            mo.release();
            _parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
        }
        //keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd, mo.getVarName());
        rddhandle.setHDFSFile(fromFile);
        mo.setRDDHandle(rddhandle);
    } else //CASE 3: non-dirty (file exists on HDFS)
    {
        // For binary block, these are: SequenceFileInputFormat.class, MatrixIndexes.class, MatrixBlock.class
        if (inputInfo == InputInfo.BinaryBlockInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //note: this copy is still required in Spark 1.4 because spark hands out whatever the inputformat
            //recordreader returns; the javadoc explicitly recommend to copy all key/value pairs
            //cp is workaround for read bug
            rdd = SparkUtils.copyBinaryBlockMatrix((JavaPairRDD<MatrixIndexes, MatrixBlock>) rdd);
        } else if (inputInfo == InputInfo.TextCellInputInfo || inputInfo == InputInfo.CSVInputInfo || inputInfo == InputInfo.MatrixMarketInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
        } else if (inputInfo == InputInfo.BinaryCellInputInfo) {
            rdd = sc.hadoopFile(mo.getFileName(), inputInfo.inputFormatClass, inputInfo.inputKeyClass, inputInfo.inputValueClass);
            //cp is workaround for read bug
            rdd = ((JavaPairRDD<MatrixIndexes, MatrixCell>) rdd).mapToPair(new CopyBinaryCellFunction());
        } else {
            throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
        }
        //keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd, mo.getVarName());
        rddhandle.setHDFSFile(true);
        mo.setRDDHandle(rddhandle);
    }
    return rdd;
}
Also used : CopyBinaryCellFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyBinaryCellFunction) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Text(org.apache.hadoop.io.Text) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 3 with CopyTextInputFunction

use of org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction in project incubator-systemml by apache.

the class SparkExecutionContext method getRDDHandleForFrameObject.

/**
	 * FIXME: currently this implementation assumes matrix representations but frame signature
	 * in order to support the old transform implementation.
	 *
	 * @param fo frame object
	 * @param inputInfo input info
	 * @return JavaPairRDD handle for a frame object
	 * @throws DMLRuntimeException if DMLRuntimeException occurs
	 */
@SuppressWarnings("unchecked")
public JavaPairRDD<?, ?> getRDDHandleForFrameObject(FrameObject fo, InputInfo inputInfo) throws DMLRuntimeException {
    //NOTE: MB this logic should be integrated into FrameObject
    //However, for now we cannot assume that spark libraries are
    //always available and hence only store generic references in
    //matrix object while all the logic is in the SparkExecContext
    InputInfo inputInfo2 = (inputInfo == InputInfo.BinaryBlockInputInfo) ? InputInfo.BinaryBlockFrameInputInfo : inputInfo;
    JavaSparkContext sc = getSparkContext();
    JavaPairRDD<?, ?> rdd = null;
    //rdd operations if already executed and cached
    if (fo.getRDDHandle() != null && (fo.getRDDHandle().isCheckpointRDD() || !fo.isCached(false))) {
        //return existing rdd handling (w/o input format change)
        rdd = fo.getRDDHandle().getRDD();
    } else //CASE 2: dirty in memory data or cached result of rdd operations
    if (fo.isDirty() || fo.isCached(false)) {
        //get in-memory matrix block and parallelize it
        //w/ guarded parallelize (fallback to export, rdd from file if too large)
        MatrixCharacteristics mc = fo.getMatrixCharacteristics();
        boolean fromFile = false;
        if (!OptimizerUtils.checkSparkCollectMemoryBudget(mc, 0) || !_parRDDs.reserve(OptimizerUtils.estimatePartitionedSizeExactSparsity(mc))) {
            if (fo.isDirty()) {
                //write only if necessary
                fo.exportData();
            }
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            //cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, FrameBlock>) rdd).mapToPair(new CopyFrameBlockPairFunction());
            fromFile = true;
        } else {
            //default case
            //pin frame in memory
            FrameBlock fb = fo.acquireRead();
            rdd = toFrameJavaPairRDD(sc, fb);
            //unpin frame
            fo.release();
            _parRDDs.registerRDD(rdd.id(), OptimizerUtils.estimatePartitionedSizeExactSparsity(mc), true);
        }
        //keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd, fo.getVarName());
        rddhandle.setHDFSFile(fromFile);
        fo.setRDDHandle(rddhandle);
    } else //CASE 3: non-dirty (file exists on HDFS)
    {
        // For binary block, these are: SequenceFileInputFormat.class, MatrixIndexes.class, MatrixBlock.class
        if (inputInfo2 == InputInfo.BinaryBlockFrameInputInfo) {
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            //note: this copy is still required in Spark 1.4 because spark hands out whatever the inputformat
            //recordreader returns; the javadoc explicitly recommend to copy all key/value pairs
            //cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, FrameBlock>) rdd).mapToPair(new CopyFrameBlockPairFunction());
        } else if (inputInfo2 == InputInfo.TextCellInputInfo || inputInfo2 == InputInfo.CSVInputInfo || inputInfo2 == InputInfo.MatrixMarketInputInfo) {
            rdd = sc.hadoopFile(fo.getFileName(), inputInfo2.inputFormatClass, inputInfo2.inputKeyClass, inputInfo2.inputValueClass);
            //cp is workaround for read bug
            rdd = ((JavaPairRDD<LongWritable, Text>) rdd).mapToPair(new CopyTextInputFunction());
        } else if (inputInfo2 == InputInfo.BinaryCellInputInfo) {
            throw new DMLRuntimeException("Binarycell not supported for frames.");
        } else {
            throw new DMLRuntimeException("Incorrect input format in getRDDHandleForVariable");
        }
        //keep rdd handle for future operations on it
        RDDObject rddhandle = new RDDObject(rdd, fo.getVarName());
        rddhandle.setHDFSFile(true);
        fo.setRDDHandle(rddhandle);
    }
    return rdd;
}
Also used : CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Text(org.apache.hadoop.io.Text) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) CopyFrameBlockPairFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockPairFunction) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 4 with CopyTextInputFunction

use of org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction in project incubator-systemml by apache.

the class MLContextConversionUtil method javaRDDStringIJVToMatrixObject.

/**
	 * Convert a {@code JavaRDD<String>} in IJV format to a {@code MatrixObject}
	 * . Note that metadata is required for IJV format.
	 * 
	 * @param variableName
	 *            name of the variable associated with the matrix
	 * @param javaRDD
	 *            the Java RDD of strings
	 * @param matrixMetadata
	 *            matrix metadata
	 * @return the {@code JavaRDD<String>} converted to a {@code MatrixObject}
	 */
public static MatrixObject javaRDDStringIJVToMatrixObject(String variableName, JavaRDD<String> javaRDD, MatrixMetadata matrixMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (matrixMetadata != null) ? matrixMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    MatrixObject matrixObject = new MatrixObject(ValueType.DOUBLE, OptimizerUtils.getUniqueTempFileName(), new MatrixFormatMetaData(mc, OutputInfo.TextCellOutputInfo, InputInfo.TextCellInputInfo));
    JavaPairRDD<LongWritable, Text> javaPairRDD2 = javaPairRDD.mapToPair(new CopyTextInputFunction());
    matrixObject.setRDDHandle(new RDDObject(javaPairRDD2, variableName));
    return matrixObject;
}
Also used : CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ConvertStringToLongTextPair(org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) MatrixFormatMetaData(org.apache.sysml.runtime.matrix.MatrixFormatMetaData) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Example 5 with CopyTextInputFunction

use of org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction in project incubator-systemml by apache.

the class MLContextConversionUtil method javaRDDStringCSVToMatrixObject.

/**
	 * Convert a {@code JavaRDD<String>} in CSV format to a {@code MatrixObject}
	 * 
	 * @param variableName
	 *            name of the variable associated with the matrix
	 * @param javaRDD
	 *            the Java RDD of strings
	 * @param matrixMetadata
	 *            matrix metadata
	 * @return the {@code JavaRDD<String>} converted to a {@code MatrixObject}
	 */
public static MatrixObject javaRDDStringCSVToMatrixObject(String variableName, JavaRDD<String> javaRDD, MatrixMetadata matrixMetadata) {
    JavaPairRDD<LongWritable, Text> javaPairRDD = javaRDD.mapToPair(new ConvertStringToLongTextPair());
    MatrixCharacteristics mc = (matrixMetadata != null) ? matrixMetadata.asMatrixCharacteristics() : new MatrixCharacteristics();
    MatrixObject matrixObject = new MatrixObject(ValueType.DOUBLE, OptimizerUtils.getUniqueTempFileName(), new MatrixFormatMetaData(mc, OutputInfo.CSVOutputInfo, InputInfo.CSVInputInfo));
    JavaPairRDD<LongWritable, Text> javaPairRDD2 = javaPairRDD.mapToPair(new CopyTextInputFunction());
    matrixObject.setRDDHandle(new RDDObject(javaPairRDD2, variableName));
    return matrixObject;
}
Also used : CopyTextInputFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ConvertStringToLongTextPair(org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair) RDDObject(org.apache.sysml.runtime.instructions.spark.data.RDDObject) Text(org.apache.hadoop.io.Text) LongWritable(org.apache.hadoop.io.LongWritable) MatrixFormatMetaData(org.apache.sysml.runtime.matrix.MatrixFormatMetaData) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)

Aggregations

LongWritable (org.apache.hadoop.io.LongWritable)6 Text (org.apache.hadoop.io.Text)6 RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)6 CopyTextInputFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyTextInputFunction)6 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)6 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)4 ConvertStringToLongTextPair (org.apache.sysml.runtime.instructions.spark.functions.ConvertStringToLongTextPair)4 MatrixFormatMetaData (org.apache.sysml.runtime.matrix.MatrixFormatMetaData)4 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)3 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)2 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)2 ValueType (org.apache.sysml.parser.Expression.ValueType)2 FrameObject (org.apache.sysml.runtime.controlprogram.caching.FrameObject)2 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)2 CopyBinaryCellFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyBinaryCellFunction)1 CopyFrameBlockPairFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockPairFunction)1 InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo)1 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)1 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)1