Example 21 with OutputInfo

Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.

From class FrameConverterTest, method runConverter.

@SuppressWarnings("unchecked")
private static void runConverter(ConvType type, MatrixCharacteristics mc, MatrixCharacteristics mcMatrix, List<ValueType> schema, String fnameIn, String fnameOut) throws IOException {
    SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
    JavaSparkContext sc = sec.getSparkContext();
    ValueType[] lschema = schema.toArray(new ValueType[0]);
    MapReduceTool.deleteFileIfExistOnHDFS(fnameOut);
    switch(type) {
        case CSV2BIN:
            {
                InputInfo iinfo = InputInfo.CSVInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.csvToBinaryBlock(sc, rddIn, mc, null, false, separator, false, 0).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2CSV:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
                JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
                CSVFileFormatProperties fprop = new CSVFileFormatProperties();
                JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToCsv(rddIn2, mc, fprop, true);
                rddOut.saveAsTextFile(fnameOut);
                break;
            }
        case TXTCELL2BIN:
            {
                InputInfo iinfo = InputInfo.TextCellInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.textCellToBinaryBlock(sc, rddIn, mc, lschema).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2TXTCELL:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
                JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
                JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToTextCell(rddIn2, mc);
                rddOut.saveAsTextFile(fnameOut);
                break;
            }
        case MAT2BIN:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<MatrixIndexes, MatrixBlock> rddIn = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.matrixBlockToBinaryBlock(sc, rddIn, mcMatrix);
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2MAT:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
                JavaPairRDD<MatrixIndexes, MatrixBlock> rddOut = FrameRDDConverterUtils.binaryBlockToMatrixBlock(rddIn, mc, mcMatrix);
                rddOut.saveAsHadoopFile(fnameOut, MatrixIndexes.class, MatrixBlock.class, oinfo.outputFormatClass);
                break;
            }
        case DFRM2BIN:
            {
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                // Create DataFrame
                SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
                StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(lschema, false);
                JavaRDD<Row> rowRDD = FrameRDDConverterUtils.csvToRowRDD(sc, fnameIn, separator, lschema);
                Dataset<Row> df = sparkSession.createDataFrame(rowRDD, dfSchema);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, false).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2DFRM:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
                SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
                Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(sparkSession, rddIn, mc, lschema);
                // Convert the DataFrame back to binary blocks, so the round trip (binary -> DataFrame -> binary) can be compared against the original binary input
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, true).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        default:
            throw new RuntimeException("Unsuported converter type: " + type.toString());
    }
    sec.close();
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) ValueType(org.apache.sysml.parser.Expression.ValueType) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Dataset(org.apache.spark.sql.Dataset) Text(org.apache.hadoop.io.Text) JavaRDD(org.apache.spark.api.java.JavaRDD) OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) LongWritableFrameToLongFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongWritableFrameToLongFrameFunction) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) LongFrameToLongWritableFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction) CopyFrameBlockPairFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockPairFunction)
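
For illustration, the CSV2BIN case above can be pulled out into a standalone sketch. The file paths, the 1000 x 10 dimensions, and the comma separator are assumptions made for this example; the converter and save calls mirror the test verbatim, and the imports are those in the Also used list above.

@SuppressWarnings("unchecked")
private static void csvToBinaryBlockSketch() throws IOException {
    SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
    JavaSparkContext sc = sec.getSparkContext();
    // assumed target dimensions and blocksize of the frame (hypothetical values)
    MatrixCharacteristics mc = new MatrixCharacteristics(1000, 10, 1000, 10);
    InputInfo iinfo = InputInfo.CSVInputInfo;
    OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
    // read the raw CSV lines as (byte offset, line) pairs
    JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile("/tmp/frame.csv", iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
    // convert to binary frame blocks: no schema, no header, comma delimiter, no fill
    JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.csvToBinaryBlock(sc, rddIn, mc, null, false, ",", false, 0).mapToPair(new LongFrameToLongWritableFrameFunction());
    rddOut.saveAsHadoopFile("/tmp/frame.bin", LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
    sec.close();
}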

Example 22 with OutputInfo

Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.

From class FrameObject, method writeBlobToHDFS.

@Override
protected void writeBlobToHDFS(String fname, String ofmt, int rep, FileFormatProperties fprop) throws IOException, DMLRuntimeException {
    OutputInfo oinfo = OutputInfo.stringToOutputInfo(ofmt);
    FrameWriter writer = FrameWriterFactory.createFrameWriter(oinfo, fprop);
    writer.writeFrameToHDFS(_data, fname, getNumRows(), getNumColumns());
}
Also used : OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) FrameWriter(org.apache.sysml.runtime.io.FrameWriter)
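
A standalone sketch of the same writer lookup, using the factory overload without format properties (FrameWriterFactory comes from org.apache.sysml.runtime.io, as in the original class). The format name "csv" is an assumption for this example; SystemML also accepts names such as "textcell" and "binaryblock", though the exact set may vary by version.

private static void writeFrameSketch(FrameBlock data, String fname) throws IOException, DMLRuntimeException {
    // map a format name to its OutputInfo constant
    OutputInfo oinfo = OutputInfo.stringToOutputInfo("csv");
    // overload without FileFormatProperties; the format defaults apply
    FrameWriter writer = FrameWriterFactory.createFrameWriter(oinfo);
    writer.writeFrameToHDFS(data, fname, data.getNumRows(), data.getNumColumns());
}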

Example 23 with OutputInfo

Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.

From class MatrixObject, method writeBlobFromRDDtoHDFS.

@Override
protected void writeBlobFromRDDtoHDFS(RDDObject rdd, String fname, String outputFormat) throws IOException, DMLRuntimeException {
    // prepare output info
    MetaDataFormat iimd = (MetaDataFormat) _metaData;
    OutputInfo oinfo = (outputFormat != null ? OutputInfo.stringToOutputInfo(outputFormat) : InputInfo.getMatchingOutputInfo(iimd.getInputInfo()));
    // note: the write of an RDD to HDFS might trigger
    // lazy evaluation of pending transformations.
    long newnnz = SparkExecutionContext.writeRDDtoHDFS(rdd, fname, oinfo);
    _metaData.getMatrixCharacteristics().setNonZeros(newnnz);
}
Also used : OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat)
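
The fallback branch relies on InputInfo.getMatchingOutputInfo to pair the stored input format with its output counterpart. A minimal sketch of that pairing, assuming the natural mapping the method name implies:

private static void checkFormatPairing() {
    // binary block metadata maps back to binary block output
    assert InputInfo.getMatchingOutputInfo(InputInfo.BinaryBlockInputInfo) == OutputInfo.BinaryBlockOutputInfo;
    // text cell metadata maps back to text cell output
    assert InputInfo.getMatchingOutputInfo(InputInfo.TextCellInputInfo) == OutputInfo.TextCellOutputInfo;
    // csv metadata maps back to csv output
    assert InputInfo.getMatchingOutputInfo(InputInfo.CSVInputInfo) == OutputInfo.CSVOutputInfo;
}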

Example 24 with OutputInfo

Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.

From class MatrixObject, method writeBlobToHDFS.

/**
 * Writes in-memory matrix to HDFS in a specified format.
 */
@Override
protected void writeBlobToHDFS(String fname, String ofmt, int rep, FileFormatProperties fprop) throws IOException, DMLRuntimeException {
    long begin = 0;
    if (LOG.isTraceEnabled()) {
        LOG.trace(" Writing matrix to HDFS...  " + hashCode() + "  Path: " + fname + ", Format: " + (ofmt != null ? ofmt : "inferred from metadata"));
        begin = System.currentTimeMillis();
    }
    MetaDataFormat iimd = (MetaDataFormat) _metaData;
    if (_data != null) {
        // Get the dimension information from the metadata stored within MatrixObject
        MatrixCharacteristics mc = iimd.getMatrixCharacteristics();
        // Write the matrix to HDFS in requested format
        OutputInfo oinfo = (ofmt != null ? OutputInfo.stringToOutputInfo(ofmt) : InputInfo.getMatchingOutputInfo(iimd.getInputInfo()));
        // note: this override is only required in single-node mode (where the binarycell default blocksize may disagree with the configured blocksize)
        if (oinfo == OutputInfo.BinaryBlockOutputInfo && DMLScript.rtplatform == RUNTIME_PLATFORM.SINGLE_NODE && (mc.getRowsPerBlock() != ConfigurationManager.getBlocksize() || mc.getColsPerBlock() != ConfigurationManager.getBlocksize())) {
            DataConverter.writeMatrixToHDFS(_data, fname, oinfo, new MatrixCharacteristics(mc.getRows(), mc.getCols(), ConfigurationManager.getBlocksize(), ConfigurationManager.getBlocksize(), mc.getNonZeros()), rep, fprop);
        } else {
            DataConverter.writeMatrixToHDFS(_data, fname, oinfo, mc, rep, fprop);
        }
        if (LOG.isTraceEnabled())
            LOG.trace("Writing matrix to HDFS (" + fname + ") - COMPLETED... " + (System.currentTimeMillis() - begin) + " msec.");
    } else if (LOG.isTraceEnabled()) {
        LOG.trace("Writing matrix to HDFS (" + fname + ") - NOTHING TO WRITE (_data == null).");
    }
    if (DMLScript.STATISTICS)
        CacheStatistics.incrementHDFSWrites();
}
Also used : OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics)
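
The single-node special case above is essentially a blocksize override on write. A minimal sketch that factors the check out; needsBlocksizeOverride and withConfiguredBlocksize are hypothetical helper names introduced here for illustration:

// true if the metadata blocksize disagrees with the configured default blocksize
private static boolean needsBlocksizeOverride(MatrixCharacteristics mc) {
    int bs = ConfigurationManager.getBlocksize();
    return mc.getRowsPerBlock() != bs || mc.getColsPerBlock() != bs;
}

// same dimensions and non-zero count, rewritten to the configured blocksize
private static MatrixCharacteristics withConfiguredBlocksize(MatrixCharacteristics mc) {
    int bs = ConfigurationManager.getBlocksize();
    return new MatrixCharacteristics(mc.getRows(), mc.getCols(), bs, bs, mc.getNonZeros());
}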

Example 25 with OutputInfo

Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.

From class DataPartitionerRemoteReducer, method configure.

public void configure(JobConf job) {
    String fnameNew = MRJobConfiguration.getPartitioningFilename(job);
    OutputInfo oi = MRJobConfiguration.getPartitioningOutputInfo(job);
    if (oi == OutputInfo.TextCellOutputInfo)
        _reducer = new DataPartitionerReducerTextcell(job, fnameNew);
    else if (oi == OutputInfo.BinaryCellOutputInfo)
        _reducer = new DataPartitionerReducerBinarycell(job, fnameNew);
    else if (oi == OutputInfo.BinaryBlockOutputInfo)
        _reducer = new DataPartitionerReducerBinaryblock(job, fnameNew);
    else
        throw new RuntimeException("Unable to configure reducer with unknown output info: " + oi.toString());
}
Also used : OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo)
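
The dispatch above compares OutputInfo constants by reference, which is safe because they are shared singletons. The same dispatch can also be written as a lookup table; the sketch below is purely illustrative (the concrete reducer classes are internal to the original, so plain format names stand in for them):

private static String reducerNameFor(OutputInfo oi) {
    // the constants are shared singletons, so map lookup on the same instances is reliable
    java.util.Map<OutputInfo, String> dispatch = new java.util.HashMap<>();
    dispatch.put(OutputInfo.TextCellOutputInfo, "textcell");
    dispatch.put(OutputInfo.BinaryCellOutputInfo, "binarycell");
    dispatch.put(OutputInfo.BinaryBlockOutputInfo, "binaryblock");
    String name = dispatch.get(oi);
    if (name == null)
        throw new RuntimeException("Unable to configure reducer with unknown output info: " + oi);
    return name;
}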

Aggregations

OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo): 35 usages
MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat): 17 usages
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 15 usages
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 14 usages
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 13 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 11 usages
IOException (java.io.IOException): 8 usages
ValueType (org.apache.sysml.parser.Expression.ValueType): 5 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 5 usages
HashMap (java.util.HashMap): 3 usages
FrameWriter (org.apache.sysml.runtime.io.FrameWriter): 3 usages
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 3 usages
Matrix (org.apache.sysml.udf.Matrix): 3 usages
Scalar (org.apache.sysml.udf.Scalar): 3 usages
ArrayList (java.util.ArrayList): 2 usages
Path (org.apache.hadoop.fs.Path): 2 usages
JobConf (org.apache.hadoop.mapred.JobConf): 2 usages
RunningJob (org.apache.hadoop.mapred.RunningJob): 2 usages
RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM): 2 usages
LopsException (org.apache.sysml.lops.LopsException): 2 usages