Example 66 with OutputInfo

use of org.apache.sysml.runtime.matrix.data.OutputInfo in project systemml by apache.

the class Dag method getOutputInfo.

/**
 * Method that determines the output format for a given node.
 *
 * @param node low-level operator
 * @param cellModeOverride override mode
 * @return output info
 */
private static OutputInfo getOutputInfo(Lop node, boolean cellModeOverride) {
    if ((node.getDataType() == DataType.SCALAR && node.getExecType() == ExecType.CP) || node instanceof FunctionCallCP)
        return null;
    OutputInfo oinfo = null;
    OutputParameters oparams = node.getOutputParameters();
    if (oparams.isBlocked()) {
        if (!cellModeOverride)
            oinfo = OutputInfo.BinaryBlockOutputInfo;
        else {
            // output format is overridden, for example, due to recordReaderInstructions in the job
            oinfo = OutputInfo.BinaryCellOutputInfo;
            // reset the per-block dimensions to -1 in the node's output parameters to reflect the cell format
            try {
                oparams.setDimensions(oparams.getNumRows(), oparams.getNumCols(), -1, -1, oparams.getNnz(), oparams.getUpdateType());
            } catch (HopsException e) {
                throw new LopsException(node.printErrorLocation() + "error in getOutputInfo in Dag ", e);
            }
        }
    } else {
        if (oparams.getFormat() == Format.TEXT || oparams.getFormat() == Format.MM)
            oinfo = OutputInfo.TextCellOutputInfo;
        else if (oparams.getFormat() == Format.CSV) {
            oinfo = OutputInfo.CSVOutputInfo;
        } else {
            oinfo = OutputInfo.BinaryCellOutputInfo;
        }
    }
    /* TODO: instead of the following hard-coding, this information should be obtained from the Lops themselves */
    if (node.getType() == Type.SortKeys && node.getExecType() == ExecType.MR) {
        if (((SortKeys) node).getOpType() == SortKeys.OperationTypes.Indexes)
            oinfo = OutputInfo.BinaryBlockOutputInfo;
        else
            oinfo = OutputInfo.OutputInfoForSortOutput;
    } else if (node.getType() == Type.CombineBinary) {
        // Output format of CombineBinary (CB) depends on how the output is consumed
        CombineBinary combine = (CombineBinary) node;
        if (combine.getOperation() == org.apache.sysml.lops.CombineBinary.OperationTypes.PreSort) {
            oinfo = OutputInfo.OutputInfoForSortInput;
        } else if (combine.getOperation() == org.apache.sysml.lops.CombineBinary.OperationTypes.PreCentralMoment || combine.getOperation() == org.apache.sysml.lops.CombineBinary.OperationTypes.PreCovUnweighted || combine.getOperation() == org.apache.sysml.lops.CombineBinary.OperationTypes.PreGroupedAggUnweighted) {
            oinfo = OutputInfo.WeightedPairOutputInfo;
        }
    } else if (node.getType() == Type.CombineTernary) {
        oinfo = OutputInfo.WeightedPairOutputInfo;
    } else if (node.getType() == Type.CentralMoment || node.getType() == Type.CoVariance) {
        // CM and COV MR jobs always operate in "cell mode",
        // and their output is always in cell format
        oinfo = OutputInfo.BinaryCellOutputInfo;
    }
    return oinfo;
}
Also used : OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) CombineBinary(org.apache.sysml.lops.CombineBinary) OutputParameters(org.apache.sysml.lops.OutputParameters) LopsException(org.apache.sysml.lops.LopsException) FunctionCallCP(org.apache.sysml.lops.FunctionCallCP) HopsException(org.apache.sysml.hops.HopsException)
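The branches above hard-code which OutputInfo each lop produces. For orientation, here is a minimal, hedged sketch (not part of the example above) of how OutputInfo constants can be resolved by name and paired with the InputInfo used to read the data back; it assumes the static helpers stringToOutputInfo, outputInfoToString, and getMatchingInputInfo on OutputInfo, consistent with the usage in Example 67 below.

import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.OutputInfo;

public class OutputInfoRoundTrip {
    public static void main(String[] args) {
        // resolve a format by its name, as the compiler does for write instructions
        OutputInfo oinfo = OutputInfo.stringToOutputInfo("binaryblock");
        // each OutputInfo has a counterpart InputInfo for reading the data back
        InputInfo iinfo = OutputInfo.getMatchingInputInfo(oinfo);
        System.out.println(OutputInfo.outputInfoToString(oinfo));    // expected: binaryblock
        System.out.println(iinfo == InputInfo.BinaryBlockInputInfo); // expected: true
    }
}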

Example 67 with OutputInfo

use of org.apache.sysml.runtime.matrix.data.OutputInfo in project systemml by apache.

the class WriteSPInstruction method processInstruction.

@Override
public void processInstruction(ExecutionContext ec) {
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    // get filename (literal or variable expression)
    String fname = ec.getScalarInput(input2.getName(), ValueType.STRING, input2.isLiteral()).getStringValue();
    String desc = ec.getScalarInput(input4.getName(), ValueType.STRING, input4.isLiteral()).getStringValue();
    formatProperties.setDescription(desc);
    ValueType[] schema = (input1.getDataType() == DataType.FRAME) ? sec.getFrameObject(input1.getName()).getSchema() : null;
    try {
        // if the file already exists on HDFS, remove it.
        MapReduceTool.deleteFileIfExistOnHDFS(fname);
        // prepare output info according to meta data
        String outFmt = input3.getName();
        OutputInfo oi = OutputInfo.stringToOutputInfo(outFmt);
        // core matrix/frame write
        if (input1.getDataType() == DataType.MATRIX)
            processMatrixWriteInstruction(sec, fname, oi);
        else
            processFrameWriteInstruction(sec, fname, oi, schema);
    } catch (IOException ex) {
        throw new DMLRuntimeException("Failed to process write instruction", ex);
    }
}
Also used : OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) ValueType(org.apache.sysml.parser.Expression.ValueType) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)
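The instruction resolves the target format purely from the name carried in input3 via OutputInfo.stringToOutputInfo. As a hedged, standalone sketch of the same resolution outside Spark, the following writes a small in-memory matrix through a writer created for the resolved format; MatrixWriterFactory.createMatrixWriter and the writeMatrixToHDFS signature are assumptions based on the SystemML I/O API, and the target path is hypothetical.

import org.apache.sysml.runtime.io.MatrixWriter;
import org.apache.sysml.runtime.io.MatrixWriterFactory;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.util.MapReduceTool;

public class WriteWithOutputInfo {
    public static void main(String[] args) throws Exception {
        String fname = "/tmp/A.csv";                  // hypothetical output path
        MapReduceTool.deleteFileIfExistOnHDFS(fname); // same idempotent delete as above
        OutputInfo oi = OutputInfo.stringToOutputInfo("csv");
        MatrixBlock mb = new MatrixBlock(2, 2, false); // small dense 2x2 block
        mb.quickSetValue(0, 0, 1.0);
        mb.quickSetValue(1, 1, 2.0);
        // assumed factory method: create a writer matching the output format
        MatrixWriter writer = MatrixWriterFactory.createMatrixWriter(oi);
        writer.writeMatrixToHDFS(mb, fname, 2, 2, 1000, 1000, 2);
    }
}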

Example 68 with OutputInfo

use of org.apache.sysml.runtime.matrix.data.OutputInfo in project systemml by apache.

the class FrameConverterTest method runConverter.

@SuppressWarnings("unchecked")
private static void runConverter(ConvType type, MatrixCharacteristics mc, MatrixCharacteristics mcMatrix, List<ValueType> schema, String fnameIn, String fnameOut) throws IOException {
    SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
    JavaSparkContext sc = sec.getSparkContext();
    ValueType[] lschema = schema.toArray(new ValueType[0]);
    MapReduceTool.deleteFileIfExistOnHDFS(fnameOut);
    switch(type) {
        case CSV2BIN:
            {
                InputInfo iinfo = InputInfo.CSVInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.csvToBinaryBlock(sc, rddIn, mc, null, false, separator, false, 0).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2CSV:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
                JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
                CSVFileFormatProperties fprop = new CSVFileFormatProperties();
                JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToCsv(rddIn2, mc, fprop, true);
                rddOut.saveAsTextFile(fnameOut);
                break;
            }
        case TXTCELL2BIN:
            {
                InputInfo iinfo = InputInfo.TextCellInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.textCellToBinaryBlock(sc, rddIn, mc, lschema).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2TXTCELL:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
                JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
                JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToTextCell(rddIn2, mc);
                rddOut.saveAsTextFile(fnameOut);
                break;
            }
        case MAT2BIN:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<MatrixIndexes, MatrixBlock> rddIn = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.matrixBlockToBinaryBlock(sc, rddIn, mcMatrix);
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2MAT:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
                JavaPairRDD<MatrixIndexes, MatrixBlock> rddOut = FrameRDDConverterUtils.binaryBlockToMatrixBlock(rddIn, mc, mcMatrix);
                rddOut.saveAsHadoopFile(fnameOut, MatrixIndexes.class, MatrixBlock.class, oinfo.outputFormatClass);
                break;
            }
        case DFRM2BIN:
            {
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                // Create DataFrame
                SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
                StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(lschema, false);
                JavaRDD<Row> rowRDD = FrameRDDConverterUtils.csvToRowRDD(sc, fnameIn, separator, lschema);
                Dataset<Row> df = sparkSession.createDataFrame(rowRDD, dfSchema);
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, false).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        case BIN2DFRM:
            {
                InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
                OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
                JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
                SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
                Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(sparkSession, rddIn, mc, lschema);
                // convert the DataFrame back to binary block, so the original binary input
                // can be compared against the binary -> DataFrame -> binary round trip
                JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, true).mapToPair(new LongFrameToLongWritableFrameFunction());
                rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
                break;
            }
        default:
            throw new RuntimeException("Unsuported converter type: " + type.toString());
    }
    sec.close();
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) ValueType(org.apache.sysml.parser.Expression.ValueType) MatrixIndexes(org.apache.sysml.runtime.matrix.data.MatrixIndexes) Dataset(org.apache.spark.sql.Dataset) Text(org.apache.hadoop.io.Text) JavaRDD(org.apache.spark.api.java.JavaRDD) OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) LongWritableFrameToLongFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongWritableFrameToLongFrameFunction) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) LongFrameToLongWritableFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction) CopyFrameBlockPairFunction(org.apache.sysml.runtime.instructions.spark.functions.CopyFrameBlockPairFunction)
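Every branch above wires sc.hadoopFile with the Hadoop classes carried on the InputInfo (the public fields inputFormatClass, inputKeyClass, inputValueClass used in the example). Below is a minimal sketch that just prints those triples for the formats this test exercises; InputInfo.inputInfoToString is an assumption, named as the counterpart of OutputInfo's string helper.

import org.apache.sysml.runtime.matrix.data.InputInfo;

public class InspectInputInfos {
    public static void main(String[] args) {
        InputInfo[] infos = { InputInfo.CSVInputInfo,
            InputInfo.TextCellInputInfo, InputInfo.BinaryBlockInputInfo };
        for (InputInfo ii : infos) {
            // the same three classes each converter branch passes to sc.hadoopFile
            System.out.println(InputInfo.inputInfoToString(ii)
                + " -> format=" + ii.inputFormatClass.getSimpleName()
                + ", key=" + ii.inputKeyClass.getSimpleName()
                + ", value=" + ii.inputValueClass.getSimpleName());
        }
    }
}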

Example 69 with OutputInfo

use of org.apache.sysml.runtime.matrix.data.OutputInfo in project systemml by apache.

the class FrameConverterTest method runFrameConverterTest.

/**
 * @param schema value types of the frame columns
 * @param type frame converter type under test
 */
private void runFrameConverterTest(ValueType[] schema, ConvType type) {
    RUNTIME_PLATFORM platformOld = rtplatform;
    DMLScript.rtplatform = RUNTIME_PLATFORM.SPARK;
    boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
    DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    try {
        TestConfiguration config = getTestConfiguration(TEST_NAME);
        loadTestConfiguration(config);
        // data generation
        double[][] A = getRandomMatrix(rows, schema.length, -10, 10, 0.9, 2373);
        // prepare input/output infos
        OutputInfo oinfo = null;
        InputInfo iinfo = null;
        switch(type) {
            case CSV2BIN:
            case DFRM2BIN:
                oinfo = OutputInfo.CSVOutputInfo;
                iinfo = InputInfo.BinaryBlockInputInfo;
                break;
            case BIN2CSV:
                oinfo = OutputInfo.BinaryBlockOutputInfo;
                iinfo = InputInfo.CSVInputInfo;
                break;
            case TXTCELL2BIN:
                oinfo = OutputInfo.TextCellOutputInfo;
                iinfo = InputInfo.BinaryBlockInputInfo;
                break;
            case BIN2TXTCELL:
                oinfo = OutputInfo.BinaryBlockOutputInfo;
                iinfo = InputInfo.TextCellInputInfo;
                break;
            case MAT2BIN:
            case BIN2DFRM:
                oinfo = OutputInfo.BinaryBlockOutputInfo;
                iinfo = InputInfo.BinaryBlockInputInfo;
                break;
            case BIN2MAT:
                oinfo = OutputInfo.BinaryBlockOutputInfo;
                iinfo = InputInfo.BinaryBlockInputInfo;
                break;
            default:
                throw new RuntimeException("Unsuported converter type: " + type.toString());
        }
        if (type == ConvType.MAT2BIN || type == ConvType.BIN2MAT)
            runMatrixConverterAndVerify(schema, A, type, iinfo, oinfo);
        else
            runConverterAndVerify(schema, A, type, iinfo, oinfo);
    } catch (Exception ex) {
        ex.printStackTrace();
        throw new RuntimeException(ex);
    } finally {
        DMLScript.rtplatform = platformOld;
        DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
    }
}
Also used : RUNTIME_PLATFORM(org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM) OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) TestConfiguration(org.apache.sysml.test.integration.TestConfiguration) IOException(java.io.IOException)
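Note that oinfo and iinfo are deliberately crossed in the switch above: oinfo is the format used to materialize the generated input on HDFS, while iinfo is the format used to read the converter's output back for verification. As a hedged sketch of a design alternative, the same mapping can be kept as a lookup table instead of a switch; ConvType here is a hypothetical standalone copy of the test's enum, and the pairs copy the cases above verbatim.

import java.util.EnumMap;
import java.util.Map;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.OutputInfo;

public class ConverterFormats {
    // mirrors the test's ConvType enum (hypothetical standalone copy)
    enum ConvType { CSV2BIN, BIN2CSV, TXTCELL2BIN, BIN2TXTCELL, MAT2BIN, BIN2MAT, DFRM2BIN, BIN2DFRM }

    static final Map<ConvType, OutputInfo> OUT = new EnumMap<>(ConvType.class);
    static final Map<ConvType, InputInfo>  IN  = new EnumMap<>(ConvType.class);
    static {
        // same pairs as the switch: OUT writes the test input, IN reads the converter output
        OUT.put(ConvType.CSV2BIN,     OutputInfo.CSVOutputInfo);         IN.put(ConvType.CSV2BIN,     InputInfo.BinaryBlockInputInfo);
        OUT.put(ConvType.DFRM2BIN,    OutputInfo.CSVOutputInfo);         IN.put(ConvType.DFRM2BIN,    InputInfo.BinaryBlockInputInfo);
        OUT.put(ConvType.BIN2CSV,     OutputInfo.BinaryBlockOutputInfo); IN.put(ConvType.BIN2CSV,     InputInfo.CSVInputInfo);
        OUT.put(ConvType.TXTCELL2BIN, OutputInfo.TextCellOutputInfo);    IN.put(ConvType.TXTCELL2BIN, InputInfo.BinaryBlockInputInfo);
        OUT.put(ConvType.BIN2TXTCELL, OutputInfo.BinaryBlockOutputInfo); IN.put(ConvType.BIN2TXTCELL, InputInfo.TextCellInputInfo);
        OUT.put(ConvType.MAT2BIN,     OutputInfo.BinaryBlockOutputInfo); IN.put(ConvType.MAT2BIN,     InputInfo.BinaryBlockInputInfo);
        OUT.put(ConvType.BIN2DFRM,    OutputInfo.BinaryBlockOutputInfo); IN.put(ConvType.BIN2DFRM,    InputInfo.BinaryBlockInputInfo);
        OUT.put(ConvType.BIN2MAT,     OutputInfo.BinaryBlockOutputInfo); IN.put(ConvType.BIN2MAT,     InputInfo.BinaryBlockInputInfo);
    }
}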

Aggregations

OutputInfo (org.apache.sysml.runtime.matrix.data.OutputInfo): 69 usages
MetaDataFormat (org.apache.sysml.runtime.matrix.MetaDataFormat): 34 usages
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 30 usages
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 28 usages
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 25 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 22 usages
IOException (java.io.IOException): 16 usages
ValueType (org.apache.sysml.parser.Expression.ValueType): 10 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 10 usages
HashMap (java.util.HashMap): 6 usages
FrameWriter (org.apache.sysml.runtime.io.FrameWriter): 6 usages
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 5 usages
Matrix (org.apache.sysml.udf.Matrix): 5 usages
Scalar (org.apache.sysml.udf.Scalar): 5 usages
ArrayList (java.util.ArrayList): 4 usages
Path (org.apache.hadoop.fs.Path): 4 usages
JobConf (org.apache.hadoop.mapred.JobConf): 4 usages
RunningJob (org.apache.hadoop.mapred.RunningJob): 4 usages
RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM): 4 usages
LopsException (org.apache.sysml.lops.LopsException): 4 usages