use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.
the class TransformFrameEncodeDecodeTest method runTransformTest.
/**
 * @param rt runtime platform to run on
 * @param ofmt output format (only "csv" is supported)
 * @param type transform type (RECODE or DUMMY)
 * @param colnames whether the transform specification refers to column names
 */
private void runTransformTest(RUNTIME_PLATFORM rt, String ofmt, TransformType type, boolean colnames) {
    // set runtime platform
    RUNTIME_PLATFORM rtold = rtplatform;
    rtplatform = rt;
    boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
    if (rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    // set transform specification
    String SPEC = null;
    String DATASET = null;
    switch (type) {
        case RECODE:
            SPEC = colnames ? SPEC1b : SPEC1;
            DATASET = DATASET1;
            break;
        case DUMMY:
            SPEC = colnames ? SPEC2b : SPEC2;
            DATASET = DATASET1;
            break;
        default:
            throw new RuntimeException("Unsupported transform type for encode/decode test.");
    }
    if (!ofmt.equals("csv"))
        throw new RuntimeException("Unsupported test output format");
    try {
        getAndLoadTestConfiguration(TEST_NAME1);
        String HOME = SCRIPT_DIR + TEST_DIR;
        fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
        programArgs = new String[] { "-explain", "-nvargs", "DATA=" + HOME + "input/" + DATASET, "TFSPEC=" + HOME + "input/" + SPEC, "TFDATA=" + output("tfout"), "SEP=,", "OFMT=" + ofmt, "OSEP=," };
        // Originally OSEP was set to
        //   OSEP=","
        // Apache Commons CLI strips away the leading and trailing quotes, leaving us with
        //   OSEP=",
        // This is just a feature/bug and is reported in CLI-262,
        // though even a fix is unlikely to be backported to 1.2
        runTest(true, false, null, -1);
        // read input/output and compare
        FrameReader reader1 = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo, new CSVFileFormatProperties(true, ",", false));
        FrameBlock fb1 = reader1.readFrameFromHDFS(HOME + "input/" + DATASET, -1L, -1L);
        FrameReader reader2 = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo);
        FrameBlock fb2 = reader2.readFrameFromHDFS(output("tfout"), -1L, -1L);
        String[][] R1 = DataConverter.convertToStringFrame(fb1);
        String[][] R2 = DataConverter.convertToStringFrame(fb2);
        TestUtils.compareFrames(R1, R2, R1.length, R1[0].length);
        if (rt == RUNTIME_PLATFORM.HYBRID_SPARK) {
            Assert.assertEquals("Wrong number of executed Spark instructions: " + Statistics.getNoOfExecutedSPInst(), new Long(2), new Long(Statistics.getNoOfExecutedSPInst()));
        }
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        rtplatform = rtold;
        DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
    }
}
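For reference, a minimal standalone sketch of the read-and-compare pattern used above. The class name, method name, and path are hypothetical, the import locations are assumed to follow the usual incubator-systemml packages, and the third CSVFileFormatProperties constructor flag is simply left false to match the calls in this test:

import org.apache.sysml.runtime.io.FrameReader;
import org.apache.sysml.runtime.io.FrameReaderFactory;
import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.util.DataConverter;

public class CsvFrameReadSketch {
    // Reads a delimited text file into a String[][], as the comparison in the test above does.
    public static String[][] readCsvFrame(String path, boolean hasHeader, String delim) throws Exception {
        // Third flag left false, matching the constructor calls in the test above.
        CSVFileFormatProperties props = new CSVFileFormatProperties(hasHeader, delim, false);
        FrameReader reader = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo, props);
        // -1L/-1L: the number of rows and columns is not known up front.
        FrameBlock frame = reader.readFrameFromHDFS(path, -1L, -1L);
        return DataConverter.convertToStringFrame(frame);
    }
}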
use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.
the class CSVReadUnknownSizeTest method runCSVReadUnknownSizeTest.
/**
 * @param splitDags sets OptimizerUtils.ALLOW_SPLIT_HOP_DAGS
 * @param rewrites sets OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION
 */
private void runCSVReadUnknownSizeTest(boolean splitDags, boolean rewrites) {
    boolean oldFlagSplit = OptimizerUtils.ALLOW_SPLIT_HOP_DAGS;
    boolean oldFlagRewrites = OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION;
    try {
        getAndLoadTestConfiguration(TEST_NAME);
        /* This is for running the junit test the new way, i.e., construct the arguments directly */
        String HOME = SCRIPT_DIR + TEST_DIR;
        fullDMLScriptName = HOME + TEST_NAME + ".dml";
        programArgs = new String[] { "-explain", "-args", input("X"), output("R") };
        fullRScriptName = HOME + TEST_NAME + ".R";
        rCmd = "Rscript" + " " + fullRScriptName + " " + inputDir() + " " + expectedDir();
        OptimizerUtils.ALLOW_SPLIT_HOP_DAGS = splitDags;
        OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = rewrites;
        double[][] X = getRandomMatrix(rows, cols, -1, 1, 1.0d, 7);
        MatrixBlock mb = DataConverter.convertToMatrixBlock(X);
        MatrixCharacteristics mc = new MatrixCharacteristics(rows, cols, 1000, 1000);
        CSVFileFormatProperties fprop = new CSVFileFormatProperties();
        DataConverter.writeMatrixToHDFS(mb, input("X"), OutputInfo.CSVOutputInfo, mc, -1, fprop);
        // clear the dimensions so the metadata describes a CSV file of unknown size
        mc.set(-1, -1, -1, -1);
        MapReduceTool.writeMetaDataFile(input("X.mtd"), ValueType.DOUBLE, mc, OutputInfo.CSVOutputInfo, fprop);
        runTest(true, false, null, -1);
        // compare matrices
        HashMap<CellIndex, Double> dmlfile = readDMLMatrixFromHDFS("R");
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                Double tmp = dmlfile.get(new CellIndex(i + 1, j + 1));
                double expectedValue = mb.quickGetValue(i, j);
                double actualValue = (tmp == null) ? 0.0 : tmp;
                if (expectedValue != actualValue) {
                    throw new Exception(String.format("Value of cell (%d,%d) " + "(zero-based indices) in output file %s is %f, " + "but original value was %f", i, j, baseDirectory + OUTPUT_DIR + "R", actualValue, expectedValue));
                }
            }
        }
        // check expected number of compiled and executed MR jobs
        // note: with algebraic rewrites - unary op in reducer prevents job-level recompile
        // reblock, GMR
        int expectedNumCompiled = (rewrites && !splitDags) ? 2 : 3;
        int expectedNumExecuted = splitDags ? 0 : rewrites ? 2 : 2;
        checkNumCompiledMRJobs(expectedNumCompiled);
        checkNumExecutedMRJobs(expectedNumExecuted);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        OptimizerUtils.ALLOW_SPLIT_HOP_DAGS = oldFlagSplit;
        OptimizerUtils.ALLOW_ALGEBRAIC_SIMPLIFICATION = oldFlagRewrites;
    }
}
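Condensed from the setup above, a sketch of how a CSV file of unknown size is staged: the data is written with its real dimensions, but the companion .mtd metadata file is written after the dimensions have been cleared. The class name, method name, and path are hypothetical, the import locations are assumed, and the no-argument CSVFileFormatProperties is taken to give the same defaults as in the test (presumably comma-delimited, no header):

import org.apache.sysml.parser.Expression.ValueType;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.OutputInfo;
import org.apache.sysml.runtime.util.DataConverter;
import org.apache.sysml.runtime.util.MapReduceTool;

public class UnknownSizeCsvSketch {
    public static void stage(double[][] data, String csvPath) throws Exception {
        MatrixBlock mb = DataConverter.convertToMatrixBlock(data);
        MatrixCharacteristics mc = new MatrixCharacteristics(mb.getNumRows(), mb.getNumColumns(), 1000, 1000);
        CSVFileFormatProperties fprop = new CSVFileFormatProperties();
        // Write the CSV data itself with the true dimensions.
        DataConverter.writeMatrixToHDFS(mb, csvPath, OutputInfo.CSVOutputInfo, mc, -1, fprop);
        // Clear the dimensions so the metadata describes a matrix of unknown size.
        mc.set(-1, -1, -1, -1);
        MapReduceTool.writeMetaDataFile(csvPath + ".mtd", ValueType.DOUBLE, mc, OutputInfo.CSVOutputInfo, fprop);
    }
}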
use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.
the class FrameConverterTest method runConverter.
@SuppressWarnings("unchecked")
private static void runConverter(ConvType type, MatrixCharacteristics mc, MatrixCharacteristics mcMatrix, List<ValueType> schema, String fnameIn, String fnameOut) throws IOException {
    SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
    JavaSparkContext sc = sec.getSparkContext();
    ValueType[] lschema = schema.toArray(new ValueType[0]);
    MapReduceTool.deleteFileIfExistOnHDFS(fnameOut);
    switch (type) {
        case CSV2BIN: {
            InputInfo iinfo = InputInfo.CSVInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.csvToBinaryBlock(sc, rddIn, mc, null, false, separator, false, 0).mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        case BIN2CSV: {
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
            JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
            CSVFileFormatProperties fprop = new CSVFileFormatProperties();
            JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToCsv(rddIn2, mc, fprop, true);
            rddOut.saveAsTextFile(fnameOut);
            break;
        }
        case TXTCELL2BIN: {
            InputInfo iinfo = InputInfo.TextCellInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.textCellToBinaryBlock(sc, rddIn, mc, lschema).mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        case BIN2TXTCELL: {
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
            JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
            JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToTextCell(rddIn2, mc);
            rddOut.saveAsTextFile(fnameOut);
            break;
        }
        case MAT2BIN: {
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<MatrixIndexes, MatrixBlock> rddIn = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.matrixBlockToBinaryBlock(sc, rddIn, mcMatrix);
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        case BIN2MAT: {
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
            JavaPairRDD<MatrixIndexes, MatrixBlock> rddOut = FrameRDDConverterUtils.binaryBlockToMatrixBlock(rddIn, mc, mcMatrix);
            rddOut.saveAsHadoopFile(fnameOut, MatrixIndexes.class, MatrixBlock.class, oinfo.outputFormatClass);
            break;
        }
        case DFRM2BIN: {
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            // Create DataFrame
            SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
            StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(lschema, false);
            JavaRDD<Row> rowRDD = FrameRDDConverterUtils.csvToRowRDD(sc, fnameIn, separator, lschema);
            Dataset<Row> df = sparkSession.createDataFrame(rowRDD, dfSchema);
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, false).mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        case BIN2DFRM: {
            InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
            OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
            JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
            SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
            Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(sparkSession, rddIn, mc, lschema);
            // Convert the DataFrame back to binary blocks so the original binary data can be compared with the DF round trip
            JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, true).mapToPair(new LongFrameToLongWritableFrameFunction());
            rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
            break;
        }
        default:
            throw new RuntimeException("Unsupported converter type: " + type.toString());
    }
    sec.close();
}
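Condensed from the CSV2BIN branch above, a sketch of the core frame conversion outside the test driver. The class name, method name, and paths are hypothetical, the import locations are assumed, the test's separator field is replaced by a literal comma, and the inline comment on the csvToBinaryBlock arguments is inferred from the call above rather than taken from its documentation:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils;
import org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction;
import org.apache.sysml.runtime.matrix.MatrixCharacteristics;
import org.apache.sysml.runtime.matrix.data.FrameBlock;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.OutputInfo;

public class CsvToBinaryBlockSketch {
    @SuppressWarnings("unchecked")
    public static void convert(String csvIn, String binOut, long rows, long cols) throws Exception {
        SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
        JavaSparkContext sc = sec.getSparkContext();
        MatrixCharacteristics mc = new MatrixCharacteristics(rows, cols, 1000, 1000);
        InputInfo iinfo = InputInfo.CSVInputInfo;
        OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
        JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(csvIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
        // Trailing arguments appear to be: schema (null = all strings), hasHeader, delimiter, fill, fill value.
        JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.csvToBinaryBlock(sc, rddIn, mc, null, false, ",", false, 0).mapToPair(new LongFrameToLongWritableFrameFunction());
        rddOut.saveAsHadoopFile(binOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
        sec.close();
    }
}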
use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.
the class TransformFrameEncodeColmapTest method runTransformTest.
private void runTransformTest(String testname, RUNTIME_PLATFORM rt, String ofmt, boolean colnames) {
    // set runtime platform
    RUNTIME_PLATFORM rtold = rtplatform;
    rtplatform = rt;
    boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
    if (rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    // set transform specification
    String DATASET = DATASET1;
    String SPEC = colnames ? SPEC1b : SPEC1;
    if (!ofmt.equals("csv"))
        throw new RuntimeException("Unsupported test output format");
    try {
        getAndLoadTestConfiguration(testname);
        String HOME = SCRIPT_DIR + TEST_DIR;
        fullDMLScriptName = HOME + testname + ".dml";
        programArgs = new String[] { "-explain", "-nvargs", "DATA=" + HOME + "input/" + DATASET, "TFSPEC=" + HOME + "input/" + SPEC, "TFDATA=" + output("tfout"), "OFMT=" + ofmt, "OSEP=," };
        runTest(true, false, null, -1);
        // read input/output and compare
        FrameReader reader1 = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo, new CSVFileFormatProperties(true, ",", false));
        FrameBlock fb1 = reader1.readFrameFromHDFS(HOME + "input/" + DATASET, -1L, -1L);
        FrameReader reader2 = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo);
        FrameBlock fb2 = reader2.readFrameFromHDFS(output("tfout"), -1L, -1L);
        String[][] R1 = DataConverter.convertToStringFrame(fb1);
        String[][] R2 = DataConverter.convertToStringFrame(fb2);
        TestUtils.compareFrames(R1, R2, R1.length, R1[0].length);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        rtplatform = rtold;
        DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
    }
}
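As in the first test above, the original dataset is read with explicit CSV properties (header row present) while the transformed output is read with the no-properties overload, i.e. the reader's defaults. A two-line sketch of the two overloads side by side (imports as in the read sketch after the first test; that the no-properties overload defaults to headerless, comma-delimited CSV is an assumption to verify against FrameReaderFactory):

// Explicit properties for the input (with header) vs. reader defaults for the output.
FrameReader withProps = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo, new CSVFileFormatProperties(true, ",", false));
FrameReader withDefaults = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo);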
use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.
the class TransformFrameEncodeDecodeTokenTest method runTransformTest.
/**
 * @param rt runtime platform to run on
 * @param ofmt output format (only "csv" is supported)
 */
private void runTransformTest(RUNTIME_PLATFORM rt, String ofmt) {
    // set runtime platform
    RUNTIME_PLATFORM rtold = rtplatform;
    rtplatform = rt;
    boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
    if (rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    if (!ofmt.equals("csv"))
        throw new RuntimeException("Unsupported test output format");
    try {
        getAndLoadTestConfiguration(TEST_NAME1);
        String HOME = SCRIPT_DIR + TEST_DIR;
        fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
        programArgs = new String[] { "-explain", "-nvargs", "DATA=" + HOME + "input/" + DATASET1, "TFSPEC=" + HOME + "input/" + SPEC1, "TFDATA=" + output("tfout"), "SEP= ", "OFMT=" + ofmt, "OSEP= " };
        runTest(true, false, null, -1);
        // read input/output and compare
        FrameReader reader1 = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo, new CSVFileFormatProperties(false, " ", false));
        FrameBlock fb1 = reader1.readFrameFromHDFS(HOME + "input/" + DATASET1, -1L, -1L);
        FrameReader reader2 = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo, new CSVFileFormatProperties(false, " ", false));
        FrameBlock fb2 = reader2.readFrameFromHDFS(output("tfout"), -1L, -1L);
        String[][] R1 = DataConverter.convertToStringFrame(fb1);
        String[][] R2 = DataConverter.convertToStringFrame(fb2);
        TestUtils.compareFrames(R1, R2, R1.length, R1[0].length);
        if (rt == RUNTIME_PLATFORM.HYBRID_SPARK) {
            Assert.assertEquals("Wrong number of executed Spark instructions: " + Statistics.getNoOfExecutedSPInst(), new Long(2), new Long(Statistics.getNoOfExecutedSPInst()));
        }
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        rtplatform = rtold;
        DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
    }
}
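Unlike the earlier tests, both the original dataset and the transformed output are read with the same explicit properties here, since the token data is headerless and space-delimited and the no-properties reader defaults presumably would not match. A minimal sketch of that configuration (imports as in the read sketch after the first test; the path is a placeholder):

// Headerless, space-delimited token data; the flags mirror the constructor calls above.
CSVFileFormatProperties tokenProps = new CSVFileFormatProperties(false, " ", false);
FrameReader tokenReader = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo, tokenProps);
FrameBlock tokens = tokenReader.readFrameFromHDFS("/tmp/tokens.csv", -1L, -1L);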