
Example 11 with CSVFileFormatProperties

Use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.

The class WriterTextCSV, method writeCSVMatrixToFile.

protected static void writeCSVMatrixToFile(Path path, JobConf job, FileSystem fs, MatrixBlock src, int rl, int ru, CSVFileFormatProperties props) throws IOException {
    boolean sparse = src.isInSparseFormat();
    int clen = src.getNumColumns();
    // create buffered writer
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
    try {
        // for obj reuse and preventing repeated buffer re-allocations
        StringBuilder sb = new StringBuilder();
        props = (props == null) ? new CSVFileFormatProperties() : props;
        String delim = props.getDelim();
        boolean csvsparse = props.isSparse();
        // Write header line, if needed
        if (props.hasHeader() && rl == 0) {
            // write row chunk-wise to prevent OOM on large number of columns
            for (int bj = 0; bj < clen; bj += BLOCKSIZE_J) {
                for (int j = bj; j < Math.min(clen, bj + BLOCKSIZE_J); j++) {
                    sb.append("C" + (j + 1));
                    if (j < clen - 1)
                        sb.append(delim);
                }
                br.write(sb.toString());
                sb.setLength(0);
            }
            sb.append('\n');
            br.write(sb.toString());
            sb.setLength(0);
        }
        // Write data lines
        if (sparse) { // SPARSE
            SparseBlock sblock = src.getSparseBlock();
            for (int i = rl; i < ru; i++) {
                // write row chunk-wise to prevent OOM on large number of columns
                int prev_jix = -1;
                if (sblock != null && i < sblock.numRows() && !sblock.isEmpty(i)) {
                    int pos = sblock.pos(i);
                    int alen = sblock.size(i);
                    int[] aix = sblock.indexes(i);
                    double[] avals = sblock.values(i);
                    for (int j = pos; j < pos + alen; j++) {
                        int jix = aix[j];
                        // output empty fields, if needed
                        for (int j2 = prev_jix; j2 < jix - 1; j2++) {
                            if (!csvsparse)
                                sb.append('0');
                            sb.append(delim);
                            // flush buffered string
                            if (j2 % BLOCKSIZE_J == 0) {
                                br.write(sb.toString());
                                sb.setLength(0);
                            }
                        }
                        // output the value (non-zero)
                        sb.append(avals[j]);
                        if (jix < clen - 1)
                            sb.append(delim);
                        br.write(sb.toString());
                        sb.setLength(0);
                        // flush buffered string
                        if (jix % BLOCKSIZE_J == 0) {
                            br.write(sb.toString());
                            sb.setLength(0);
                        }
                        prev_jix = jix;
                    }
                }
                // In case of an empty row, output (clen-1) empty fields
                for (int bj = prev_jix + 1; bj < clen; bj += BLOCKSIZE_J) {
                    for (int j = bj; j < Math.min(clen, bj + BLOCKSIZE_J); j++) {
                        if (!csvsparse)
                            sb.append('0');
                        if (j < clen - 1)
                            sb.append(delim);
                    }
                    br.write(sb.toString());
                    sb.setLength(0);
                }
                sb.append('\n');
                br.write(sb.toString());
                sb.setLength(0);
            }
        } else { // DENSE
            for (int i = rl; i < ru; i++) {
                // write row chunk-wise to prevent OOM on large number of columns
                for (int bj = 0; bj < clen; bj += BLOCKSIZE_J) {
                    for (int j = bj; j < Math.min(clen, bj + BLOCKSIZE_J); j++) {
                        double lvalue = src.getValueDenseUnsafe(i, j);
                        if (lvalue != 0) // for nnz
                            sb.append(lvalue);
                        else if (!csvsparse)
                            sb.append('0');
                        if (j != clen - 1)
                            sb.append(delim);
                    }
                    br.write(sb.toString());
                    sb.setLength(0);
                }
                sb.append('\n');
                // same as append
                br.write(sb.toString());
                sb.setLength(0);
            }
        }
    } finally {
        IOUtilFunctions.closeSilently(br);
    }
}
Also used: CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties), OutputStreamWriter (java.io.OutputStreamWriter), SparseBlock (org.apache.sysml.runtime.matrix.data.SparseBlock), BufferedWriter (java.io.BufferedWriter)
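The pattern worth noting above is the chunk-wise flush: instead of accumulating a whole row in the StringBuilder, the writer hands the buffer to the BufferedWriter every BLOCKSIZE_J columns and resets it, which keeps memory bounded for very wide matrices. A minimal standalone sketch of that pattern follows; the class name, method, and BLOCKSIZE_J value are illustrative, not the SystemML API.

import java.io.IOException;
import java.io.Writer;

public class ChunkedRowWriter {
    // Illustrative chunk size; SystemML's actual BLOCKSIZE_J constant may differ.
    private static final int BLOCKSIZE_J = 32;

    /** Writes one dense row as delimited text, flushing the buffer chunk-wise. */
    public static void writeRow(Writer out, double[] row, String delim) throws IOException {
        StringBuilder sb = new StringBuilder();
        for (int bj = 0; bj < row.length; bj += BLOCKSIZE_J) {
            for (int j = bj; j < Math.min(row.length, bj + BLOCKSIZE_J); j++) {
                sb.append(row[j]);
                if (j < row.length - 1)
                    sb.append(delim);
            }
            // hand the chunk to the underlying writer and reuse the builder,
            // so the in-memory buffer never grows with the number of columns
            out.write(sb.toString());
            sb.setLength(0);
        }
        out.write("\n");
    }
}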

Example 12 with CSVFileFormatProperties

Use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.

The class FrameReadWriteTest, method runFrameReadWriteTest.

/**
 * @param oinfo    output info (format) under test
 * @param schema1  value-type schema of the first frame
 * @param schema2  value-type schema of the second frame
 * @param parallel whether parallel text/binary read-write is enabled
 */
private void runFrameReadWriteTest(OutputInfo oinfo, ValueType[] schema1, ValueType[] schema2, boolean parallel) {
    boolean oldParText = CompilerConfig.FLAG_PARREADWRITE_TEXT;
    boolean oldParBin = CompilerConfig.FLAG_PARREADWRITE_BINARY;
    try {
        CompilerConfig.FLAG_PARREADWRITE_TEXT = parallel;
        CompilerConfig.FLAG_PARREADWRITE_BINARY = parallel;
        ConfigurationManager.setGlobalConfig(new CompilerConfig());
        // data generation
        double[][] A = getRandomMatrix(rows, schema1.length, -10, 10, 0.9, 2373);
        double[][] B = getRandomMatrix(rows, schema2.length, -10, 10, 0.9, 129);
        // Initialize the frame data.
        // init data frame 1
        FrameBlock frame1 = new FrameBlock(schema1);
        initFrameData(frame1, A, schema1);
        // init data frame 2
        FrameBlock frame2 = new FrameBlock(schema2);
        initFrameData(frame2, B, schema2);
        // Write frame data to disk
        CSVFileFormatProperties fprop = new CSVFileFormatProperties();
        fprop.setDelim(DELIMITER);
        fprop.setHeader(HEADER);
        writeAndVerifyData(oinfo, frame1, frame2, fprop);
    } catch (Exception ex) {
        ex.printStackTrace();
        throw new RuntimeException(ex);
    } finally {
        CompilerConfig.FLAG_PARREADWRITE_TEXT = oldParText;
        CompilerConfig.FLAG_PARREADWRITE_BINARY = oldParBin;
        ConfigurationManager.setGlobalConfig(new CompilerConfig());
    }
}
Also used: CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties), FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock), CompilerConfig (org.apache.sysml.conf.CompilerConfig), IOException (java.io.IOException)
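The fprop object built here is the same property bag the writer in Example 11 consumes through getDelim(), hasHeader(), and isSparse(). A short hypothetical helper (not part of the test) showing that setup, with literal values standing in for the test's DELIMITER and HEADER constants, which are class fields not shown in this excerpt:

// Hypothetical helper: builds the CSV properties with explicit values
// in place of the test's DELIMITER and HEADER constants.
private static CSVFileFormatProperties csvProps(String delim, boolean header) {
    CSVFileFormatProperties fprop = new CSVFileFormatProperties();
    fprop.setDelim(delim);   // read back by the writer in Example 11 via getDelim()
    fprop.setHeader(header); // read back via hasHeader() to emit the "C1,C2,..." header row
    // isSparse() stays at its default and controls whether zero cells are written or skipped
    return fprop;
}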

Example 13 with CSVFileFormatProperties

Use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.

The class FrameCSVReadWriteTest, method runCSVQuotesReadWriteTest.

/**
 * @param rt   runtime platform under test
 * @param ofmt output format (only "csv" is supported by this test)
 */
private void runCSVQuotesReadWriteTest(RUNTIME_PLATFORM rt, String ofmt) {
    // set runtime platform
    RUNTIME_PLATFORM rtold = rtplatform;
    rtplatform = rt;
    boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
    if (rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    if (!ofmt.equals("csv"))
        throw new RuntimeException("Unsupported test output format");
    try {
        getAndLoadTestConfiguration(TEST_NAME1);
        String HOME = SCRIPT_DIR + TEST_DIR;
        fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
        programArgs = new String[] { "-explain", "-args", HOME + "input/" + DATASET, output("R") };
        runTest(true, false, null, -1);
        // read input/output and compare
        FrameReader reader1 = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo, new CSVFileFormatProperties(false, ",", false));
        FrameBlock fb1 = reader1.readFrameFromHDFS(HOME + "input/" + DATASET, -1L, -1L);
        FrameReader reader2 = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo);
        FrameBlock fb2 = reader2.readFrameFromHDFS(output("R"), -1L, -1L);
        String[][] R1 = DataConverter.convertToStringFrame(fb1);
        String[][] R2 = DataConverter.convertToStringFrame(fb2);
        TestUtils.compareFrames(R1, R2, R1.length, R1[0].length);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        rtplatform = rtold;
        DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
    }
}
Also used: RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM), CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties), FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock), FrameReader (org.apache.sysml.runtime.io.FrameReader)
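The interesting part is that the raw input is read with explicit CSV properties (the three constructor arguments appear to be the header flag, delimiter, and sparse indicator) while the script output is read with factory defaults, and both are compared cell-by-cell as strings. A hypothetical helper condensing that read-and-compare step, using only calls shown above, with the paths passed in rather than taken from the test harness:

// Hypothetical helper, not part of the test.
private static void compareCsvFrames(String inputPath, String outputPath) throws Exception {
    // explicit properties for the raw input (presumably: no header, comma delimiter, non-sparse)
    FrameReader rawReader = FrameReaderFactory.createFrameReader(
            InputInfo.CSVInputInfo, new CSVFileFormatProperties(false, ",", false));
    FrameReader outReader = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo);
    // the -1L arguments leave the frame dimensions for the reader to determine
    String[][] R1 = DataConverter.convertToStringFrame(
            rawReader.readFrameFromHDFS(inputPath, -1L, -1L));
    String[][] R2 = DataConverter.convertToStringFrame(
            outReader.readFrameFromHDFS(outputPath, -1L, -1L));
    TestUtils.compareFrames(R1, R2, R1.length, R1[0].length);
}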

Example 14 with CSVFileFormatProperties

Use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.

The class TransformCSVFrameEncodeDecodeTest, method runTransformTest.

/**
 * @param rt   runtime platform under test
 * @param ofmt output format (only "csv" is supported by this test)
 */
private void runTransformTest(RUNTIME_PLATFORM rt, String ofmt) {
    // set runtime platform
    RUNTIME_PLATFORM rtold = rtplatform;
    rtplatform = rt;
    boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
    if (rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    if (!ofmt.equals("csv"))
        throw new RuntimeException("Unsupported test output format");
    try {
        getAndLoadTestConfiguration(TEST_NAME1);
        String HOME = SCRIPT_DIR + TEST_DIR;
        fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
        programArgs = new String[] { "-explain", "-args", HOME + "input/" + DATASET, output("R") };
        runTest(true, false, null, -1);
        // read input/output and compare
        FrameReader reader1 = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo, new CSVFileFormatProperties(false, ",", false));
        FrameBlock fb1 = reader1.readFrameFromHDFS(HOME + "input/" + DATASET, -1L, -1L);
        FrameReader reader2 = FrameReaderFactory.createFrameReader(InputInfo.CSVInputInfo);
        FrameBlock fb2 = reader2.readFrameFromHDFS(output("R"), -1L, -1L);
        String[][] R1 = DataConverter.convertToStringFrame(fb1);
        String[][] R2 = DataConverter.convertToStringFrame(fb2);
        TestUtils.compareFrames(R1, R2, R1.length, R1[0].length);
        if (rt == RUNTIME_PLATFORM.HYBRID_SPARK) {
            Assert.assertEquals("Wrong number of executed Spark instructions: " + Statistics.getNoOfExecutedSPInst(), new Long(2), new Long(Statistics.getNoOfExecutedSPInst()));
        }
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        rtplatform = rtold;
        DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
    }
}
Also used: RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM), CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties), FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock), FrameReader (org.apache.sysml.runtime.io.FrameReader)
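Apart from the Spark-instruction count assertion, the scaffolding is identical to Example 13: remember the active runtime platform, force a local Spark configuration when a Spark backend is requested, and restore both in the finally block. A hypothetical helper (runOnPlatform is not part of the test) isolating that save/set/restore pattern, assuming the same rtplatform field and DMLScript flag used above:

// Hypothetical helper isolating the platform-switch scaffolding shared by these tests.
private void runOnPlatform(RUNTIME_PLATFORM rt, Runnable body) {
    RUNTIME_PLATFORM rtold = rtplatform;             // remember the active platform
    boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
    try {
        rtplatform = rt;
        if (rt == RUNTIME_PLATFORM.SPARK || rt == RUNTIME_PLATFORM.HYBRID_SPARK)
            DMLScript.USE_LOCAL_SPARK_CONFIG = true; // use a local Spark setup for the test run
        body.run();                                  // run the DML script and check its CSV output
    } finally {
        rtplatform = rtold;                          // restore for subsequent tests
        DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
    }
}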

Example 15 with CSVFileFormatProperties

Use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.

The class TransformCSVFrameEncodeReadTest, method runTransformTest.

/**
 * @param rt      runtime platform under test
 * @param ofmt    output format (only "csv" is supported by this test)
 * @param subset  whether to pass only a subset (4 of 13) of the dataset rows to the script
 * @param parRead whether to read the result back with the parallel CSV frame reader
 */
private void runTransformTest(RUNTIME_PLATFORM rt, String ofmt, boolean subset, boolean parRead) {
    // set runtime platform
    RUNTIME_PLATFORM rtold = rtplatform;
    rtplatform = rt;
    boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
    if (rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    if (!ofmt.equals("csv"))
        throw new RuntimeException("Unsupported test output format");
    try {
        getAndLoadTestConfiguration(TEST_NAME1);
        String HOME = SCRIPT_DIR + TEST_DIR;
        int nrows = subset ? 4 : 13;
        fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
        programArgs = new String[] { "-explain", "-stats", "-args", HOME + "input/" + DATASET, String.valueOf(nrows), output("R") };
        runTest(true, false, null, -1);
        // read input/output and compare
        FrameReader reader2 = parRead ? new FrameReaderTextCSVParallel(new CSVFileFormatProperties()) : new FrameReaderTextCSV(new CSVFileFormatProperties());
        FrameBlock fb2 = reader2.readFrameFromHDFS(output("R"), -1L, -1L);
        System.out.println(DataConverter.toString(fb2));
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        rtplatform = rtold;
        DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
    }
}
Also used: RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM), FrameReaderTextCSVParallel (org.apache.sysml.runtime.io.FrameReaderTextCSVParallel), CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties), FrameReaderTextCSV (org.apache.sysml.runtime.io.FrameReaderTextCSV), FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock), FrameReader (org.apache.sysml.runtime.io.FrameReader)
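Unlike Examples 13 and 14, the reader here is instantiated directly instead of through FrameReaderFactory, which is what lets the parRead flag choose between the single-threaded and the parallel CSV implementation. A small hypothetical helper capturing that choice, using only the constructors shown above:

// Hypothetical helper mirroring the parRead switch above; default CSV properties
// are used, as in the test.
private static FrameReader createCsvFrameReader(boolean parallel) {
    CSVFileFormatProperties props = new CSVFileFormatProperties();
    return parallel
        ? new FrameReaderTextCSVParallel(props)  // parallel variant of the CSV frame reader
        : new FrameReaderTextCSV(props);         // single-threaded CSV frame reader
}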

Aggregations

CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties): 25 usages
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 11 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 10 usages
RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM): 9 usages
FrameReader (org.apache.sysml.runtime.io.FrameReader): 8 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 7 usages
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 6 usages
LongWritable (org.apache.hadoop.io.LongWritable): 4 usages
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 4 usages
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 4 usages
IOException (java.io.IOException): 3 usages
Text (org.apache.hadoop.io.Text): 3 usages
JavaRDD (org.apache.spark.api.java.JavaRDD): 3 usages
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 3 usages
BufferedWriter (java.io.BufferedWriter): 2 usages
OutputStreamWriter (java.io.OutputStreamWriter): 2 usages
ArrayList (java.util.ArrayList): 2 usages
ValueType (org.apache.sysml.parser.Expression.ValueType): 2 usages
LongFrameToLongWritableFrameFunction (org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction): 2 usages
ReaderBinaryBlock (org.apache.sysml.runtime.io.ReaderBinaryBlock): 2 usages