Search in sources :

Example 21 with CSVFileFormatProperties

use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.

the class VariableCPInstruction method parseInstruction.

/**
 * Parses the string form of a variable-related CP instruction into a
 * {@link VariableCPInstruction}.
 *
 * <p>The string is split with {@code InstructionUtils.getInstructionPartsWithValueType};
 * {@code parts[0]} is the opcode and selects the operation code. The number of parts is
 * validated first, then the operands are decoded per operation code.
 *
 * @param str the instruction string
 * @return the parsed instruction
 * @throws DMLRuntimeException if the number of operands does not fit the opcode (and, for
 *         createvar/write, the format), or if the second operand of a
 *         remove-variable-and-file instruction is not of boolean value type
 */
public static VariableCPInstruction parseInstruction(String str) {
    String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
    String opcode = parts[0];
    VariableOperationCode voc = getVariableOperationCode(opcode);
    // validate the number of parts before any operand is decoded
    if (voc == VariableOperationCode.CreateVariable) {
        // parts[4] (data type) and parts[5] (format) are read unconditionally below, so six
        // parts are the minimum; the exact count is checked per format further down
        if (parts.length < 6)
            throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
    } else if (voc == VariableOperationCode.MoveVariable) {
        // mvvar tempA A; or mvvar mvar5 "data/out.mtx" "binary"
        if (parts.length != 3 && parts.length != 4)
            throw new DMLRuntimeException("Invalid number of operands in mvvar instruction: " + str);
    } else if (voc == VariableOperationCode.Write) {
        // Write instructions for csv files also include three additional parameters (hasHeader, delimiter, sparse)
        if (parts.length != 5 && parts.length != 8)
            throw new DMLRuntimeException("Invalid number of operands in write instruction: " + str);
    } else {
        // rmvar accepts any number of operands (all parts from index 1 on are added as
        // inputs below), so its arity is not checked
        if (voc != VariableOperationCode.RemoveVariable)
            InstructionUtils.checkNumFields(parts, getArity(voc));
    }
    CPOperand in1 = null, in2 = null, in3 = null, in4 = null, out = null;
    switch(voc) {
        case CreateVariable:
            // data type of the variable; matrices are double-valued, everything else string-valued
            DataType dt = DataType.valueOf(parts[4]);
            ValueType vt = dt == DataType.MATRIX ? ValueType.DOUBLE : ValueType.STRING;
            // frames with 13 or more parts carry one extra trailing part: the schema
            int extSchema = (dt == DataType.FRAME && parts.length >= 13) ? 1 : 0;
            // variable name
            in1 = new CPOperand(parts[1], vt, dt);
            // file name
            in2 = new CPOperand(parts[2], ValueType.STRING, DataType.SCALAR);
            // file name override flag (always literal)
            in3 = new CPOperand(parts[3], ValueType.BOOLEAN, DataType.SCALAR);
            // format
            String fmt = parts[5];
            if (fmt.equalsIgnoreCase("csv")) {
                // csv: 15 to 17 parts (plus the optional schema), depending on which csv
                // properties are attached (see the property parsing below)
                if (parts.length < 15 + extSchema || parts.length > 17 + extSchema)
                    throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
            } else {
                // non-csv: either the short form (6 parts) or the full form with matrix
                // characteristics and update type (12 parts, plus the optional schema)
                if (parts.length != 6 && parts.length != 12 + extSchema)
                    throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
            }
            OutputInfo oi = OutputInfo.stringToOutputInfo(fmt);
            InputInfo ii = OutputInfo.getMatchingInputInfo(oi);
            MatrixCharacteristics mc = new MatrixCharacteristics();
            if (parts.length == 6) {
                // short form: matrix characteristics stay at their defaults
            } else if (parts.length >= 11) {
                // matrix characteristics: rows, cols, block sizes, non-zeros
                mc.setDimension(Long.parseLong(parts[6]), Long.parseLong(parts[7]));
                mc.setBlockSize(Integer.parseInt(parts[8]), Integer.parseInt(parts[9]));
                mc.setNonZeros(Long.parseLong(parts[10]));
            } else {
                throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
            }
            MetaDataFormat iimd = new MetaDataFormat(mc, oi, ii);
            UpdateType updateType = UpdateType.COPY;
            if (parts.length >= 12)
                // Locale.ROOT keeps the enum lookup independent of the default locale
                // (e.g. the dotted capital I produced for 'i' under a Turkish locale)
                updateType = UpdateType.valueOf(parts[11].toUpperCase(java.util.Locale.ROOT));
            // handle frame schema (always the last part, if present)
            String schema = (dt == DataType.FRAME && parts.length >= 13) ? parts[parts.length - 1] : null;
            if (fmt.equalsIgnoreCase("csv")) {
                // Createvar instructions for CSV format come in two flavours:
                // 15 (+schema) parts: createvar corresponding to WRITE -- includes properties hasHeader, delim, and sparse
                // 16 or 17 (+schema) parts: createvar corresponding to READ -- includes properties hasHeader, delim, fill,
                //   fillValue, and optionally naStrings
                FileFormatProperties fmtProperties = null;
                if (parts.length == 15 + extSchema) {
                    boolean hasHeader = Boolean.parseBoolean(parts[12]);
                    String delim = parts[13];
                    boolean sparse = Boolean.parseBoolean(parts[14]);
                    fmtProperties = new CSVFileFormatProperties(hasHeader, delim, sparse);
                } else {
                    boolean hasHeader = Boolean.parseBoolean(parts[12]);
                    String delim = parts[13];
                    boolean fill = Boolean.parseBoolean(parts[14]);
                    double fillValue = UtilFunctions.parseToDouble(parts[15]);
                    String naStrings = null;
                    if (parts.length == 17 + extSchema)
                        naStrings = parts[16];
                    fmtProperties = new CSVFileFormatProperties(hasHeader, delim, fill, fillValue, naStrings);
                }
                return new VariableCPInstruction(VariableOperationCode.CreateVariable, in1, in2, in3, iimd, updateType, fmtProperties, schema, opcode, str);
            } else {
                return new VariableCPInstruction(VariableOperationCode.CreateVariable, in1, in2, in3, iimd, updateType, schema, opcode, str);
            }
        case AssignVariable:
            in1 = new CPOperand(parts[1]);
            in2 = new CPOperand(parts[2]);
            break;
        case CopyVariable:
            // Value types are not given here
            in1 = new CPOperand(parts[1], ValueType.UNKNOWN, DataType.UNKNOWN);
            in2 = new CPOperand(parts[2], ValueType.UNKNOWN, DataType.UNKNOWN);
            break;
        case MoveVariable:
            in1 = new CPOperand(parts[1], ValueType.UNKNOWN, DataType.UNKNOWN);
            in2 = new CPOperand(parts[2], ValueType.UNKNOWN, DataType.UNKNOWN);
            // optional third operand (present only in the four-part form)
            if (parts.length > 3)
                in3 = new CPOperand(parts[3], ValueType.UNKNOWN, DataType.UNKNOWN);
            break;
        case RemoveVariable:
            // every part after the opcode is added as an input of the instruction
            VariableCPInstruction rminst = new VariableCPInstruction(getVariableOperationCode(opcode), null, null, null, out, opcode, str);
            for (int i = 1; i < parts.length; i++) rminst.addInput(new CPOperand(parts[i], ValueType.UNKNOWN, DataType.SCALAR));
            return rminst;
        case RemoveVariableAndFile:
            in1 = new CPOperand(parts[1]);
            in2 = new CPOperand(parts[2]);
            // second argument must be a boolean
            if (in2.getValueType() != ValueType.BOOLEAN)
                throw new DMLRuntimeException("Unexpected value type for second argument in: " + str);
            break;
        case CastAsScalarVariable:
        case CastAsMatrixVariable:
        case CastAsFrameVariable:
        case CastAsDoubleVariable:
        case CastAsIntegerVariable:
        case CastAsBooleanVariable:
            // input operand
            in1 = new CPOperand(parts[1]);
            // output variable name
            out = new CPOperand(parts[2]);
            break;
        case Write:
            in1 = new CPOperand(parts[1]);
            in2 = new CPOperand(parts[2]);
            // format
            in3 = new CPOperand(parts[3]);
            FileFormatProperties fprops = null;
            if (in3.getName().equalsIgnoreCase("csv")) {
                // the csv form reads parts[4..7]; the generic guard above also lets the
                // five-part form through, so reject it here instead of failing on the index
                if (parts.length != 8)
                    throw new DMLRuntimeException("Invalid number of operands in write instruction: " + str);
                boolean hasHeader = Boolean.parseBoolean(parts[4]);
                String delim = parts[5];
                boolean sparse = Boolean.parseBoolean(parts[6]);
                fprops = new CSVFileFormatProperties(hasHeader, delim, sparse);
                // description
                in4 = new CPOperand(parts[7]);
            } else {
                fprops = new FileFormatProperties();
                // description
                in4 = new CPOperand(parts[4]);
            }
            VariableCPInstruction inst = new VariableCPInstruction(getVariableOperationCode(opcode), in1, in2, in3, out, null, fprops, null, null, opcode, str);
            inst.addInput(in4);
            return inst;
        case Read:
            in1 = new CPOperand(parts[1]);
            in2 = new CPOperand(parts[2]);
            out = null;
            break;
        case SetFileName:
            // variable name
            in1 = new CPOperand(parts[1]);
            // file name
            in2 = new CPOperand(parts[2], ValueType.UNKNOWN, DataType.UNKNOWN);
            // option: remote or local
            in3 = new CPOperand(parts[3], ValueType.UNKNOWN, DataType.UNKNOWN);
            break;
    }
    // all cases that did not return above share the generic six-argument constructor
    return new VariableCPInstruction(getVariableOperationCode(opcode), in1, in2, in3, out, opcode, str);
}
Also used : MetaDataFormat(org.apache.sysml.runtime.matrix.MetaDataFormat) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) ValueType(org.apache.sysml.parser.Expression.ValueType) UpdateType(org.apache.sysml.runtime.controlprogram.caching.MatrixObject.UpdateType) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) FileFormatProperties(org.apache.sysml.runtime.matrix.data.FileFormatProperties) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) DataType(org.apache.sysml.parser.Expression.DataType)

Example 22 with CSVFileFormatProperties

use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.

the class ReblockSPInstruction method processFrameReblockInstruction.

/**
 * Reblocks the frame bound to {@code input1} into the variable named by {@code output}.
 *
 * <p>Text-cell input is converted to binary blocks directly and registered as the output RDD
 * (with {@code input1} recorded as its lineage). CSV input is delegated to a
 * {@link CSVReblockSPInstruction} built on the fly from the frame's CSV properties, falling
 * back to no header, {@code ","} as delimiter, no fill and a fill value of 0 when the frame
 * carries no CSV properties.
 *
 * @param sec   the Spark execution context holding the input and output variables
 * @param iinfo the format of the input frame
 * @throws DMLRuntimeException if {@code iinfo} is neither text-cell nor CSV
 */
@SuppressWarnings("unchecked")
protected void processFrameReblockInstruction(SparkExecutionContext sec, InputInfo iinfo) {
    FrameObject fo = sec.getFrameObject(input1.getName());
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    if (iinfo == InputInfo.TextCellInputInfo) {
        // get the input textcell rdd
        JavaPairRDD<LongWritable, Text> lines = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForVariable(input1.getName(), iinfo);
        // convert textcell to binary block
        JavaPairRDD<Long, FrameBlock> out = FrameRDDConverterUtils.textCellToBinaryBlock(sec.getSparkContext(), lines, mcOut, fo.getSchema());
        // put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    } else if (iinfo == InputInfo.CSVInputInfo) {
        // HACK ALERT: until we introduce the rewrite to insert csvrblock for non-persistent
        // reads, CSV input is handled here by running a csv reblock instruction in place
        // (instead of rejecting CSVInputInfo as unsupported)
        boolean hasHeader = false;
        String delim = ",";
        boolean fill = false;
        double fillValue = 0;
        // instanceof is false for null, so no separate null check is needed
        if (fo.getFileFormatProperties() instanceof CSVFileFormatProperties) {
            CSVFileFormatProperties props = (CSVFileFormatProperties) fo.getFileFormatProperties();
            hasHeader = props.hasHeader();
            delim = props.getDelim();
            fill = props.isFill();
            fillValue = props.getFillValue();
        }
        CSVReblockSPInstruction csvInstruction = new CSVReblockSPInstruction(null, input1, output, mcOut.getRowsPerBlock(), mcOut.getColsPerBlock(), hasHeader, delim, fill, fillValue, "csvrblk", instString);
        csvInstruction.processInstruction(sec);
    } else {
        throw new DMLRuntimeException("The given InputInfo is not implemented " + "for ReblockSPInstruction: " + InputInfo.inputInfoToString(iinfo));
    }
}
Also used : CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) FrameObject(org.apache.sysml.runtime.controlprogram.caching.FrameObject) Text(org.apache.hadoop.io.Text) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) LongWritable(org.apache.hadoop.io.LongWritable)

Example 23 with CSVFileFormatProperties

use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.

the class WriteSPInstruction method processFrameWriteInstruction.

/**
 * Writes the frame bound to {@code input1} to {@code fname} in the requested output format
 * and afterwards writes the accompanying {@code .mtd} meta data file.
 *
 * <p>Text-cell and CSV output are produced as text lines and saved through
 * {@code customSaveTextFile}; binary-block output is saved as a Hadoop sequence file.
 *
 * @param sec    the Spark execution context holding the input variable
 * @param fname  the target file name
 * @param oi     the requested output format
 * @param schema the frame schema, passed on to the meta data file
 * @throws IOException propagated from the file-writing helpers
 * @throws DMLRuntimeException if {@code oi} is not text-cell, CSV or binary-block
 */
@SuppressWarnings("unchecked")
protected void processFrameWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi, ValueType[] schema) throws IOException {
    // input frame as binary blocks, plus its characteristics
    JavaPairRDD<Long, FrameBlock> frameBlocks = (JavaPairRDD<Long, FrameBlock>) sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
    MatrixCharacteristics frameDims = sec.getMatrixCharacteristics(input1.getName());

    boolean asTextCell = (oi == OutputInfo.TextCellOutputInfo);
    boolean asCsv = (oi == OutputInfo.CSVOutputInfo);
    if (asTextCell || asCsv) {
        // both textual formats end up as an RDD of lines saved the same way
        JavaRDD<String> textLines;
        if (asTextCell) {
            textLines = FrameRDDConverterUtils.binaryBlockToTextCell(frameBlocks, frameDims);
        } else {
            // casting a null reference yields null, so absent properties stay null
            CSVFileFormatProperties csvProps = (CSVFileFormatProperties) formatProperties;
            textLines = FrameRDDConverterUtils.binaryBlockToCsv(frameBlocks, frameDims, csvProps, true);
        }
        customSaveTextFile(textLines, fname, false);
    } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
        // re-key the blocks with LongWritable for the sequence file output
        JavaPairRDD<LongWritable, FrameBlock> writableBlocks = frameBlocks.mapToPair(new LongFrameToLongWritableFrameFunction());
        writableBlocks.saveAsHadoopFile(fname, LongWritable.class, FrameBlock.class, SequenceFileOutputFormat.class);
    } else {
        // unsupported formats: binarycell (not externalized)
        throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
    }

    // write meta data file
    MapReduceTool.writeMetaDataFile(fname + ".mtd", input1.getValueType(), schema, DataType.FRAME, frameDims, oi, formatProperties);
}
Also used : CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) MatrixCharacteristics(org.apache.sysml.runtime.matrix.MatrixCharacteristics) JavaRDD(org.apache.spark.api.java.JavaRDD) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) LongWritable(org.apache.hadoop.io.LongWritable) LongFrameToLongWritableFrameFunction(org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction)

Example 24 with CSVFileFormatProperties

use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.

the class FrameWriterTextCSV method writeCSVFrameToFile.

/**
 * Writes rows {@code rl} (inclusive) to {@code ru} of a frame block as CSV text to
 * {@code path}, overwriting an existing file.
 *
 * <p>When {@code rl == 0} the file starts with the optional header line (column names) and,
 * if the frame has non-default column meta data, two meta data lines (missing-value
 * replacements and number of distinct values per column).
 *
 * <p>The writer is closed explicitly once all rows are written so that a failure while
 * flushing the buffered tail of the file reaches the caller; the silent close in the
 * {@code finally} block only serves as cleanup when an earlier write already failed.
 *
 * @param path  the target file
 * @param job   the job configuration (not used by this method)
 * @param fs    the file system on which the file is created
 * @param src   the frame block to write
 * @param rl    the first row to write; 0 additionally triggers header/meta data output
 * @param ru    the upper row bound, passed to {@code FrameBlock.getStringRowIterator}
 * @param props the CSV properties; {@code null} selects the default properties
 * @throws IOException if creating, writing or closing the file fails
 */
protected static void writeCSVFrameToFile(Path path, JobConf job, FileSystem fs, FrameBlock src, int rl, int ru, CSVFileFormatProperties props) throws IOException {
    // create buffered writer
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
    int cols = src.getNumColumns();
    try {
        // for obj reuse and preventing repeated buffer re-allocations
        StringBuilder sb = new StringBuilder();
        props = (props == null) ? new CSVFileFormatProperties() : props;
        String delim = props.getDelim();
        // Write header line, if needed
        if (rl == 0) {
            // append column names if header requested
            if (props.hasHeader()) {
                // fetch the names once instead of once per column
                String[] colNames = src.getColumnNames();
                for (int j = 0; j < cols; j++) {
                    sb.append(colNames[j]);
                    if (j < cols - 1)
                        sb.append(delim);
                }
                sb.append('\n');
            }
            // append meta data
            if (!src.isColumnMetadataDefault()) {
                sb.append(TfUtils.TXMTD_MVPREFIX + delim);
                for (int j = 0; j < cols; j++) sb.append(src.getColumnMetadata(j).getMvValue() + ((j < cols - 1) ? delim : ""));
                sb.append("\n");
                sb.append(TfUtils.TXMTD_NDPREFIX + delim);
                for (int j = 0; j < cols; j++) sb.append(src.getColumnMetadata(j).getNumDistinct() + ((j < cols - 1) ? delim : ""));
                sb.append("\n");
            }
            br.write(sb.toString());
            sb.setLength(0);
        }
        // Write data lines
        Iterator<String[]> iter = src.getStringRowIterator(rl, ru);
        while (iter.hasNext()) {
            // write row chunk-wise to prevent OOM on large number of columns
            String[] row = iter.next();
            for (int bj = 0; bj < cols; bj += BLOCKSIZE_J) {
                for (int j = bj; j < Math.min(cols, bj + BLOCKSIZE_J); j++) {
                    // null cells are written as empty fields
                    if (row[j] != null)
                        sb.append(row[j]);
                    if (j != cols - 1)
                        sb.append(delim);
                }
                br.write(sb.toString());
                sb.setLength(0);
            }
            sb.append('\n');
            br.write(sb.toString());
            sb.setLength(0);
        }
        // close on the success path so that errors while flushing the remaining buffered
        // output are reported instead of being swallowed by the silent close below
        br.close();
    } finally {
        // cleanup for the failure path; closing an already closed writer has no effect
        IOUtilFunctions.closeSilently(br);
    }
}
Also used : CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) OutputStreamWriter(java.io.OutputStreamWriter) BufferedWriter(java.io.BufferedWriter)

Example 25 with CSVFileFormatProperties

use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.

the class TransformReadMetaTest method runTransformReadMetaTest.

/**
 * Runs one transform-read-meta scenario: writes a generated two-column matrix as CSV, runs
 * the DML test script on it, and compares the meta data frame written by the script with the
 * one read back through {@code TfMetaUtils.readTransformMetaDataFromFile}.
 *
 * <p>The global runtime platform and the local Spark configuration flag are changed for the
 * duration of the run and restored afterwards.
 *
 * @param rt    the runtime platform to run the script on
 * @param ofmt  the output format of the meta data frame, also passed to the script
 * @param delim the CSV delimiter; {@code ","} selects the first test script, any other value
 *              the second
 * @throws IOException wraps any exception raised while preparing, running or checking the test
 * @throws DMLRuntimeException declared by the signature
 */
private void runTransformReadMetaTest(RUNTIME_PLATFORM rt, String ofmt, String delim) throws IOException, DMLRuntimeException {
    // switch the global settings, remembering the previous values
    RUNTIME_PLATFORM previousPlatform = rtplatform;
    rtplatform = rt;
    boolean previousSparkConfig = DMLScript.USE_LOCAL_SPARK_CONFIG;
    boolean usesSpark = (rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK);
    if (usesSpark) {
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    }

    try {
        String testname;
        if (delim.equals(",")) {
            testname = TEST_NAME1;
        } else {
            testname = TEST_NAME2;
        }
        getAndLoadTestConfiguration(testname);

        // generate input data: two sequences appended to each other
        MatrixBlock firstSeq = MatrixBlock.seqOperations(0.5, rows / 2, 0.5);
        MatrixBlock secondSeq = MatrixBlock.seqOperations(0.5, rows / 2, 0.5);
        double[][] X = DataConverter.convertToDoubleMatrix(firstSeq.appendOperations(secondSeq, new MatrixBlock()));
        MatrixBlock inputBlock = DataConverter.convertToMatrixBlock(X);
        CSVFileFormatProperties csvProps = new CSVFileFormatProperties(false, delim, false);
        MatrixWriter csvWriter = MatrixWriterFactory.createMatrixWriter(OutputInfo.CSVOutputInfo, 1, csvProps);
        csvWriter.writeMatrixToHDFS(inputBlock, input("X"), rows, 2, -1, -1, -1);

        // read the transform spec and set up the script invocation
        String specX = MapReduceTool.readStringFromHDFSFile(SCRIPT_DIR + TEST_DIR + SPEC_X);
        fullDMLScriptName = SCRIPT_DIR + TEST_DIR + testname + ".dml";
        programArgs = new String[] { "-args", input("X"), specX, output("M1"), output("M"), ofmt, delim };

        // run test
        runTest(true, false, null, -1);

        // compare meta data frames cell by cell as strings
        InputInfo metaFormat = InputInfo.stringExternalToInputInfo(ofmt);
        FrameReader metaReader = FrameReaderFactory.createFrameReader(metaFormat);
        FrameBlock mExpected = TfMetaUtils.readTransformMetaDataFromFile(specX, output("M1"), delim);
        FrameBlock mRet = metaReader.readFrameFromHDFS(output("M"), rows, 2);
        for (int r = 0; r < rows; r++) {
            for (int c = 0; c < 2; c++) {
                String failureMessage = "Wrong result: " + mRet.get(r, c) + ".";
                boolean cellsMatch = UtilFunctions.compareTo(ValueType.STRING, mExpected.get(r, c), mRet.get(r, c)) == 0;
                Assert.assertTrue(failureMessage, cellsMatch);
            }
        }
    } catch (Exception ex) {
        throw new IOException(ex);
    } finally {
        // restore the global settings
        rtplatform = previousPlatform;
        DMLScript.USE_LOCAL_SPARK_CONFIG = previousSparkConfig;
    }
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) CSVFileFormatProperties(org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) RUNTIME_PLATFORM(org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM) InputInfo(org.apache.sysml.runtime.matrix.data.InputInfo) FrameBlock(org.apache.sysml.runtime.matrix.data.FrameBlock) FrameReader(org.apache.sysml.runtime.io.FrameReader) MatrixWriter(org.apache.sysml.runtime.io.MatrixWriter)

Aggregations

CSVFileFormatProperties (org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties)25 FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock)11 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)10 RUNTIME_PLATFORM (org.apache.sysml.api.DMLScript.RUNTIME_PLATFORM)9 FrameReader (org.apache.sysml.runtime.io.FrameReader)8 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)7 MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics)6 LongWritable (org.apache.hadoop.io.LongWritable)4 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)4 InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo)4 IOException (java.io.IOException)3 Text (org.apache.hadoop.io.Text)3 JavaRDD (org.apache.spark.api.java.JavaRDD)3 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)3 BufferedWriter (java.io.BufferedWriter)2 OutputStreamWriter (java.io.OutputStreamWriter)2 ArrayList (java.util.ArrayList)2 ValueType (org.apache.sysml.parser.Expression.ValueType)2 LongFrameToLongWritableFrameFunction (org.apache.sysml.runtime.instructions.spark.utils.FrameRDDConverterUtils.LongFrameToLongWritableFrameFunction)2 ReaderBinaryBlock (org.apache.sysml.runtime.io.ReaderBinaryBlock)2