Use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.
The class VariableCPInstruction, method parseInstruction.
public static VariableCPInstruction parseInstruction(String str) {
    String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
    String opcode = parts[0];
    VariableOperationCode voc = getVariableOperationCode(opcode);
    if (voc == VariableOperationCode.CreateVariable) {
        if (parts.length < 5) // && parts.length != 10
            throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
    } else if (voc == VariableOperationCode.MoveVariable) {
        // mvvar tempA A; or mvvar mvar5 "data/out.mtx" "binary"
        if (parts.length != 3 && parts.length != 4)
            throw new DMLRuntimeException("Invalid number of operands in mvvar instruction: " + str);
    } else if (voc == VariableOperationCode.Write) {
        // Write instructions for csv files also include three additional parameters (hasHeader, delimiter, sparse)
        if (parts.length != 5 && parts.length != 8)
            throw new DMLRuntimeException("Invalid number of operands in write instruction: " + str);
    } else if (voc != VariableOperationCode.RemoveVariable) {
        // no output
        InstructionUtils.checkNumFields(parts, getArity(voc));
    }
    CPOperand in1 = null, in2 = null, in3 = null, in4 = null, out = null;
    switch (voc) {
        case CreateVariable:
            DataType dt = DataType.valueOf(parts[4]);
            ValueType vt = dt == DataType.MATRIX ? ValueType.DOUBLE : ValueType.STRING;
            int extSchema = (dt == DataType.FRAME && parts.length >= 13) ? 1 : 0;
            // variable name
            in1 = new CPOperand(parts[1], vt, dt);
            // file name
            in2 = new CPOperand(parts[2], ValueType.STRING, DataType.SCALAR);
            // file name override flag (always literal)
            in3 = new CPOperand(parts[3], ValueType.BOOLEAN, DataType.SCALAR);
            // format
            String fmt = parts[5];
            if (fmt.equalsIgnoreCase("csv")) {
                // Createvar instructions for CSV format have either 13 or 14 inputs:
                // 13 inputs: createvar corresponding to WRITE -- includes properties hasHeader, delim, and sparse
                // 14 inputs: createvar corresponding to READ -- includes properties hasHeader, delim, fill, and fillValue
                if (parts.length < 15 + extSchema || parts.length > 17 + extSchema)
                    throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
            } else {
                if (parts.length != 6 && parts.length != 12 + extSchema)
                    throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
            }
            OutputInfo oi = OutputInfo.stringToOutputInfo(fmt);
            InputInfo ii = OutputInfo.getMatchingInputInfo(oi);
            MatrixCharacteristics mc = new MatrixCharacteristics();
            if (parts.length == 6) {
                // do nothing
            } else if (parts.length >= 11) {
                // matrix characteristics
                mc.setDimension(Long.parseLong(parts[6]), Long.parseLong(parts[7]));
                mc.setBlockSize(Integer.parseInt(parts[8]), Integer.parseInt(parts[9]));
                mc.setNonZeros(Long.parseLong(parts[10]));
            } else {
                throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
            }
            MetaDataFormat iimd = new MetaDataFormat(mc, oi, ii);
            UpdateType updateType = UpdateType.COPY;
            if (parts.length >= 12)
                updateType = UpdateType.valueOf(parts[11].toUpperCase());
            // handle frame schema
            String schema = (dt == DataType.FRAME && parts.length >= 13) ? parts[parts.length - 1] : null;
            if (fmt.equalsIgnoreCase("csv")) {
                FileFormatProperties fmtProperties = null;
                if (parts.length == 15 + extSchema) {
                    // WRITE variant: hasHeader, delim, sparse
                    boolean hasHeader = Boolean.parseBoolean(parts[12]);
                    String delim = parts[13];
                    boolean sparse = Boolean.parseBoolean(parts[14]);
                    fmtProperties = new CSVFileFormatProperties(hasHeader, delim, sparse);
                } else {
                    // READ variant: hasHeader, delim, fill, fillValue, and optional naStrings
                    boolean hasHeader = Boolean.parseBoolean(parts[12]);
                    String delim = parts[13];
                    boolean fill = Boolean.parseBoolean(parts[14]);
                    double fillValue = UtilFunctions.parseToDouble(parts[15]);
                    String naStrings = null;
                    if (parts.length == 17 + extSchema)
                        naStrings = parts[16];
                    fmtProperties = new CSVFileFormatProperties(hasHeader, delim, fill, fillValue, naStrings);
                }
                return new VariableCPInstruction(VariableOperationCode.CreateVariable, in1, in2, in3, iimd, updateType, fmtProperties, schema, opcode, str);
            } else {
                return new VariableCPInstruction(VariableOperationCode.CreateVariable, in1, in2, in3, iimd, updateType, schema, opcode, str);
            }
        case AssignVariable:
            in1 = new CPOperand(parts[1]);
            in2 = new CPOperand(parts[2]);
            break;
        case CopyVariable:
            // Value types are not given here
            in1 = new CPOperand(parts[1], ValueType.UNKNOWN, DataType.UNKNOWN);
            in2 = new CPOperand(parts[2], ValueType.UNKNOWN, DataType.UNKNOWN);
            break;
        case MoveVariable:
            in1 = new CPOperand(parts[1], ValueType.UNKNOWN, DataType.UNKNOWN);
            in2 = new CPOperand(parts[2], ValueType.UNKNOWN, DataType.UNKNOWN);
            if (parts.length > 3)
                in3 = new CPOperand(parts[3], ValueType.UNKNOWN, DataType.UNKNOWN);
            break;
        case RemoveVariable:
            VariableCPInstruction rminst = new VariableCPInstruction(getVariableOperationCode(opcode), null, null, null, out, opcode, str);
            for (int i = 1; i < parts.length; i++)
                rminst.addInput(new CPOperand(parts[i], ValueType.UNKNOWN, DataType.SCALAR));
            return rminst;
        case RemoveVariableAndFile:
            in1 = new CPOperand(parts[1]);
            in2 = new CPOperand(parts[2]);
            // second argument must be a boolean
            if (in2.getValueType() != ValueType.BOOLEAN)
                throw new DMLRuntimeException("Unexpected value type for second argument in: " + str);
            break;
        case CastAsScalarVariable:
        case CastAsMatrixVariable:
        case CastAsFrameVariable:
        case CastAsDoubleVariable:
        case CastAsIntegerVariable:
        case CastAsBooleanVariable:
            // first operand is a variable name => string value type
            in1 = new CPOperand(parts[1]);
            // output variable name
            out = new CPOperand(parts[2]);
            break;
        case Write:
            in1 = new CPOperand(parts[1]);
            in2 = new CPOperand(parts[2]);
            in3 = new CPOperand(parts[3]);
            FileFormatProperties fprops = null;
            if (in3.getName().equalsIgnoreCase("csv")) {
                boolean hasHeader = Boolean.parseBoolean(parts[4]);
                String delim = parts[5];
                boolean sparse = Boolean.parseBoolean(parts[6]);
                fprops = new CSVFileFormatProperties(hasHeader, delim, sparse);
                // description
                in4 = new CPOperand(parts[7]);
            } else {
                fprops = new FileFormatProperties();
                // description
                in4 = new CPOperand(parts[4]);
            }
            VariableCPInstruction inst = new VariableCPInstruction(getVariableOperationCode(opcode), in1, in2, in3, out, null, fprops, null, null, opcode, str);
            inst.addInput(in4);
            return inst;
        case Read:
            in1 = new CPOperand(parts[1]);
            in2 = new CPOperand(parts[2]);
            out = null;
            break;
        case SetFileName:
            // variable name
            in1 = new CPOperand(parts[1]);
            // file name
            in2 = new CPOperand(parts[2], ValueType.UNKNOWN, DataType.UNKNOWN);
            // option: remote or local
            in3 = new CPOperand(parts[3], ValueType.UNKNOWN, DataType.UNKNOWN);
            // return new VariableCPInstruction(getVariableOperationCode(opcode), in1, in2, in3, str);
            break;
    }
    return new VariableCPInstruction(getVariableOperationCode(opcode), in1, in2, in3, out, opcode, str);
}
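For orientation, the positional layout that the CreateVariable branch expects can be read off the checks above. The following minimal sketch (not from the project) assembles a READ-style csv createvar string and hands it to parseInstruction; the "CP" execution-type prefix and the Lop.OPERAND_DELIMITOR separator are assumptions about how SystemML encodes instruction strings, not something shown in this snippet.

String d = Lop.OPERAND_DELIMITOR;
String instStr = "CP" + d + "createvar"
    + d + "pREADX"             // parts[1]: variable name
    + d + "data/X.csv"         // parts[2]: file name
    + d + "false"              // parts[3]: file name override flag
    + d + "MATRIX"             // parts[4]: data type
    + d + "csv"                // parts[5]: format
    + d + "100" + d + "10"     // parts[6..7]: rows, columns
    + d + "1000" + d + "1000"  // parts[8..9]: block sizes
    + d + "-1"                 // parts[10]: non-zeros (unknown)
    + d + "copy"               // parts[11]: update type
    + d + "false"              // parts[12]: hasHeader
    + d + ","                  // parts[13]: delim
    + d + "true" + d + "0.0";  // parts[14..15]: fill, fillValue
// 16 operands after the exec type -> READ variant:
// CSVFileFormatProperties(hasHeader, delim, fill, fillValue, null)
VariableCPInstruction cvar = VariableCPInstruction.parseInstruction(instStr);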
Use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.
The class ReblockSPInstruction, method processFrameReblockInstruction.
@SuppressWarnings("unchecked")
protected void processFrameReblockInstruction(SparkExecutionContext sec, InputInfo iinfo) {
    FrameObject fo = sec.getFrameObject(input1.getName());
    MatrixCharacteristics mcOut = sec.getMatrixCharacteristics(output.getName());
    if (iinfo == InputInfo.TextCellInputInfo) {
        // get the input textcell rdd
        JavaPairRDD<LongWritable, Text> lines = (JavaPairRDD<LongWritable, Text>) sec.getRDDHandleForVariable(input1.getName(), iinfo);
        // convert textcell to binary block
        JavaPairRDD<Long, FrameBlock> out = FrameRDDConverterUtils.textCellToBinaryBlock(sec.getSparkContext(), lines, mcOut, fo.getSchema());
        // put output RDD handle into symbol table
        sec.setRDDHandleForVariable(output.getName(), out);
        sec.addLineageRDD(output.getName(), input1.getName());
    } else if (iinfo == InputInfo.CSVInputInfo) {
        // HACK ALERT: until we introduce the rewrite to insert csvrblock for non-persistent reads
        // throw new DMLRuntimeException("CSVInputInfo is not supported for ReblockSPInstruction");
        CSVReblockSPInstruction csvInstruction = null;
        boolean hasHeader = false;
        String delim = ",";
        boolean fill = false;
        double fillValue = 0;
        if (fo.getFileFormatProperties() instanceof CSVFileFormatProperties) {
            CSVFileFormatProperties props = (CSVFileFormatProperties) fo.getFileFormatProperties();
            hasHeader = props.hasHeader();
            delim = props.getDelim();
            fill = props.isFill();
            fillValue = props.getFillValue();
        }
        csvInstruction = new CSVReblockSPInstruction(null, input1, output, mcOut.getRowsPerBlock(), mcOut.getColsPerBlock(), hasHeader, delim, fill, fillValue, "csvrblk", instString);
        csvInstruction.processInstruction(sec);
    } else {
        throw new DMLRuntimeException("The given InputInfo is not implemented for ReblockSPInstruction: " + InputInfo.inputInfoToString(iinfo));
    }
}
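The fallback logic in the CSV branch (take the FrameObject's CSV properties when present, otherwise the instruction defaults) is a pattern worth isolating. Below is a hypothetical helper, not part of the project, that uses only the constructors and getters appearing in these snippets; note that instanceof already implies non-null, so no separate null check is needed.

private static CSVFileFormatProperties csvPropsOrDefaults(FrameObject fo) {
    if (fo.getFileFormatProperties() instanceof CSVFileFormatProperties)
        return (CSVFileFormatProperties) fo.getFileFormatProperties();
    // same defaults as the reblock path: no header, comma delimiter, no fill
    return new CSVFileFormatProperties(false, ",", false, 0, null);
}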
Use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.
The class WriteSPInstruction, method processFrameWriteInstruction.
@SuppressWarnings("unchecked")
protected void processFrameWriteInstruction(SparkExecutionContext sec, String fname, OutputInfo oi, ValueType[] schema) throws IOException {
    // get input rdd
    JavaPairRDD<Long, FrameBlock> in1 = (JavaPairRDD<Long, FrameBlock>) sec.getRDDHandleForVariable(input1.getName(), InputInfo.BinaryBlockInputInfo);
    MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
    if (oi == OutputInfo.TextCellOutputInfo) {
        JavaRDD<String> out = FrameRDDConverterUtils.binaryBlockToTextCell(in1, mc);
        customSaveTextFile(out, fname, false);
    } else if (oi == OutputInfo.CSVOutputInfo) {
        CSVFileFormatProperties props = (formatProperties != null) ? (CSVFileFormatProperties) formatProperties : null;
        JavaRDD<String> out = FrameRDDConverterUtils.binaryBlockToCsv(in1, mc, props, true);
        customSaveTextFile(out, fname, false);
    } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
        JavaPairRDD<LongWritable, FrameBlock> out = in1.mapToPair(new LongFrameToLongWritableFrameFunction());
        out.saveAsHadoopFile(fname, LongWritable.class, FrameBlock.class, SequenceFileOutputFormat.class);
    } else {
        // unsupported formats: binarycell (not externalized)
        throw new DMLRuntimeException("Unexpected data format: " + OutputInfo.outputInfoToString(oi));
    }
    // write meta data file
    MapReduceTool.writeMetaDataFile(fname + ".mtd", input1.getValueType(), schema, DataType.FRAME, mc, oi, formatProperties);
}
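Note the two distinct CSVFileFormatProperties shapes flowing through these paths: the write-side triple (hasHeader, delim, sparse) seen in the Write instruction, and the read-side form (hasHeader, delim, fill, fillValue, naStrings) seen in createvar. A quick sketch of both, using only constructors that appear above; the values are illustrative:

// write-side properties: header on, semicolon-delimited, dense output
CSVFileFormatProperties writeProps = new CSVFileFormatProperties(true, ";", false);
// read-side properties: fill missing values with 0.0, no NA strings
CSVFileFormatProperties readProps = new CSVFileFormatProperties(true, ";", true, 0.0, null);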
Use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.
The class FrameWriterTextCSV, method writeCSVFrameToFile.
protected static void writeCSVFrameToFile(Path path, JobConf job, FileSystem fs, FrameBlock src, int rl, int ru, CSVFileFormatProperties props) throws IOException {
    // create buffered writer
    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(path, true)));
    int cols = src.getNumColumns();
    try {
        // for obj reuse and preventing repeated buffer re-allocations
        StringBuilder sb = new StringBuilder();
        props = (props == null) ? new CSVFileFormatProperties() : props;
        String delim = props.getDelim();
        // Write header line, if needed
        if (rl == 0) {
            // append column names if header requested
            if (props.hasHeader()) {
                for (int j = 0; j < cols; j++) {
                    sb.append(src.getColumnNames()[j]);
                    if (j < cols - 1)
                        sb.append(delim);
                }
                sb.append('\n');
            }
            // append meta data
            if (!src.isColumnMetadataDefault()) {
                sb.append(TfUtils.TXMTD_MVPREFIX + delim);
                for (int j = 0; j < cols; j++)
                    sb.append(src.getColumnMetadata(j).getMvValue() + ((j < cols - 1) ? delim : ""));
                sb.append("\n");
                sb.append(TfUtils.TXMTD_NDPREFIX + delim);
                for (int j = 0; j < cols; j++)
                    sb.append(src.getColumnMetadata(j).getNumDistinct() + ((j < cols - 1) ? delim : ""));
                sb.append("\n");
            }
            br.write(sb.toString());
            sb.setLength(0);
        }
        // Write data lines
        Iterator<String[]> iter = src.getStringRowIterator(rl, ru);
        while (iter.hasNext()) {
            // write row chunk-wise to prevent OOM on a large number of columns
            String[] row = iter.next();
            for (int bj = 0; bj < cols; bj += BLOCKSIZE_J) {
                for (int j = bj; j < Math.min(cols, bj + BLOCKSIZE_J); j++) {
                    if (row[j] != null)
                        sb.append(row[j]);
                    if (j != cols - 1)
                        sb.append(delim);
                }
                br.write(sb.toString());
                sb.setLength(0);
            }
            sb.append('\n');
            br.write(sb.toString());
            sb.setLength(0);
        }
    } finally {
        IOUtilFunctions.closeSilently(br);
    }
}
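The chunk-wise write in the inner loop is the interesting part: the StringBuilder is flushed every BLOCKSIZE_J columns, so a very wide row never materializes as a single giant string. A stand-alone sketch of the same pattern follows; class, method, and parameter names are illustrative, not from the project.

import java.io.IOException;
import java.io.Writer;

final class ChunkedCsvWrite {
    // Flush every 'blocksize' columns so wide rows never build one huge string.
    static void writeRowChunked(Writer w, String[] row, String delim, int blocksize) throws IOException {
        StringBuilder sb = new StringBuilder();
        for (int bj = 0; bj < row.length; bj += blocksize) {
            for (int j = bj; j < Math.min(row.length, bj + blocksize); j++) {
                if (row[j] != null)
                    sb.append(row[j]);
                if (j != row.length - 1)
                    sb.append(delim);
            }
            w.write(sb.toString()); // write the chunk, keep builder capacity
            sb.setLength(0);
        }
        w.write("\n"); // row terminator
    }
}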
Use of org.apache.sysml.runtime.matrix.data.CSVFileFormatProperties in project incubator-systemml by apache.
The class TransformReadMetaTest, method runTransformReadMetaTest.
/**
 * @param rt runtime platform to run the test on
 * @param ofmt output format of the transform meta data
 * @param delim delimiter of the generated CSV input
 * @throws IOException
 * @throws DMLRuntimeException
 */
private void runTransformReadMetaTest(RUNTIME_PLATFORM rt, String ofmt, String delim) throws IOException, DMLRuntimeException {
    RUNTIME_PLATFORM platformOld = rtplatform;
    rtplatform = rt;
    boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
    if (rtplatform == RUNTIME_PLATFORM.SPARK || rtplatform == RUNTIME_PLATFORM.HYBRID_SPARK)
        DMLScript.USE_LOCAL_SPARK_CONFIG = true;
    try {
        String testname = delim.equals(",") ? TEST_NAME1 : TEST_NAME2;
        getAndLoadTestConfiguration(testname);
        // generate input data
        double[][] X = DataConverter.convertToDoubleMatrix(MatrixBlock.seqOperations(0.5, rows / 2, 0.5).appendOperations(MatrixBlock.seqOperations(0.5, rows / 2, 0.5), new MatrixBlock()));
        MatrixBlock mbX = DataConverter.convertToMatrixBlock(X);
        CSVFileFormatProperties fprops = new CSVFileFormatProperties(false, delim, false);
        MatrixWriter writer = MatrixWriterFactory.createMatrixWriter(OutputInfo.CSVOutputInfo, 1, fprops);
        writer.writeMatrixToHDFS(mbX, input("X"), rows, 2, -1, -1, -1);
        // read transform spec for X
        String specX = MapReduceTool.readStringFromHDFSFile(SCRIPT_DIR + TEST_DIR + SPEC_X);
        fullDMLScriptName = SCRIPT_DIR + TEST_DIR + testname + ".dml";
        programArgs = new String[] { "-args", input("X"), specX, output("M1"), output("M"), ofmt, delim };
        // run test
        runTest(true, false, null, -1);
        // compare meta data frames
        InputInfo iinfo = InputInfo.stringExternalToInputInfo(ofmt);
        FrameReader reader = FrameReaderFactory.createFrameReader(iinfo);
        FrameBlock mExpected = TfMetaUtils.readTransformMetaDataFromFile(specX, output("M1"), delim);
        FrameBlock mRet = reader.readFrameFromHDFS(output("M"), rows, 2);
        for (int i = 0; i < rows; i++)
            for (int j = 0; j < 2; j++)
                Assert.assertTrue("Wrong result: " + mRet.get(i, j) + ".", UtilFunctions.compareTo(ValueType.STRING, mExpected.get(i, j), mRet.get(i, j)) == 0);
    } catch (Exception ex) {
        throw new IOException(ex);
    } finally {
        rtplatform = platformOld;
        DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
    }
}
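Distilled from the test above, writing a MatrixBlock as CSV with custom properties takes three calls; in this sketch 'mb' and the output path are placeholders, while the API calls are exactly those used in the test:

// no header, pipe-delimited, dense output
CSVFileFormatProperties fprops = new CSVFileFormatProperties(false, "|", false);
MatrixWriter writer = MatrixWriterFactory.createMatrixWriter(OutputInfo.CSVOutputInfo, 1, fprops);
writer.writeMatrixToHDFS(mb, "hdfs:/tmp/X.csv", mb.getNumRows(), mb.getNumColumns(), -1, -1, -1);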