use of org.apache.sysml.runtime.io.MatrixWriter in project incubator-systemml by apache.
the class RunMRJobs method submitJob.
/**
* Submits an MR job instruction, without modifying any state of that instruction.
*
* @param inst instruction
* @return job status
*/
public static JobReturn submitJob(MRJobInstruction inst) {
    JobReturn ret = new JobReturn();
    MatrixObject[] inputMatrices = inst.getInputMatrices();
    MatrixObject[] outputMatrices = inst.getOutputMatrices();
    boolean execCP = false;
    // spawn MapReduce jobs
    try {
        // replace all placeholders in all instructions with appropriate values
        String rdInst = inst.getIv_randInstructions();
        String rrInst = inst.getIv_recordReaderInstructions();
        String mapInst = inst.getIv_instructionsInMapper();
        String shuffleInst = inst.getIv_shuffleInstructions();
        String aggInst = inst.getIv_aggInstructions();
        String otherInst = inst.getIv_otherInstructions();
        boolean jvmReuse = ConfigurationManager.getDMLConfig().getBooleanValue(DMLConfig.JVM_REUSE);
        switch (inst.getJobType()) {
            case GMR:
            case GMRCELL:
                ret = GMR.runJob(inst, inst.getInputs(), inst.getInputInfos(), inst.getRlens(), inst.getClens(), inst.getBrlens(), inst.getBclens(), inst.getPartitioned(), inst.getPformats(), inst.getPsizes(), rrInst, mapInst, aggInst, otherInst, inst.getIv_numReducers(), inst.getIv_replication(), jvmReuse, inst.getIv_resultIndices(), inst.getDimsUnknownFilePrefix(), inst.getOutputs(), inst.getOutputInfos());
                break;
            case DATAGEN:
                if (ConfigurationManager.isDynamicRecompilation() && OptimizerUtils.ALLOW_RAND_JOB_RECOMPILE && DMLScript.rtplatform != RUNTIME_PLATFORM.HADOOP && Recompiler.checkCPDataGen(inst, rdInst)) {
                    ret = executeInMemoryDataGenOperations(inst, rdInst, outputMatrices);
                    Statistics.decrementNoOfExecutedMRJobs();
                    execCP = true;
                } else {
                    ret = DataGenMR.runJob(inst, rdInst.split(Lop.INSTRUCTION_DELIMITOR), mapInst, aggInst, otherInst, inst.getIv_numReducers(), inst.getIv_replication(), inst.getIv_resultIndices(), inst.getDimsUnknownFilePrefix(), inst.getOutputs(), inst.getOutputInfos());
                }
                break;
            case CM_COV:
                ret = CMCOVMR.runJob(inst, inst.getInputs(), inst.getInputInfos(), inst.getRlens(), inst.getClens(), inst.getBrlens(), inst.getBclens(), mapInst, shuffleInst, inst.getIv_numReducers(), inst.getIv_replication(), inst.getIv_resultIndices(), inst.getOutputs(), inst.getOutputInfos());
                break;
            case GROUPED_AGG:
                ret = GroupedAggMR.runJob(inst, inst.getInputs(), inst.getInputInfos(), inst.getRlens(), inst.getClens(), inst.getBrlens(), inst.getBclens(), shuffleInst, otherInst, inst.getIv_numReducers(), inst.getIv_replication(), inst.getIv_resultIndices(), inst.getDimsUnknownFilePrefix(), inst.getOutputs(), inst.getOutputInfos());
                break;
            case REBLOCK:
            case CSV_REBLOCK:
                if (ConfigurationManager.isDynamicRecompilation() && DMLScript.rtplatform != RUNTIME_PLATFORM.HADOOP && Recompiler.checkCPReblock(inst, inputMatrices)) {
                    ret = executeInMemoryReblockOperations(inst, shuffleInst, inputMatrices, outputMatrices);
                    Statistics.decrementNoOfExecutedMRJobs();
                    execCP = true;
                } else {
                    // export dirty matrices to HDFS (initially deferred)
                    for (MatrixObject m : inputMatrices) {
                        if (m.isDirty())
                            m.exportData();
                    }
                    checkEmptyInputs(inst, inputMatrices);
                    if (inst.getJobType() == JobType.REBLOCK) {
                        ret = ReblockMR.runJob(inst, inst.getInputs(), inst.getInputInfos(), inst.getRlens(), inst.getClens(), inst.getBrlens(), inst.getBclens(), getNNZ(inputMatrices), mapInst, shuffleInst, otherInst, inst.getIv_numReducers(), inst.getIv_replication(), jvmReuse, inst.getIv_resultIndices(), inst.getOutputs(), inst.getOutputInfos());
                    } else if (inst.getJobType() == JobType.CSV_REBLOCK) {
                        ret = CSVReblockMR.runJob(inst, inst.getInputs(), inst.getInputInfos(), inst.getRlens(), inst.getClens(), inst.getBrlens(), inst.getBclens(), shuffleInst, otherInst, inst.getIv_numReducers(), inst.getIv_replication(), inst.getIv_resultIndices(), inst.getOutputs(), inst.getOutputInfos());
                    }
                }
                break;
            case CSV_WRITE:
                ret = WriteCSVMR.runJob(inst, inst.getInputs(), inst.getInputInfos(), inst.getRlens(), inst.getClens(), inst.getBrlens(), inst.getBclens(), shuffleInst, inst.getIv_numReducers(), inst.getIv_replication(), inst.getIv_resultIndices(), inst.getOutputs());
                break;
            case MMCJ:
                ret = MMCJMR.runJob(inst, inst.getInputs(), inst.getInputInfos(), inst.getRlens(), inst.getClens(), inst.getBrlens(), inst.getBclens(), mapInst, aggInst, shuffleInst, inst.getIv_numReducers(), inst.getIv_replication(), inst.getOutputs()[0], inst.getOutputInfos()[0]);
                break;
            case MMRJ:
                ret = MMRJMR.runJob(inst, inst.getInputs(), inst.getInputInfos(), inst.getRlens(), inst.getClens(), inst.getBrlens(), inst.getBclens(), mapInst, aggInst, shuffleInst, otherInst, inst.getIv_numReducers(), inst.getIv_replication(), inst.getIv_resultIndices(), inst.getOutputs(), inst.getOutputInfos());
                break;
            case SORT:
                boolean weightsflag = true;
                if (!mapInst.equalsIgnoreCase(""))
                    weightsflag = false;
                ret = SortMR.runJob(inst, inst.getInputs()[0], inst.getInputInfos()[0], inst.getRlens()[0], inst.getClens()[0], inst.getBrlens()[0], inst.getBclens()[0], mapInst, shuffleInst, inst.getIv_numReducers(), inst.getIv_replication(), inst.getOutputs()[0], inst.getOutputInfos()[0], weightsflag);
                break;
            case COMBINE:
                ret = CombineMR.runJob(inst, inst.getInputs(), inst.getInputInfos(), inst.getRlens(), inst.getClens(), inst.getBrlens(), inst.getBclens(), shuffleInst, inst.getIv_numReducers(), inst.getIv_replication(), inst.getIv_resultIndices(), inst.getOutputs(), inst.getOutputInfos());
                break;
            case DATA_PARTITION:
                ret = DataPartitionMR.runJob(inst, inputMatrices, shuffleInst, inst.getIv_resultIndices(), outputMatrices, inst.getIv_numReducers(), inst.getIv_replication());
                break;
            default:
                throw new DMLRuntimeException("Invalid jobtype: " + inst.getJobType());
        }
    } // end of try block
    catch (Exception e) {
        throw new DMLRuntimeException(e);
    }
    if (ret.checkReturnStatus()) {
        /*
         * Check if any output is empty. If yes, create a dummy file. Needs
         * to be done only in case of (1) CellOutputInfo and if not CP, or
         * (2) BinaryBlockOutputInfo if not CP and output empty blocks disabled.
         */
        try {
            if (!execCP) {
                for (int i = 0; i < outputMatrices.length; i++) {
                    // get output meta data
                    MetaDataFormat meta = (MetaDataFormat) outputMatrices[i].getMetaData();
                    MatrixCharacteristics mc = meta.getMatrixCharacteristics();
                    OutputInfo outinfo = meta.getOutputInfo();
                    String fname = outputMatrices[i].getFileName();
                    if (MapReduceTool.isHDFSFileEmpty(fname)) {
                        // prepare output file
                        Path filepath = new Path(fname, "0-m-00000");
                        MatrixWriter writer = MatrixWriterFactory.createMatrixWriter(outinfo);
                        writer.writeEmptyMatrixToHDFS(filepath.toString(), mc.getRows(), mc.getCols(), mc.getRowsPerBlock(), mc.getColsPerBlock());
                    }
                    outputMatrices[i].setHDFSFileExists(true);
                    if (inst.getJobType() != JobType.CSV_WRITE) {
                        // write out metadata file
                        // Currently, valueType information is not stored in the MR instruction,
                        // since only DOUBLE matrices are supported ==> value type hard-coded for now
                        MapReduceTool.writeMetaDataFile(fname + ".mtd", ValueType.DOUBLE, ret.getMetaData(i).getMatrixCharacteristics(), outinfo);
                    }
                }
            }
            return ret;
        } catch (IOException e) {
            throw new DMLRuntimeException(e);
        }
    }
    // should not come here!
    throw new DMLRuntimeException("Unexpected Job Type: " + inst.getJobType());
}
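For context, a minimal caller sketch (not from the source): it assumes inst is a fully prepared MRJobInstruction whose placeholders have been replaced and whose inputs reside on HDFS, and it inspects the returned metadata only when the job reports success.

// Hypothetical caller sketch; `inst` is assumed to be fully prepared.
JobReturn ret = RunMRJobs.submitJob(inst);
if (ret.checkReturnStatus()) {
    // on success, per-output matrix characteristics are available from the job metadata
    for (int i = 0; i < inst.getOutputs().length; i++)
        System.out.println(ret.getMetaData(i).getMatrixCharacteristics());
}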
use of org.apache.sysml.runtime.io.MatrixWriter in project incubator-systemml by apache.
the class JMLCInputStreamReadTest method runJMLCInputStreamReadTest.
private void runJMLCInputStreamReadTest(DataType dt, boolean sparse, String format, boolean metaData) throws IOException {
    TestConfiguration config = getTestConfiguration(TEST_NAME);
    loadTestConfiguration(config);
    // generate inputs
    OutputInfo oinfo = format.equals("csv") ? OutputInfo.CSVOutputInfo : OutputInfo.TextCellOutputInfo;
    double[][] data = TestUtils.round(getRandomMatrix(rows, cols, 0.51, 7.49, sparse ? sparsity2 : sparsity1, 7));
    Connection conn = new Connection();
    try {
        if (dt == DataType.MATRIX) {
            // write input matrix
            MatrixBlock mb = DataConverter.convertToMatrixBlock(data);
            MatrixWriter writer = MatrixWriterFactory.createMatrixWriter(oinfo);
            writer.writeMatrixToHDFS(mb, output("X"), rows, cols, -1, -1, -1);
            // read matrix from input stream
            FileInputStream fis = new FileInputStream(output("X"));
            double[][] data2 = conn.convertToDoubleMatrix(fis, rows, cols, format);
            fis.close();
            // compare matrix result
            TestUtils.compareMatrices(data, data2, rows, cols, 0);
        } else if (dt == DataType.FRAME) {
            // write input frame
            String[][] fdata = FrameTransformTest.createFrameData(data, "V");
            // test quoted tokens w/ inner quotes
            fdata[3][1] = "\"ab\"\"cdef\"";
            if (format.equals("csv"))
                // test delimiter and space tokens
                fdata[7][2] = "\"a,bc def\"";
            FrameBlock fb = DataConverter.convertToFrameBlock(fdata);
            if (metaData) {
                fb.setColumnNames(IntStream.range(0, cols).mapToObj(i -> "CC" + i).collect(Collectors.toList()).toArray(new String[0]));
            }
            FrameWriter writer = FrameWriterFactory.createFrameWriter(oinfo);
            writer.writeFrameToHDFS(fb, output("X"), rows, cols);
            // read frame from input stream
            FileInputStream fis = new FileInputStream(output("X"));
            String[][] fdata2 = conn.convertToStringFrame(fis, rows, cols, format);
            fis.close();
            // compare frame result
            TestUtils.compareFrames(fdata, fdata2, rows, cols);
        } else {
            throw new IOException("Unsupported data type: " + dt.name());
        }
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    } finally {
        MapReduceTool.deleteFileIfExistOnHDFS(output("X"));
        IOUtilFunctions.closeSilently(conn);
    }
}
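The stream-based conversion API used above can also be exercised standalone. A minimal sketch, relying only on the four-argument overload convertToDoubleMatrix(InputStream, rows, cols, format) that the test itself calls; the 2x2 CSV payload is made up for illustration.

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.sysml.api.jmlc.Connection;
import org.apache.sysml.runtime.util.IOUtilFunctions;

Connection conn = new Connection();
try {
    // illustrative in-memory CSV payload instead of a file on disk
    InputStream is = new ByteArrayInputStream("1.0,2.0\n3.0,4.0".getBytes(StandardCharsets.UTF_8));
    double[][] m = conn.convertToDoubleMatrix(is, 2, 2, "csv");
} finally {
    IOUtilFunctions.closeSilently(conn);
}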
use of org.apache.sysml.runtime.io.MatrixWriter in project incubator-systemml by apache.
the class FrameMatrixCastingTest method writeMatrixOrFrameInput.
private static void writeMatrixOrFrameInput(String fname, double[][] A, int rows, int cols, DataType dt, ValueType vt) throws IOException {
    int blksize = ConfigurationManager.getBlocksize();
    // write input data
    if (dt == DataType.FRAME) {
        FrameBlock fb = DataConverter.convertToFrameBlock(DataConverter.convertToMatrixBlock(A), vt);
        FrameWriter writer = FrameWriterFactory.createFrameWriter(OutputInfo.BinaryBlockOutputInfo);
        writer.writeFrameToHDFS(fb, fname, rows, cols);
    } else {
        MatrixBlock mb = DataConverter.convertToMatrixBlock(A);
        MatrixWriter writer = MatrixWriterFactory.createMatrixWriter(OutputInfo.BinaryBlockOutputInfo);
        writer.writeMatrixToHDFS(mb, fname, (long) rows, (long) cols, blksize, blksize, -1);
    }
    // write meta data
    MatrixCharacteristics mc = new MatrixCharacteristics(rows, cols, blksize, blksize);
    MapReduceTool.writeMetaDataFile(fname + ".mtd", vt, null, dt, mc, OutputInfo.BinaryBlockOutputInfo);
}
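A hedged companion sketch for the matrix branch: reading the binary-block file back via the reader factory, which is assumed here to mirror MatrixWriterFactory; an estimated non-zero count of -1 means unknown.

import org.apache.sysml.runtime.io.MatrixReader;
import org.apache.sysml.runtime.io.MatrixReaderFactory;
import org.apache.sysml.runtime.matrix.data.InputInfo;

// sketch: read the binary-block matrix written above back into a MatrixBlock
MatrixReader reader = MatrixReaderFactory.createMatrixReader(InputInfo.BinaryBlockInputInfo);
MatrixBlock mb2 = reader.readMatrixFromHDFS(fname, rows, cols, blksize, blksize, -1);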
use of org.apache.sysml.runtime.io.MatrixWriter in project systemml by apache.
the class DataConverter method writeMatrixToHDFS.
public static void writeMatrixToHDFS(MatrixBlock mat, String dir, OutputInfo outputinfo, MatrixCharacteristics mc, int replication, FileFormatProperties formatProperties) throws IOException {
    try {
        MatrixWriter writer = MatrixWriterFactory.createMatrixWriter(outputinfo, replication, formatProperties);
        writer.writeMatrixToHDFS(mat, dir, mc.getRows(), mc.getCols(), mc.getRowsPerBlock(), mc.getColsPerBlock(), mc.getNonZeros());
    } catch (Exception e) {
        throw new IOException(e);
    }
}
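A short usage sketch of this wrapper, assuming a text-cell target, an illustrative output path, and that a null FileFormatProperties falls through to the writer defaults.

// sketch: write a small dense matrix via the wrapper above
MatrixBlock mb = DataConverter.convertToMatrixBlock(new double[][] { { 1, 2 }, { 3, 4 } });
MatrixCharacteristics mc = new MatrixCharacteristics(2, 2, 1000, 1000, mb.getNonZeros());
// "/tmp/X" is hypothetical; replication of 1, no format properties
DataConverter.writeMatrixToHDFS(mb, "/tmp/X", OutputInfo.TextCellOutputInfo, mc, 1, null);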