use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.
the class Dag method generateMapReduceInstructions.
/**
* Method to generate MapReduce job instructions from a given set of nodes.
*
* @param execNodes list of exec nodes
* @param inst list of instructions
* @param writeinst list of write instructions
* @param deleteinst list of delete instructions
* @param rmvarinst list of rmvar instructions
* @param jt job type
*/
private void generateMapReduceInstructions(ArrayList<Lop> execNodes, ArrayList<Instruction> inst, ArrayList<Instruction> writeinst, ArrayList<Instruction> deleteinst, ArrayList<Instruction> rmvarinst, JobType jt) {
ArrayList<Byte> resultIndices = new ArrayList<>();
ArrayList<String> inputs = new ArrayList<>();
ArrayList<String> outputs = new ArrayList<>();
ArrayList<InputInfo> inputInfos = new ArrayList<>();
ArrayList<OutputInfo> outputInfos = new ArrayList<>();
ArrayList<Long> numRows = new ArrayList<>();
ArrayList<Long> numCols = new ArrayList<>();
ArrayList<Long> numRowsPerBlock = new ArrayList<>();
ArrayList<Long> numColsPerBlock = new ArrayList<>();
ArrayList<String> mapperInstructions = new ArrayList<>();
ArrayList<String> randInstructions = new ArrayList<>();
ArrayList<String> recordReaderInstructions = new ArrayList<>();
int numReducers = 0;
int replication = 1;
ArrayList<String> inputLabels = new ArrayList<>();
ArrayList<String> outputLabels = new ArrayList<>();
ArrayList<Instruction> renameInstructions = new ArrayList<>();
ArrayList<Instruction> variableInstructions = new ArrayList<>();
ArrayList<Instruction> postInstructions = new ArrayList<>();
ArrayList<Integer> MRJobLineNumbers = null;
if (DMLScript.ENABLE_DEBUG_MODE) {
MRJobLineNumbers = new ArrayList<>();
}
ArrayList<Lop> inputLops = new ArrayList<>();
boolean cellModeOverride = false;
/* Find the nodes that produce an output */
ArrayList<Lop> rootNodes = new ArrayList<>();
getOutputNodes(execNodes, rootNodes, jt);
if (LOG.isTraceEnabled())
LOG.trace("# of root nodes = " + rootNodes.size());
/* Remove transient writes that are simple copy of transient reads */
if (jt == JobType.GMR || jt == JobType.GMRCELL) {
ArrayList<Lop> markedNodes = new ArrayList<>();
// only keep data nodes that are results of some computation.
for (Lop rnode : rootNodes) {
if (rnode.getExecLocation() == ExecLocation.Data && ((Data) rnode).isTransient() && ((Data) rnode).getOperationType() == OperationTypes.WRITE && ((Data) rnode).getDataType() == DataType.MATRIX) {
// no computation, just a copy
if (rnode.getInputs().get(0).getExecLocation() == ExecLocation.Data && ((Data) rnode.getInputs().get(0)).isTransient() && rnode.getOutputParameters().getLabel().equals(rnode.getInputs().get(0).getOutputParameters().getLabel())) {
markedNodes.add(rnode);
}
}
}
// delete marked nodes
rootNodes.removeAll(markedNodes);
markedNodes.clear();
if (rootNodes.isEmpty())
return;
}
// structure that maps node to their indices that will be used in the instructions
HashMap<Lop, Integer> nodeIndexMapping = new HashMap<>();
for (Lop rnode : rootNodes) {
getInputPathsAndParameters(rnode, execNodes, inputs, inputInfos, numRows, numCols, numRowsPerBlock, numColsPerBlock, nodeIndexMapping, inputLabels, inputLops, MRJobLineNumbers);
}
// In case of RAND job, instructions are defined in the input file
if (jt == JobType.DATAGEN)
randInstructions = inputs;
int[] start_index = new int[1];
start_index[0] = inputs.size();
// currently, recordreader instructions are allowed only in GMR jobs
if (jt == JobType.GMR || jt == JobType.GMRCELL) {
for (Lop rnode : rootNodes) {
getRecordReaderInstructions(rnode, execNodes, inputs, recordReaderInstructions, nodeIndexMapping, start_index, inputLabels, inputLops, MRJobLineNumbers);
if (recordReaderInstructions.size() > 1)
throw new LopsException("MapReduce job can only have a single recordreader instruction: " + recordReaderInstructions.toString());
}
}
//
if (jt != JobType.REBLOCK && jt != JobType.CSV_REBLOCK && jt != JobType.DATAGEN) {
for (int i = 0; i < inputInfos.size(); i++) if (inputInfos.get(i) == InputInfo.BinaryCellInputInfo || inputInfos.get(i) == InputInfo.TextCellInputInfo)
cellModeOverride = true;
}
if (!recordReaderInstructions.isEmpty() || jt == JobType.GROUPED_AGG)
cellModeOverride = true;
for (int i = 0; i < rootNodes.size(); i++) {
getMapperInstructions(rootNodes.get(i), execNodes, inputs, mapperInstructions, nodeIndexMapping, start_index, inputLabels, inputLops, MRJobLineNumbers);
}
if (LOG.isTraceEnabled()) {
LOG.trace(" Input strings: " + inputs.toString());
if (jt == JobType.DATAGEN)
LOG.trace(" Rand instructions: " + getCSVString(randInstructions));
if (jt == JobType.GMR)
LOG.trace(" RecordReader instructions: " + getCSVString(recordReaderInstructions));
LOG.trace(" Mapper instructions: " + getCSVString(mapperInstructions));
}
/* Get Shuffle and Reducer Instructions */
ArrayList<String> shuffleInstructions = new ArrayList<>();
ArrayList<String> aggInstructionsReducer = new ArrayList<>();
ArrayList<String> otherInstructionsReducer = new ArrayList<>();
for (Lop rn : rootNodes) {
int resultIndex = getAggAndOtherInstructions(rn, execNodes, shuffleInstructions, aggInstructionsReducer, otherInstructionsReducer, nodeIndexMapping, start_index, inputLabels, inputLops, MRJobLineNumbers);
if (resultIndex == -1)
throw new LopsException("Unexpected error in piggybacking!");
if (rn.getExecLocation() == ExecLocation.Data && ((Data) rn).getOperationType() == Data.OperationTypes.WRITE && ((Data) rn).isTransient() && rootNodes.contains(rn.getInputs().get(0))) {
// Both rn (a transient write) and its input are root nodes.
// Instead of creating two copies of the data, simply generate a cpvar instruction
NodeOutput out = setupNodeOutputs(rn, ExecType.MR, cellModeOverride, true);
writeinst.addAll(out.getLastInstructions());
} else {
resultIndices.add(Byte.valueOf((byte) resultIndex));
// setup output filenames and outputInfos and generate related instructions
NodeOutput out = setupNodeOutputs(rn, ExecType.MR, cellModeOverride, false);
outputLabels.add(out.getVarName());
outputs.add(out.getFileName());
outputInfos.add(out.getOutInfo());
if (LOG.isTraceEnabled()) {
LOG.trace(" Output Info: " + out.getFileName() + ";" + OutputInfo.outputInfoToString(out.getOutInfo()) + ";" + out.getVarName());
}
renameInstructions.addAll(out.getLastInstructions());
variableInstructions.addAll(out.getPreInstructions());
postInstructions.addAll(out.getPostInstructions());
}
}
/* Determine if the output dimensions are known */
byte[] resultIndicesByte = new byte[resultIndices.size()];
for (int i = 0; i < resultIndicesByte.length; i++) {
resultIndicesByte[i] = resultIndices.get(i).byteValue();
}
if (LOG.isTraceEnabled()) {
LOG.trace(" Shuffle Instructions: " + getCSVString(shuffleInstructions));
LOG.trace(" Aggregate Instructions: " + getCSVString(aggInstructionsReducer));
LOG.trace(" Other instructions =" + getCSVString(otherInstructionsReducer));
LOG.trace(" Output strings: " + outputs.toString());
LOG.trace(" ResultIndices = " + resultIndices.toString());
}
/* Prepare the MapReduce job instruction */
MRJobInstruction mr = new MRJobInstruction(jt);
// check if this is a map-only job. If not, set the number of reducers
if (!shuffleInstructions.isEmpty() || !aggInstructionsReducer.isEmpty() || !otherInstructionsReducer.isEmpty())
numReducers = total_reducers;
// set inputs, outputs, and other other properties for the job
mr.setInputOutputLabels(inputLabels.toArray(new String[0]), outputLabels.toArray(new String[0]));
mr.setOutputs(resultIndicesByte);
mr.setDimsUnknownFilePrefix(getFilePath());
mr.setNumberOfReducers(numReducers);
mr.setReplication(replication);
// set instructions for recordReader and mapper
mr.setRecordReaderInstructions(getCSVString(recordReaderInstructions));
mr.setMapperInstructions(getCSVString(mapperInstructions));
// compute and set mapper memory requirements (for consistency of runtime piggybacking)
if (jt == JobType.GMR) {
double mem = 0;
for (Lop n : execNodes) mem += computeFootprintInMapper(n);
mr.setMemoryRequirements(mem);
}
if (jt == JobType.DATAGEN)
mr.setRandInstructions(getCSVString(randInstructions));
// set shuffle instructions
mr.setShuffleInstructions(getCSVString(shuffleInstructions));
// set reducer instruction
mr.setAggregateInstructionsInReducer(getCSVString(aggInstructionsReducer));
mr.setOtherInstructionsInReducer(getCSVString(otherInstructionsReducer));
if (DMLScript.ENABLE_DEBUG_MODE) {
// set line number information for each MR instruction
mr.setMRJobInstructionsLineNumbers(MRJobLineNumbers);
}
/* Add the prepared instructions to output set */
inst.addAll(variableInstructions);
inst.add(mr);
inst.addAll(postInstructions);
deleteinst.addAll(renameInstructions);
for (Lop l : inputLops) {
if (DMLScript.ENABLE_DEBUG_MODE) {
processConsumers(l, rmvarinst, deleteinst, l);
} else {
processConsumers(l, rmvarinst, deleteinst, null);
}
}
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.
the class FrameConverterTest method runConverter.
@SuppressWarnings("unchecked")
private static void runConverter(ConvType type, MatrixCharacteristics mc, MatrixCharacteristics mcMatrix, List<ValueType> schema, String fnameIn, String fnameOut) throws IOException {
SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
JavaSparkContext sc = sec.getSparkContext();
ValueType[] lschema = schema.toArray(new ValueType[0]);
MapReduceTool.deleteFileIfExistOnHDFS(fnameOut);
switch(type) {
case CSV2BIN:
{
InputInfo iinfo = InputInfo.CSVInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.csvToBinaryBlock(sc, rddIn, mc, null, false, separator, false, 0).mapToPair(new LongFrameToLongWritableFrameFunction());
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
case BIN2CSV:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
CSVFileFormatProperties fprop = new CSVFileFormatProperties();
JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToCsv(rddIn2, mc, fprop, true);
rddOut.saveAsTextFile(fnameOut);
break;
}
case TXTCELL2BIN:
{
InputInfo iinfo = InputInfo.TextCellInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.textCellToBinaryBlock(sc, rddIn, mc, lschema).mapToPair(new LongFrameToLongWritableFrameFunction());
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
case BIN2TXTCELL:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToTextCell(rddIn2, mc);
rddOut.saveAsTextFile(fnameOut);
break;
}
case MAT2BIN:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<MatrixIndexes, MatrixBlock> rddIn = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.matrixBlockToBinaryBlock(sc, rddIn, mcMatrix);
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
case BIN2MAT:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
JavaPairRDD<MatrixIndexes, MatrixBlock> rddOut = FrameRDDConverterUtils.binaryBlockToMatrixBlock(rddIn, mc, mcMatrix);
rddOut.saveAsHadoopFile(fnameOut, MatrixIndexes.class, MatrixBlock.class, oinfo.outputFormatClass);
break;
}
case DFRM2BIN:
{
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
// Create DataFrame
SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(lschema, false);
JavaRDD<Row> rowRDD = FrameRDDConverterUtils.csvToRowRDD(sc, fnameIn, separator, lschema);
Dataset<Row> df = sparkSession.createDataFrame(rowRDD, dfSchema);
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, false).mapToPair(new LongFrameToLongWritableFrameFunction());
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
case BIN2DFRM:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(sparkSession, rddIn, mc, lschema);
// Convert back DataFrame to binary block for comparison using original binary to converted DF and back to binary
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, true).mapToPair(new LongFrameToLongWritableFrameFunction());
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
default:
throw new RuntimeException("Unsuported converter type: " + type.toString());
}
sec.close();
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.
the class FrameConverterTest method runFrameConverterTest.
/**
* @param schema
* @param type
* @param instType
*/
private void runFrameConverterTest(ValueType[] schema, ConvType type) {
RUNTIME_PLATFORM platformOld = rtplatform;
DMLScript.rtplatform = RUNTIME_PLATFORM.SPARK;
boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
DMLScript.USE_LOCAL_SPARK_CONFIG = true;
try {
TestConfiguration config = getTestConfiguration(TEST_NAME);
loadTestConfiguration(config);
// data generation
double[][] A = getRandomMatrix(rows, schema.length, -10, 10, 0.9, 2373);
// prepare input/output infos
OutputInfo oinfo = null;
InputInfo iinfo = null;
switch(type) {
case CSV2BIN:
case DFRM2BIN:
oinfo = OutputInfo.CSVOutputInfo;
iinfo = InputInfo.BinaryBlockInputInfo;
break;
case BIN2CSV:
oinfo = OutputInfo.BinaryBlockOutputInfo;
iinfo = InputInfo.CSVInputInfo;
break;
case TXTCELL2BIN:
oinfo = OutputInfo.TextCellOutputInfo;
iinfo = InputInfo.BinaryBlockInputInfo;
break;
case BIN2TXTCELL:
oinfo = OutputInfo.BinaryBlockOutputInfo;
iinfo = InputInfo.TextCellInputInfo;
break;
case MAT2BIN:
case BIN2DFRM:
oinfo = OutputInfo.BinaryBlockOutputInfo;
iinfo = InputInfo.BinaryBlockInputInfo;
break;
case BIN2MAT:
oinfo = OutputInfo.BinaryBlockOutputInfo;
iinfo = InputInfo.BinaryBlockInputInfo;
break;
default:
throw new RuntimeException("Unsuported converter type: " + type.toString());
}
if (type == ConvType.MAT2BIN || type == ConvType.BIN2MAT)
runMatrixConverterAndVerify(schema, A, type, iinfo, oinfo);
else
runConverterAndVerify(schema, A, type, iinfo, oinfo);
} catch (Exception ex) {
ex.printStackTrace();
throw new RuntimeException(ex);
} finally {
DMLScript.rtplatform = platformOld;
DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
}
}
use of org.apache.sysml.runtime.matrix.data.InputInfo in project systemml by apache.
the class CachingPWriteExportTest method runTestExport.
/**
* @param outer execution mode of outer parfor loop
* @param inner execution mode of inner parfor loop
* @param instType execution mode of instructions
*/
private void runTestExport(String outputFormat) {
TestConfiguration config = getTestConfiguration(TEST_NAME);
config.addVariable("rows", rows);
config.addVariable("cols", cols);
loadTestConfiguration(config);
/* This is for running the junit test the new way, i.e., construct the arguments directly */
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + TEST_NAME + ".dml";
programArgs = new String[] { "-args", input("V"), Integer.toString(rows), Integer.toString(cols), output("V"), outputFormat };
long seed = System.nanoTime();
double[][] V = getRandomMatrix(rows, cols, 0, 1, sparsity, seed);
// always text
writeInputMatrix("V", V, true);
writeExpectedMatrix("V", V);
boolean exceptionExpected = false;
runTest(true, exceptionExpected, null, -1);
double[][] Vp = null;
try {
InputInfo ii = null;
if (outputFormat.equals("binary"))
ii = InputInfo.BinaryBlockInputInfo;
else
ii = InputInfo.TextCellInputInfo;
MatrixBlock mb = DataConverter.readMatrixFromHDFS(output("V"), ii, rows, cols, OptimizerUtils.DEFAULT_BLOCKSIZE, OptimizerUtils.DEFAULT_BLOCKSIZE, sparsity);
Vp = DataConverter.convertToDoubleMatrix(mb);
} catch (Exception ex) {
ex.printStackTrace();
throw new RuntimeException(ex);
}
// compare
for (int i = 0; i < rows; i++) for (int j = 0; j < cols; j++) if (V[i][j] != Vp[i][j])
Assert.fail("Wrong value i=" + i + ", j=" + j + ", value1=" + V[i][j] + ", value2=" + Vp[i][j]);
}
Aggregations