use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.
the class ParForProgramBlock method executeRemoteMRParForDP.
private void executeRemoteMRParForDP(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws IOException {
/* Step 0) check and recompile MR inst
* Step 1) serialize child PB and inst
* Step 2) create and serialize tasks
* Step 3) submit MR Jobs and wait for results
* Step 4) collect results from each parallel worker
*/
Timing time = (_monitor ? new Timing(true) : null);
// Step 0) check and compile to CP (if forced remote parfor)
boolean flagForced = checkMRAndRecompileToCP(0);
// Step 1) prepare partitioned input matrix (needs to happen before serializing the program)
ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock();
MatrixObject inputMatrix = ec.getMatrixObject(_colocatedDPMatrix);
PartitionFormat inputDPF = sb.determineDataPartitionFormat(_colocatedDPMatrix);
// mark matrix var as partitioned
inputMatrix.setPartitioned(inputDPF._dpf, inputDPF._N);
// Step 2) init parallel workers (serialize PBs)
// NOTES: each mapper changes filenames with regard to his ID as we submit a single
// job, cannot reuse serialized string, since variables are serialized as well.
ParForBody body = new ParForBody(_childBlocks, _resultVars, ec);
String program = ProgramConverter.serializeParForBody(body);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, time.stop());
// Step 3) create tasks
TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
String resultFile = constructResultFileName();
long numIterations = partitioner.getNumIterations();
// partitioner.createTasks().size();
long numCreatedTasks = numIterations;
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
// write matrices to HDFS
exportMatricesToHDFS(ec);
// Step 4) submit MR job (wait for finished work)
OutputInfo inputOI = ((inputMatrix.getSparsity() < 0.1 && inputDPF == PartitionFormat.COLUMN_WISE) || (inputMatrix.getSparsity() < 0.001 && inputDPF == PartitionFormat.ROW_WISE)) ? OutputInfo.BinaryCellOutputInfo : OutputInfo.BinaryBlockOutputInfo;
RemoteParForJobReturn ret = RemoteDPParForMR.runJob(_ID, _iterPredVar, _colocatedDPMatrix, program, resultFile, inputMatrix, inputDPF, inputOI, _tSparseCol, _enableCPCaching, _numThreads, _replicationDP);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
// Step 5) collecting results from each parallel worker
int numExecutedTasks = ret.getNumExecutedTasks();
int numExecutedIterations = ret.getNumExecutedIterations();
// consolidate results into global symbol table
consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, ret.getVariables());
if (// see step 0
flagForced)
releaseForcedRecompile(0);
inputMatrix.unsetPartitioned();
if (_monitor) {
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
}
}
use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.
the class ParForProgramBlock method executeRemoteSparkParForDP.
private void executeRemoteSparkParForDP(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws IOException {
Timing time = (_monitor ? new Timing(true) : null);
// Step 0) check and compile to CP (if forced remote parfor)
boolean flagForced = checkMRAndRecompileToCP(0);
// Step 1) prepare partitioned input matrix (needs to happen before serializing the program)
ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock();
MatrixObject inputMatrix = ec.getMatrixObject(_colocatedDPMatrix);
PartitionFormat inputDPF = sb.determineDataPartitionFormat(_colocatedDPMatrix);
// mark matrix var as partitioned
inputMatrix.setPartitioned(inputDPF._dpf, inputDPF._N);
// Step 2) init parallel workers (serialize PBs)
// NOTES: each mapper changes filenames with regard to his ID as we submit a single
// job, cannot reuse serialized string, since variables are serialized as well.
ParForBody body = new ParForBody(_childBlocks, _resultVars, ec);
HashMap<String, byte[]> clsMap = new HashMap<>();
String program = ProgramConverter.serializeParForBody(body, clsMap);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, time.stop());
// Step 3) create tasks
TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
String resultFile = constructResultFileName();
long numIterations = partitioner.getNumIterations();
// partitioner.createTasks().size();
long numCreatedTasks = numIterations;
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
// write matrices to HDFS, except DP matrix which is the input to the RemoteDPParForSpark job
exportMatricesToHDFS(ec, _colocatedDPMatrix);
// Step 4) submit MR job (wait for finished work)
// TODO runtime support for binary cell partitioning
OutputInfo inputOI = OutputInfo.BinaryBlockOutputInfo;
RemoteParForJobReturn ret = RemoteDPParForSpark.runJob(_ID, _iterPredVar, _colocatedDPMatrix, program, clsMap, resultFile, inputMatrix, ec, inputDPF, inputOI, _tSparseCol, _enableCPCaching, _numThreads);
if (_monitor)
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
// Step 5) collecting results from each parallel worker
int numExecutedTasks = ret.getNumExecutedTasks();
int numExecutedIterations = ret.getNumExecutedIterations();
// consolidate results into global symbol table
consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, ret.getVariables());
if (// see step 0
flagForced)
releaseForcedRecompile(0);
inputMatrix.unsetPartitioned();
if (_monitor) {
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
}
}
use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.
the class VariableCPInstruction method parseInstruction.
public static VariableCPInstruction parseInstruction(String str) {
String[] parts = InstructionUtils.getInstructionPartsWithValueType(str);
String opcode = parts[0];
VariableOperationCode voc = getVariableOperationCode(opcode);
if (voc == VariableOperationCode.CreateVariable) {
if (// && parts.length != 10 )
parts.length < 5)
throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
} else if (voc == VariableOperationCode.MoveVariable) {
// mvvar tempA A; or mvvar mvar5 "data/out.mtx" "binary"
if (parts.length != 3 && parts.length != 4)
throw new DMLRuntimeException("Invalid number of operands in mvvar instruction: " + str);
} else if (voc == VariableOperationCode.Write) {
// Write instructions for csv files also include three additional parameters (hasHeader, delimiter, sparse)
if (parts.length != 5 && parts.length != 8)
throw new DMLRuntimeException("Invalid number of operands in write instruction: " + str);
} else {
if (voc != VariableOperationCode.RemoveVariable)
// no output
InstructionUtils.checkNumFields(parts, getArity(voc));
}
CPOperand in1 = null, in2 = null, in3 = null, in4 = null, out = null;
switch(voc) {
case CreateVariable:
// variable name
DataType dt = DataType.valueOf(parts[4]);
ValueType vt = dt == DataType.MATRIX ? ValueType.DOUBLE : ValueType.STRING;
int extSchema = (dt == DataType.FRAME && parts.length >= 13) ? 1 : 0;
in1 = new CPOperand(parts[1], vt, dt);
// file name
in2 = new CPOperand(parts[2], ValueType.STRING, DataType.SCALAR);
// file name override flag (always literal)
in3 = new CPOperand(parts[3], ValueType.BOOLEAN, DataType.SCALAR);
// format
String fmt = parts[5];
if (fmt.equalsIgnoreCase("csv")) {
// 14 inputs: createvar corresponding to READ -- includes properties hasHeader, delim, fill, and fillValue
if (parts.length < 15 + extSchema || parts.length > 17 + extSchema)
throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
} else {
if (parts.length != 6 && parts.length != 12 + extSchema)
throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
}
OutputInfo oi = OutputInfo.stringToOutputInfo(fmt);
InputInfo ii = OutputInfo.getMatchingInputInfo(oi);
MatrixCharacteristics mc = new MatrixCharacteristics();
if (parts.length == 6) {
// do nothing
} else if (parts.length >= 11) {
// matrix characteristics
mc.setDimension(Long.parseLong(parts[6]), Long.parseLong(parts[7]));
mc.setBlockSize(Integer.parseInt(parts[8]), Integer.parseInt(parts[9]));
mc.setNonZeros(Long.parseLong(parts[10]));
} else {
throw new DMLRuntimeException("Invalid number of operands in createvar instruction: " + str);
}
MetaDataFormat iimd = new MetaDataFormat(mc, oi, ii);
UpdateType updateType = UpdateType.COPY;
if (parts.length >= 12)
updateType = UpdateType.valueOf(parts[11].toUpperCase());
// handle frame schema
String schema = (dt == DataType.FRAME && parts.length >= 13) ? parts[parts.length - 1] : null;
if (fmt.equalsIgnoreCase("csv")) {
// Cretevar instructions for CSV format either has 13 or 14 inputs.
// 13 inputs: createvar corresponding to WRITE -- includes properties hasHeader, delim, and sparse
// 14 inputs: createvar corresponding to READ -- includes properties hasHeader, delim, fill, and fillValue
FileFormatProperties fmtProperties = null;
if (parts.length == 15 + extSchema) {
boolean hasHeader = Boolean.parseBoolean(parts[12]);
String delim = parts[13];
boolean sparse = Boolean.parseBoolean(parts[14]);
fmtProperties = new CSVFileFormatProperties(hasHeader, delim, sparse);
} else {
boolean hasHeader = Boolean.parseBoolean(parts[12]);
String delim = parts[13];
boolean fill = Boolean.parseBoolean(parts[14]);
double fillValue = UtilFunctions.parseToDouble(parts[15]);
String naStrings = null;
if (parts.length == 17 + extSchema)
naStrings = parts[16];
fmtProperties = new CSVFileFormatProperties(hasHeader, delim, fill, fillValue, naStrings);
}
return new VariableCPInstruction(VariableOperationCode.CreateVariable, in1, in2, in3, iimd, updateType, fmtProperties, schema, opcode, str);
} else {
return new VariableCPInstruction(VariableOperationCode.CreateVariable, in1, in2, in3, iimd, updateType, schema, opcode, str);
}
case AssignVariable:
in1 = new CPOperand(parts[1]);
in2 = new CPOperand(parts[2]);
break;
case CopyVariable:
// Value types are not given here
in1 = new CPOperand(parts[1], ValueType.UNKNOWN, DataType.UNKNOWN);
in2 = new CPOperand(parts[2], ValueType.UNKNOWN, DataType.UNKNOWN);
break;
case MoveVariable:
in1 = new CPOperand(parts[1], ValueType.UNKNOWN, DataType.UNKNOWN);
in2 = new CPOperand(parts[2], ValueType.UNKNOWN, DataType.UNKNOWN);
if (parts.length > 3)
in3 = new CPOperand(parts[3], ValueType.UNKNOWN, DataType.UNKNOWN);
break;
case RemoveVariable:
VariableCPInstruction rminst = new VariableCPInstruction(getVariableOperationCode(opcode), null, null, null, out, opcode, str);
for (int i = 1; i < parts.length; i++) rminst.addInput(new CPOperand(parts[i], ValueType.UNKNOWN, DataType.SCALAR));
return rminst;
case RemoveVariableAndFile:
in1 = new CPOperand(parts[1]);
in2 = new CPOperand(parts[2]);
// second argument must be a boolean
if (in2.getValueType() != ValueType.BOOLEAN)
throw new DMLRuntimeException("Unexpected value type for second argument in: " + str);
break;
case CastAsScalarVariable:
case CastAsMatrixVariable:
case CastAsFrameVariable:
case CastAsDoubleVariable:
case CastAsIntegerVariable:
case CastAsBooleanVariable:
// first operand is a variable name => string value type
in1 = new CPOperand(parts[1]);
// output variable name
out = new CPOperand(parts[2]);
break;
case Write:
in1 = new CPOperand(parts[1]);
in2 = new CPOperand(parts[2]);
in3 = new CPOperand(parts[3]);
FileFormatProperties fprops = null;
if (in3.getName().equalsIgnoreCase("csv")) {
boolean hasHeader = Boolean.parseBoolean(parts[4]);
String delim = parts[5];
boolean sparse = Boolean.parseBoolean(parts[6]);
fprops = new CSVFileFormatProperties(hasHeader, delim, sparse);
// description
in4 = new CPOperand(parts[7]);
} else {
fprops = new FileFormatProperties();
// description
in4 = new CPOperand(parts[4]);
}
VariableCPInstruction inst = new VariableCPInstruction(getVariableOperationCode(opcode), in1, in2, in3, out, null, fprops, null, null, opcode, str);
inst.addInput(in4);
return inst;
case Read:
in1 = new CPOperand(parts[1]);
in2 = new CPOperand(parts[2]);
out = null;
break;
case SetFileName:
// variable name
in1 = new CPOperand(parts[1]);
// file name
in2 = new CPOperand(parts[2], ValueType.UNKNOWN, DataType.UNKNOWN);
// option: remote or local
in3 = new CPOperand(parts[3], ValueType.UNKNOWN, DataType.UNKNOWN);
// return new VariableCPInstruction(getVariableOperationCode(opcode), in1, in2, in3, str);
break;
}
return new VariableCPInstruction(getVariableOperationCode(opcode), in1, in2, in3, out, opcode, str);
}
use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.
the class WriteSPInstruction method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
// get filename (literal or variable expression)
String fname = ec.getScalarInput(input2.getName(), ValueType.STRING, input2.isLiteral()).getStringValue();
String desc = ec.getScalarInput(input4.getName(), ValueType.STRING, input4.isLiteral()).getStringValue();
formatProperties.setDescription(desc);
ValueType[] schema = (input1.getDataType() == DataType.FRAME) ? sec.getFrameObject(input1.getName()).getSchema() : null;
try {
// if the file already exists on HDFS, remove it.
MapReduceTool.deleteFileIfExistOnHDFS(fname);
// prepare output info according to meta data
String outFmt = input3.getName();
OutputInfo oi = OutputInfo.stringToOutputInfo(outFmt);
// core matrix/frame write
if (input1.getDataType() == DataType.MATRIX)
processMatrixWriteInstruction(sec, fname, oi);
else
processFrameWriteInstruction(sec, fname, oi, schema);
} catch (IOException ex) {
throw new DMLRuntimeException("Failed to process write instruction", ex);
}
}
use of org.apache.sysml.runtime.matrix.data.OutputInfo in project incubator-systemml by apache.
the class WriteCSVMR method runJob.
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String csvWriteInstructions, int numReducers, int replication, byte[] resultIndexes, String[] outputs) throws Exception {
JobConf job = new JobConf(WriteCSVMR.class);
job.setJobName("WriteCSV-MR");
// check for valid output dimensions
for (int i = 0; i < rlens.length; i++) if (rlens[i] == 0 || clens[i] == 0)
throw new IOException("Write of matrices with zero" + " rows or columns not supported (" + rlens[i] + "x" + clens[i] + ").");
byte[] realIndexes = new byte[inputs.length];
for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
// set up the input files and their format information
MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, true, ConvertTarget.CSVWRITE);
// set up the dimensions of input matrices
MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
// set up the block size
MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
MRJobConfiguration.setCSVWriteInstructions(job, csvWriteInstructions);
// set up the replication factor for the results
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
// set up preferred custom serialization framework for binary block format
if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
MRJobConfiguration.addBinaryBlockSerializationFramework(job);
// set up custom map/reduce configurations
DMLConfig config = ConfigurationManager.getDMLConfig();
MRJobConfiguration.setupCustomMRConfigurations(job, config);
long maxRlen = 0;
for (long rlen : rlens) if (rlen > maxRlen)
maxRlen = rlen;
// set up the number of reducers (according to output size)
int numRed = determineNumReducers(rlens, clens, config.getIntValue(DMLConfig.NUM_REDUCERS), (int) maxRlen);
job.setNumReduceTasks(numRed);
byte[] resultDimsUnknown = new byte[resultIndexes.length];
MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
OutputInfo[] outputInfos = new OutputInfo[outputs.length];
HashMap<Byte, Integer> indexmap = new HashMap<>();
for (int i = 0; i < stats.length; i++) {
indexmap.put(resultIndexes[i], i);
resultDimsUnknown[i] = (byte) 0;
stats[i] = new MatrixCharacteristics();
outputInfos[i] = OutputInfo.CSVOutputInfo;
}
CSVWriteInstruction[] ins = MRInstructionParser.parseCSVWriteInstructions(csvWriteInstructions);
for (CSVWriteInstruction in : ins) stats[indexmap.get(in.output)].set(rlens[in.input], clens[in.input], -1, -1);
// Print the complete instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(stats);
// set up what matrices are needed to pass from the mapper to reducer
MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, "", "", csvWriteInstructions, resultIndexes);
// set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, true);
// configure mapper and the mapper output key value pairs
job.setMapperClass(CSVWriteMapper.class);
job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
job.setMapOutputValueClass(MatrixBlock.class);
// configure reducer
job.setReducerClass(CSVWriteReducer.class);
job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class);
job.setPartitionerClass(TaggedFirstSecondIndexes.FirstIndexRangePartitioner.class);
// job.setOutputFormat(UnPaddedOutputFormat.class);
MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
for (int i = 0; i < inputs.length; i++) {
inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
}
// set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
RunningJob runjob = JobClient.runJob(job);
/* Process different counters */
Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
for (int i = 0; i < resultIndexes.length; i++) {
// number of non-zeros
stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
}
return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Aggregations