Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project systemml by apache.
The class DataPartitionerRemoteReducer, method configure:
public void configure(JobConf job) {
	String fnameNew = MRJobConfiguration.getPartitioningFilename(job);
	OutputInfo oi = MRJobConfiguration.getPartitioningOutputInfo(job);
	if (oi == OutputInfo.TextCellOutputInfo)
		_reducer = new DataPartitionerReducerTextcell(job, fnameNew);
	else if (oi == OutputInfo.BinaryCellOutputInfo)
		_reducer = new DataPartitionerReducerBinarycell(job, fnameNew);
	else if (oi == OutputInfo.BinaryBlockOutputInfo)
		_reducer = new DataPartitionerReducerBinaryblock(job, fnameNew);
	else
		throw new RuntimeException("Unable to configure reducer with unknown output info: " + oi.toString());
}
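
Since OutputInfo exposes exactly one shared instance per format, the reference comparisons above are sufficient for dispatch. For diagnostics, the descriptor can also be rendered as text via OutputInfo.outputInfoToString (used the same way in the Dag snippet further below). A minimal sketch, where describePartitioningFormat is a hypothetical helper name:

static String describePartitioningFormat(JobConf job) {
	// resolve the configured descriptor, then map the shared instance
	// back to its format name (hypothetical helper, not part of SystemML)
	OutputInfo oi = MRJobConfiguration.getPartitioningOutputInfo(job);
	return OutputInfo.outputInfoToString(oi);
}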
Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project systemml by apache.
The class JMLCInputStreamReadTest, method runJMLCInputStreamReadTest:
private void runJMLCInputStreamReadTest(DataType dt, boolean sparse, String format, boolean metaData) throws IOException {
	TestConfiguration config = getTestConfiguration(TEST_NAME);
	loadTestConfiguration(config);
	// generate inputs
	OutputInfo oinfo = format.equals("csv") ? OutputInfo.CSVOutputInfo : OutputInfo.TextCellOutputInfo;
	double[][] data = TestUtils.round(getRandomMatrix(rows, cols, 0.51, 7.49, sparse ? sparsity2 : sparsity1, 7));
	Connection conn = new Connection();
	try {
		if (dt == DataType.MATRIX) {
			// write input matrix
			MatrixBlock mb = DataConverter.convertToMatrixBlock(data);
			MatrixWriter writer = MatrixWriterFactory.createMatrixWriter(oinfo);
			writer.writeMatrixToHDFS(mb, output("X"), rows, cols, -1, -1, -1);
			// read matrix from input stream
			FileInputStream fis = new FileInputStream(output("X"));
			double[][] data2 = conn.convertToDoubleMatrix(fis, rows, cols, format);
			fis.close();
			// compare matrix result
			TestUtils.compareMatrices(data, data2, rows, cols, 0);
		}
		else if (dt == DataType.FRAME) {
			// write input frame
			String[][] fdata = FrameTransformTest.createFrameData(data, "V");
			// test quoted tokens w/ inner quotes
			fdata[3][1] = "\"ab\"\"cdef\"";
			if (format.equals("csv"))
				// test delimiter and space tokens
				fdata[7][2] = "\"a,bc def\"";
			FrameBlock fb = DataConverter.convertToFrameBlock(fdata);
			if (metaData) {
				fb.setColumnNames(IntStream.range(0, cols).mapToObj(i -> "CC" + i).collect(Collectors.toList()).toArray(new String[0]));
			}
			FrameWriter writer = FrameWriterFactory.createFrameWriter(oinfo);
			writer.writeFrameToHDFS(fb, output("X"), rows, cols);
			// read frame from input stream
			FileInputStream fis = new FileInputStream(output("X"));
			String[][] fdata2 = conn.convertToStringFrame(fis, rows, cols, format);
			fis.close();
			// compare frame result
			TestUtils.compareFrames(fdata, fdata2, rows, cols);
		}
		else {
			throw new IOException("Unsupported data type: " + dt.name());
		}
	}
	catch (Exception ex) {
		throw new RuntimeException(ex);
	}
	finally {
		MapReduceTool.deleteFileIfExistOnHDFS(output("X"));
		IOUtilFunctions.closeSilently(conn);
	}
}
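
Stripped of the frame branch and the test harness, the matrix round trip at the heart of this test looks roughly as follows; the path /tmp/X, the "text" format string, and the helper name are placeholder assumptions:

// sketch of the write-then-stream-read round trip (assumed names/paths)
static double[][] roundTripTextCell(double[][] data, int rows, int cols) throws Exception {
	MatrixBlock mb = DataConverter.convertToMatrixBlock(data);
	MatrixWriter writer = MatrixWriterFactory.createMatrixWriter(OutputInfo.TextCellOutputInfo);
	// -1, -1, -1: default block sizes and unknown nnz, as in the test above
	writer.writeMatrixToHDFS(mb, "/tmp/X", rows, cols, -1, -1, -1);
	Connection conn = new Connection();
	try (FileInputStream fis = new FileInputStream("/tmp/X")) {
		// "text" is assumed to select the text-cell format, mirroring the test's mapping
		return conn.convertToDoubleMatrix(fis, rows, cols, "text");
	}
	finally {
		IOUtilFunctions.closeSilently(conn);
	}
}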
Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project systemml by apache.
The class SortMR, method runStitchupJob:
private static boolean runStitchupJob(String input, long rlen, long clen, int brlen, int bclen, long[] counts, int numReducers, int replication, String output) throws Exception {
	JobConf job = new JobConf(SortMR.class);
	job.setJobName("SortIndexesMR");
	// setup input/output paths
	Path inpath = new Path(input);
	Path outpath = new Path(output);
	FileInputFormat.setInputPaths(job, inpath);
	FileOutputFormat.setOutputPath(job, outpath);
	MapReduceTool.deleteFileIfExistOnHDFS(outpath, job);
	// set number of reducers (1 if local mode)
	if (InfrastructureAnalyzer.isLocalMode(job))
		job.setNumReduceTasks(1);
	else
		MRJobConfiguration.setNumReducers(job, numReducers, numReducers);
	// setup input/output format
	InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
	OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
	job.setInputFormat(iinfo.inputFormatClass);
	job.setOutputFormat(oinfo.outputFormatClass);
	CompactInputFormat.setKeyValueClasses(job, MatrixIndexes.class, MatrixBlock.class);
	// setup mapper/reducer/output classes
	MRJobConfiguration.setInputInfo(job, (byte) 0, InputInfo.BinaryBlockInputInfo, brlen, bclen, ConvertTarget.BLOCK);
	job.setMapperClass(IndexSortStitchupMapper.class);
	job.setReducerClass(IndexSortStitchupReducer.class);
	job.setOutputKeyClass(oinfo.outputKeyClass);
	job.setOutputValueClass(oinfo.outputValueClass);
	MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen);
	MRJobConfiguration.setMatricesDimensions(job, new byte[] { 0 }, new long[] { rlen }, new long[] { clen });
	// compute shifted prefix sum of offsets and put into configuration
	long[] cumsumCounts = new long[counts.length];
	long sum = 0;
	for (int i = 0; i < counts.length; i++) {
		cumsumCounts[i] = sum;
		sum += counts[i];
	}
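	// e.g., counts = { 3, 5, 2 } yields cumsumCounts = { 0, 3, 8 }:
	// each sorted run is shifted by the total size of all preceding runs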
	job.set(SORT_INDEXES_OFFSETS, Arrays.toString(cumsumCounts));
	// setup replication factor
	job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
	// set unique working dir
	MRJobConfiguration.setUniqueWorkingDir(job);
	// run mr job
	RunningJob runJob = JobClient.runJob(job);
	return runJob.isSuccessful();
}
Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project systemml by apache.
The class WriteCSVMR, method runJob:
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String csvWriteInstructions, int numReducers, int replication, byte[] resultIndexes, String[] outputs) throws Exception {
	JobConf job = new JobConf(WriteCSVMR.class);
	job.setJobName("WriteCSV-MR");
	// check for valid output dimensions
	for (int i = 0; i < rlens.length; i++)
		if (rlens[i] == 0 || clens[i] == 0)
			throw new IOException("Write of matrices with zero rows or columns not supported (" + rlens[i] + "x" + clens[i] + ").");
	byte[] realIndexes = new byte[inputs.length];
	for (byte b = 0; b < realIndexes.length; b++)
		realIndexes[b] = b;
	// set up the input files and their format information
	MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, true, ConvertTarget.CSVWRITE);
	// set up the dimensions of input matrices
	MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
	// set up the block size
	MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
	MRJobConfiguration.setCSVWriteInstructions(job, csvWriteInstructions);
	// set up the replication factor for the results
	job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
	// set up preferred custom serialization framework for binary block format
	if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
		MRJobConfiguration.addBinaryBlockSerializationFramework(job);
	// set up custom map/reduce configurations
	DMLConfig config = ConfigurationManager.getDMLConfig();
	MRJobConfiguration.setupCustomMRConfigurations(job, config);
	long maxRlen = 0;
	for (long rlen : rlens)
		if (rlen > maxRlen)
			maxRlen = rlen;
	// set up the number of reducers (according to output size)
	int numRed = determineNumReducers(rlens, clens, config.getIntValue(DMLConfig.NUM_REDUCERS), (int) maxRlen);
	job.setNumReduceTasks(numRed);
	byte[] resultDimsUnknown = new byte[resultIndexes.length];
	MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
	OutputInfo[] outputInfos = new OutputInfo[outputs.length];
	HashMap<Byte, Integer> indexmap = new HashMap<>();
	for (int i = 0; i < stats.length; i++) {
		indexmap.put(resultIndexes[i], i);
		resultDimsUnknown[i] = (byte) 0;
		stats[i] = new MatrixCharacteristics();
		outputInfos[i] = OutputInfo.CSVOutputInfo;
	}
	CSVWriteInstruction[] ins = MRInstructionParser.parseCSVWriteInstructions(csvWriteInstructions);
	for (CSVWriteInstruction in : ins)
		stats[indexmap.get(in.output)].set(rlens[in.input], clens[in.input], -1, -1);
	// print the complete instruction
	if (LOG.isTraceEnabled())
		inst.printCompleteMRJobInstruction(stats);
	// set up what matrices are needed to pass from the mapper to reducer
	MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, "", "", csvWriteInstructions, resultIndexes);
	// set up the multiple output files and their format information
	MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, true);
	// configure mapper and the mapper output key value pairs
	job.setMapperClass(CSVWriteMapper.class);
	job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
	job.setMapOutputValueClass(MatrixBlock.class);
	// configure reducer
	job.setReducerClass(CSVWriteReducer.class);
	job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class);
	job.setPartitionerClass(TaggedFirstSecondIndexes.FirstIndexRangePartitioner.class);
	// job.setOutputFormat(UnPaddedOutputFormat.class);
	MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
	for (int i = 0; i < inputs.length; i++) {
		inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
	}
	// set unique working dir
	MRJobConfiguration.setUniqueWorkingDir(job);
	RunningJob runjob = JobClient.runJob(job);
	/* Process different counters */
	Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
	for (int i = 0; i < resultIndexes.length; i++) {
		// number of non-zeros
		stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
	}
	return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
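
The counter loop at the end reads per-output non-zero counts that the job's tasks are expected to publish under the MRJobConfiguration.NUM_NONZERO_CELLS group. A hypothetical reducer-side counterpart might look like this sketch (the actual CSVWriteReducer internals are not shown here):

// hypothetical: publish the per-output nnz counter that runJob reads back
static void reportNonZeros(Reporter reporter, byte outputTag, MatrixBlock block) {
	reporter.incrCounter(MRJobConfiguration.NUM_NONZERO_CELLS, Integer.toString(outputTag), block.getNonZeros());
}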
Use of org.apache.sysml.runtime.matrix.data.OutputInfo in project systemml by apache.
The class Dag, method generateMapReduceInstructions:
/**
* Method to generate MapReduce job instructions from a given set of nodes.
*
* @param execNodes list of exec nodes
* @param inst list of instructions
* @param writeinst list of write instructions
* @param deleteinst list of delete instructions
* @param rmvarinst list of rmvar instructions
* @param jt job type
*/
private void generateMapReduceInstructions(ArrayList<Lop> execNodes, ArrayList<Instruction> inst, ArrayList<Instruction> writeinst, ArrayList<Instruction> deleteinst, ArrayList<Instruction> rmvarinst, JobType jt) {
	ArrayList<Byte> resultIndices = new ArrayList<>();
	ArrayList<String> inputs = new ArrayList<>();
	ArrayList<String> outputs = new ArrayList<>();
	ArrayList<InputInfo> inputInfos = new ArrayList<>();
	ArrayList<OutputInfo> outputInfos = new ArrayList<>();
	ArrayList<Long> numRows = new ArrayList<>();
	ArrayList<Long> numCols = new ArrayList<>();
	ArrayList<Long> numRowsPerBlock = new ArrayList<>();
	ArrayList<Long> numColsPerBlock = new ArrayList<>();
	ArrayList<String> mapperInstructions = new ArrayList<>();
	ArrayList<String> randInstructions = new ArrayList<>();
	ArrayList<String> recordReaderInstructions = new ArrayList<>();
	int numReducers = 0;
	int replication = 1;
	ArrayList<String> inputLabels = new ArrayList<>();
	ArrayList<String> outputLabels = new ArrayList<>();
	ArrayList<Instruction> renameInstructions = new ArrayList<>();
	ArrayList<Instruction> variableInstructions = new ArrayList<>();
	ArrayList<Instruction> postInstructions = new ArrayList<>();
	ArrayList<Integer> MRJobLineNumbers = null;
	if (DMLScript.ENABLE_DEBUG_MODE) {
		MRJobLineNumbers = new ArrayList<>();
	}
	ArrayList<Lop> inputLops = new ArrayList<>();
	boolean cellModeOverride = false;
	/* Find the nodes that produce an output */
	ArrayList<Lop> rootNodes = new ArrayList<>();
	getOutputNodes(execNodes, rootNodes, jt);
	if (LOG.isTraceEnabled())
		LOG.trace("# of root nodes = " + rootNodes.size());
	/* Remove transient writes that are simple copies of transient reads */
	if (jt == JobType.GMR || jt == JobType.GMRCELL) {
		ArrayList<Lop> markedNodes = new ArrayList<>();
		// only keep data nodes that are results of some computation
		for (Lop rnode : rootNodes) {
			if (rnode.getExecLocation() == ExecLocation.Data && ((Data) rnode).isTransient() && ((Data) rnode).getOperationType() == OperationTypes.WRITE && ((Data) rnode).getDataType() == DataType.MATRIX) {
				// no computation, just a copy
				if (rnode.getInputs().get(0).getExecLocation() == ExecLocation.Data && ((Data) rnode.getInputs().get(0)).isTransient() && rnode.getOutputParameters().getLabel().equals(rnode.getInputs().get(0).getOutputParameters().getLabel())) {
					markedNodes.add(rnode);
				}
			}
		}
		// delete marked nodes
		rootNodes.removeAll(markedNodes);
		markedNodes.clear();
		if (rootNodes.isEmpty())
			return;
	}
	// structure that maps nodes to the indices used in the instructions
	HashMap<Lop, Integer> nodeIndexMapping = new HashMap<>();
	for (Lop rnode : rootNodes) {
		getInputPathsAndParameters(rnode, execNodes, inputs, inputInfos, numRows, numCols, numRowsPerBlock, numColsPerBlock, nodeIndexMapping, inputLabels, inputLops, MRJobLineNumbers);
	}
	// in case of a RAND job, instructions are defined in the input file
	if (jt == JobType.DATAGEN)
		randInstructions = inputs;
	int[] start_index = new int[1];
	start_index[0] = inputs.size();
	// currently, recordreader instructions are allowed only in GMR jobs
	if (jt == JobType.GMR || jt == JobType.GMRCELL) {
		for (Lop rnode : rootNodes) {
			getRecordReaderInstructions(rnode, execNodes, inputs, recordReaderInstructions, nodeIndexMapping, start_index, inputLabels, inputLops, MRJobLineNumbers);
			if (recordReaderInstructions.size() > 1)
				throw new LopsException("MapReduce job can only have a single recordreader instruction: " + recordReaderInstructions.toString());
		}
	}
	// determine whether cell mode must be enforced: any cell-based input forces cell mode
	if (jt != JobType.REBLOCK && jt != JobType.CSV_REBLOCK && jt != JobType.DATAGEN) {
		for (int i = 0; i < inputInfos.size(); i++)
			if (inputInfos.get(i) == InputInfo.BinaryCellInputInfo || inputInfos.get(i) == InputInfo.TextCellInputInfo)
				cellModeOverride = true;
	}
	if (!recordReaderInstructions.isEmpty() || jt == JobType.GROUPED_AGG)
		cellModeOverride = true;
	for (int i = 0; i < rootNodes.size(); i++) {
		getMapperInstructions(rootNodes.get(i), execNodes, inputs, mapperInstructions, nodeIndexMapping, start_index, inputLabels, inputLops, MRJobLineNumbers);
	}
	if (LOG.isTraceEnabled()) {
		LOG.trace(" Input strings: " + inputs.toString());
		if (jt == JobType.DATAGEN)
			LOG.trace(" Rand instructions: " + getCSVString(randInstructions));
		if (jt == JobType.GMR)
			LOG.trace(" RecordReader instructions: " + getCSVString(recordReaderInstructions));
		LOG.trace(" Mapper instructions: " + getCSVString(mapperInstructions));
	}
	/* Get Shuffle and Reducer Instructions */
	ArrayList<String> shuffleInstructions = new ArrayList<>();
	ArrayList<String> aggInstructionsReducer = new ArrayList<>();
	ArrayList<String> otherInstructionsReducer = new ArrayList<>();
	for (Lop rn : rootNodes) {
		int resultIndex = getAggAndOtherInstructions(rn, execNodes, shuffleInstructions, aggInstructionsReducer, otherInstructionsReducer, nodeIndexMapping, start_index, inputLabels, inputLops, MRJobLineNumbers);
		if (resultIndex == -1)
			throw new LopsException("Unexpected error in piggybacking!");
		if (rn.getExecLocation() == ExecLocation.Data && ((Data) rn).getOperationType() == Data.OperationTypes.WRITE && ((Data) rn).isTransient() && rootNodes.contains(rn.getInputs().get(0))) {
			// Both rn (a transient write) and its input are root nodes.
			// Instead of creating two copies of the data, simply generate a cpvar instruction.
			NodeOutput out = setupNodeOutputs(rn, ExecType.MR, cellModeOverride, true);
			writeinst.addAll(out.getLastInstructions());
		}
		else {
			resultIndices.add(Byte.valueOf((byte) resultIndex));
			// setup output filenames and outputInfos and generate related instructions
			NodeOutput out = setupNodeOutputs(rn, ExecType.MR, cellModeOverride, false);
			outputLabels.add(out.getVarName());
			outputs.add(out.getFileName());
			outputInfos.add(out.getOutInfo());
			if (LOG.isTraceEnabled()) {
				LOG.trace(" Output Info: " + out.getFileName() + ";" + OutputInfo.outputInfoToString(out.getOutInfo()) + ";" + out.getVarName());
			}
			renameInstructions.addAll(out.getLastInstructions());
			variableInstructions.addAll(out.getPreInstructions());
			postInstructions.addAll(out.getPostInstructions());
		}
	}
	/* Determine if the output dimensions are known */
	byte[] resultIndicesByte = new byte[resultIndices.size()];
	for (int i = 0; i < resultIndicesByte.length; i++) {
		resultIndicesByte[i] = resultIndices.get(i).byteValue();
	}
	if (LOG.isTraceEnabled()) {
		LOG.trace(" Shuffle Instructions: " + getCSVString(shuffleInstructions));
		LOG.trace(" Aggregate Instructions: " + getCSVString(aggInstructionsReducer));
		LOG.trace(" Other instructions =" + getCSVString(otherInstructionsReducer));
		LOG.trace(" Output strings: " + outputs.toString());
		LOG.trace(" ResultIndices = " + resultIndices.toString());
	}
	/* Prepare the MapReduce job instruction */
	MRJobInstruction mr = new MRJobInstruction(jt);
	// check if this is a map-only job; if not, set the number of reducers
	if (!shuffleInstructions.isEmpty() || !aggInstructionsReducer.isEmpty() || !otherInstructionsReducer.isEmpty())
		numReducers = total_reducers;
	// set inputs, outputs, and other properties for the job
	mr.setInputOutputLabels(inputLabels.toArray(new String[0]), outputLabels.toArray(new String[0]));
	mr.setOutputs(resultIndicesByte);
	mr.setDimsUnknownFilePrefix(getFilePath());
	mr.setNumberOfReducers(numReducers);
	mr.setReplication(replication);
	// set instructions for recordReader and mapper
	mr.setRecordReaderInstructions(getCSVString(recordReaderInstructions));
	mr.setMapperInstructions(getCSVString(mapperInstructions));
	// compute and set mapper memory requirements (for consistency of runtime piggybacking)
	if (jt == JobType.GMR) {
		double mem = 0;
		for (Lop n : execNodes)
			mem += computeFootprintInMapper(n);
		mr.setMemoryRequirements(mem);
	}
	if (jt == JobType.DATAGEN)
		mr.setRandInstructions(getCSVString(randInstructions));
	// set shuffle instructions
	mr.setShuffleInstructions(getCSVString(shuffleInstructions));
	// set reducer instructions
	mr.setAggregateInstructionsInReducer(getCSVString(aggInstructionsReducer));
	mr.setOtherInstructionsInReducer(getCSVString(otherInstructionsReducer));
	if (DMLScript.ENABLE_DEBUG_MODE) {
		// set line number information for each MR instruction
		mr.setMRJobInstructionsLineNumbers(MRJobLineNumbers);
	}
	/* Add the prepared instructions to output set */
	inst.addAll(variableInstructions);
	inst.add(mr);
	inst.addAll(postInstructions);
	deleteinst.addAll(renameInstructions);
	for (Lop l : inputLops) {
		if (DMLScript.ENABLE_DEBUG_MODE) {
			processConsumers(l, rmvarinst, deleteinst, l);
		} else {
			processConsumers(l, rmvarinst, deleteinst, null);
		}
	}
}
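
The trace logging above renders each output's format via OutputInfo.outputInfoToString. The same diagnostic pattern in isolation, assuming outputs and outputInfos populated as in this method, with LOG standing in for the class's logger:

// sketch: log each output path with its format name (LOG field assumed)
static void traceOutputs(ArrayList<String> outputs, ArrayList<OutputInfo> outputInfos) {
	for (int i = 0; i < outputs.size(); i++)
		LOG.trace(" Output Info: " + outputs.get(i) + ";" + OutputInfo.outputInfoToString(outputInfos.get(i)));
}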