Usage of org.apache.sysml.runtime.instructions.mr.CSVWriteInstruction in the Apache incubator-systemml project.
Class CSVWriteMapper, method configure.
/**
 * Configures the mapper and records, for every CSV write instruction found in the
 * job configuration, the instruction's output index under its input index in
 * {@code inputOutputMap}.
 *
 * @param job job configuration the CSV write instructions are read from
 * @throws RuntimeException wrapping any exception raised while reading or
 *         registering the instructions
 */
public void configure(JobConf job) {
    super.configure(job);
    try {
        for (CSVWriteInstruction instruction : MRJobConfiguration.getCSVWriteInstructions(job)) {
            ArrayList<Byte> targets = inputOutputMap.get(instruction.input);
            if (targets != null) {
                // input already known: just append the additional output index
                targets.add(instruction.output);
                continue;
            }
            // first instruction seen for this input: start a new output list
            targets = new ArrayList<Byte>();
            targets.add(instruction.output);
            inputOutputMap.put(instruction.input, targets);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Usage of org.apache.sysml.runtime.instructions.mr.CSVWriteInstruction in the Apache incubator-systemml project.
Class CSVWriteReducer, method configure.
/**
 * Configures the reducer: indexes the CSV write instructions by output tag,
 * allocates the per-tag bookkeeping arrays and fills them for every result index.
 *
 * <p>For each result index the rows of the corresponding input matrix are divided
 * into ranges of {@code ceil(rows / numReduceTasks)} rows; this task's range starts
 * at {@code rangeSize * taskID} and ends at the smaller of
 * {@code rangeSize * (taskID + 1)} and the total number of rows.
 *
 * @param job job configuration providing the instructions, the number of reduce
 *            tasks, the task id and the input matrix characteristics
 * @throws RuntimeException wrapping any exception raised while reading the
 *         CSV write instructions
 */
@Override
public void configure(JobConf job) {
    super.configure(job);

    // Index the CSV write instructions by output tag and track the largest tag.
    final HashMap<Byte, CSVWriteInstruction> byOutput = new HashMap<Byte, CSVWriteInstruction>();
    byte largestTag = 0;
    try {
        for (CSVWriteInstruction instruction : MRJobConfiguration.getCSVWriteInstructions(job)) {
            byOutput.put(instruction.output, instruction);
            if (largestTag < instruction.output) {
                largestTag = instruction.output;
            }
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    final int reducerCount = job.getNumReduceTasks();
    final int task = MapReduceTool.getUniqueTaskId(job);

    // The arrays are addressed directly by tag for efficiency; slots whose tag is
    // not a result index simply keep their default values.
    final int slots = largestTag + 1;
    rowIndexes = new long[slots];
    colIndexes = new long[slots];
    maxRowIndexes = new long[slots];
    minRowIndexes = new long[slots];
    numColBlocks = new long[slots];
    lastBlockNCols = new int[slots];
    colsPerBlock = new int[slots];
    delims = new String[slots];
    sparses = new boolean[slots];
    tagToResultIndex = new int[slots];

    for (int pos = 0; pos < resultIndexes.length; pos++) {
        final byte tag = resultIndexes[pos];
        tagToResultIndex[tag] = pos;

        final CSVWriteInstruction instruction = byOutput.get(tag);
        final MatrixCharacteristics mc =
            MRJobConfiguration.getMatrixCharacteristicsForInput(job, instruction.input);
        final long rows = mc.getRows();
        final long cols = mc.getCols();
        final int blockCols = mc.getColsPerBlock();

        // output formatting parameters
        delims[tag] = instruction.delim;
        sparses[tag] = instruction.sparse;

        // column block layout of the input matrix
        numColBlocks[tag] = (long) Math.ceil((double) cols / (double) blockCols);
        lastBlockNCols[tag] = (int) (cols % blockCols);
        colsPerBlock[tag] = blockCols;

        // row range handled by this reduce task
        final long rowsPerTask = (long) Math.ceil((double) rows / (double) reducerCount);
        final long firstRow = rowsPerTask * task;
        rowIndexes[tag] = firstRow;
        minRowIndexes[tag] = firstRow;
        maxRowIndexes[tag] = Math.min(rowsPerTask * (task + 1), rows);
        colIndexes[tag] = 0;
    }

    zeroBlock.setData(new MatrixBlock());
}
Usage of org.apache.sysml.runtime.instructions.mr.CSVWriteInstruction in the Apache incubator-systemml project.
Class WriteCSVMR, method runJob.
/**
 * Configures and runs the "WriteCSV-MR" MapReduce job, which writes the given input
 * matrices as CSV outputs according to the supplied CSV write instructions.
 *
 * @param inst                 job instruction; only used to print the complete
 *                             instruction when trace logging is enabled
 * @param inputs               input file names
 * @param inputInfos           format information of the inputs
 * @param rlens                number of rows per input
 * @param clens                number of columns per input
 * @param brlens               rows per block, per input
 * @param bclens               columns per block, per input
 * @param csvWriteInstructions serialized CSV write instructions
 * @param numReducers          not read by this method; the reducer count is derived
 *                             from the DML configuration and the input dimensions
 * @param replication          value stored under {@code MRConfigurationNames.DFS_REPLICATION}
 * @param resultIndexes        indexes of the result matrices
 * @param outputs              output file names
 * @return the output statistics (dimensions and non-zero counts), the output
 *         format information and the job's success flag
 * @throws Exception if job setup or execution fails
 */
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String csvWriteInstructions, int numReducers, int replication, byte[] resultIndexes, String[] outputs) throws Exception {
    JobConf job = new JobConf(WriteCSVMR.class);
    job.setJobName("WriteCSV-MR");

    // every input is addressed by its position in the inputs array
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++) {
        realIndexes[b] = b;
    }

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, true, ConvertTarget.CSVWRITE);
    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
    MRJobConfiguration.setCSVWriteInstructions(job, csvWriteInstructions);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION) {
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);
    }
    //set up custom map/reduce configurations
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    long maxRlen = 0;
    for (long rlen : rlens) {
        if (rlen > maxRlen) {
            maxRlen = rlen;
        }
    }
    // Clamp before narrowing: a plain (int) cast wraps around (possibly to a negative
    // value) for matrices with more than Integer.MAX_VALUE rows.
    int maxRlenAsInt = (int) Math.min(maxRlen, Integer.MAX_VALUE);
    //set up the number of reducers (according to output size)
    int numRed = determineNumReducers(rlens, clens, config.getIntValue(DMLConfig.NUM_REDUCERS), maxRlenAsInt);
    job.setNumReduceTasks(numRed);

    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
    OutputInfo[] outputInfos = new OutputInfo[outputs.length];
    // maps a result index to its position in the stats array
    HashMap<Byte, Integer> indexmap = new HashMap<Byte, Integer>();
    for (int i = 0; i < stats.length; i++) {
        indexmap.put(resultIndexes[i], i);
        resultDimsUnknown[i] = (byte) 0;
        stats[i] = new MatrixCharacteristics();
        outputInfos[i] = OutputInfo.CSVOutputInfo;
    }

    // each output takes over the row/column counts of the input it is written from
    CSVWriteInstruction[] ins = MRInstructionParser.parseCSVWriteInstructions(csvWriteInstructions);
    for (CSVWriteInstruction in : ins) {
        stats[indexmap.get(in.output)].set(rlens[in.input], clens[in.input], -1, -1);
    }

    // Print the complete instruction
    if (LOG.isTraceEnabled()) {
        inst.printCompleteMRJobInstruction(stats);
    }

    //set up what matrices are needed to pass from the mapper to reducer
    MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, "", "", csvWriteInstructions, resultIndexes);
    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, true);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVWriteMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(MatrixBlock.class);

    //configure reducer
    job.setReducerClass(CSVWriteReducer.class);
    job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class);
    job.setPartitionerClass(TaggedFirstSecondIndexes.FirstIndexRangePartitioner.class);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    RunningJob runjob = JobClient.runJob(job);

    /* Process different counters */
    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros, keyed by the position of the result
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }

    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Aggregations