Use of org.apache.hadoop.mapred.Counters.Group in project incubator-systemml by apache.
Class GroupedAggMR, method runJob.
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String grpAggInstructions, String simpleReduceInstructions, /*only scalar or reorg instructions allowed*/
int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
JobConf job = new JobConf(GroupedAggMR.class);
job.setJobName("GroupedAgg-MR");
//whether to use block representation or cell representation
//MRJobConfiguration.setMatrixValueClassForCM_N_COM(job, true);
MRJobConfiguration.setMatrixValueClass(job, false);
//added for handling recordreader instruction
String[] realinputs = inputs;
InputInfo[] realinputInfos = inputInfos;
long[] realrlens = rlens;
long[] realclens = clens;
int[] realbrlens = brlens;
int[] realbclens = bclens;
byte[] realIndexes = new byte[inputs.length];
for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
//set up the input files and their format information
MRJobConfiguration.setUpMultipleInputs(job, realIndexes, realinputs, realinputInfos, realbrlens, realbclens, true, ConvertTarget.WEIGHTEDCELL);
//set up the dimensions of input matrices
MRJobConfiguration.setMatricesDimensions(job, realIndexes, realrlens, realclens);
MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
//set up the block size
MRJobConfiguration.setBlocksSizes(job, realIndexes, realbrlens, realbclens);
//set up the grouped aggregate instructions that will happen in the combiner and reducer
MRJobConfiguration.setGroupedAggInstructions(job, grpAggInstructions);
//set up the instructions that will happen in the reducer, after the aggregation instructions
MRJobConfiguration.setInstructionsInReducer(job, simpleReduceInstructions);
//set up the number of reducers
MRJobConfiguration.setNumReducers(job, numReducers, numReducers);
//set up the replication factor for the results
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
//set up custom map/reduce configurations
DMLConfig config = ConfigurationManager.getDMLConfig();
MRJobConfiguration.setupCustomMRConfigurations(job, config);
//set up which matrices need to be passed from the mapper to the reducer
MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, null, grpAggInstructions, resultIndexes);
MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
for (int i = 0; i < resultIndexes.length; i++) stats[i] = new MatrixCharacteristics();
// Print the complete instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(stats);
byte[] resultDimsUnknown = new byte[resultIndexes.length];
// grouped-aggregate output dimensions are unknown at this point; they are determined at runtime via the dims files
for (int i = 0; i < resultIndexes.length; i++) resultDimsUnknown[i] = (byte) 2;
//set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, false);
// configure mapper and the mapper output key value pairs
job.setMapperClass(GroupedAggMRMapper.class);
job.setCombinerClass(GroupedAggMRCombiner.class);
job.setMapOutputKeyClass(TaggedMatrixIndexes.class);
job.setMapOutputValueClass(WeightedCell.class);
//configure reducer
job.setReducerClass(GroupedAggMRReducer.class);
//set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
//execute job
RunningJob runjob = JobClient.runJob(job);
//get important output statistics
Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
for (int i = 0; i < resultIndexes.length; i++) {
// number of non-zeros
stats[i] = new MatrixCharacteristics();
stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
}
String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
stats = MapReduceTool.processDimsFiles(dir, stats);
MapReduceTool.deleteFileIfExistOnHDFS(dir);
return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
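Each of the SystemML jobs in this listing reads its output statistics back through the same pattern: once JobClient.runJob returns, the per-output non-zero counts are fetched from a dedicated counter group, keyed by the result index as a string. A minimal standalone sketch of that readback, assuming an already configured JobConf and an illustrative group name (SystemML passes the MRJobConfiguration.NUM_NONZERO_CELLS constant instead):
import org.apache.hadoop.mapred.Counters.Group;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public class NonZeroCounterReadback {
    //illustrative group name; SystemML uses MRJobConfiguration.NUM_NONZERO_CELLS
    private static final String NNZ_GROUP = "nnz.counter.group";

    public static long[] runAndReadNonZeros(JobConf job, int numOutputs) throws Exception {
        //blocks until the MR job finishes
        RunningJob runjob = JobClient.runJob(job);
        //one counter per output matrix, named "0", "1", ... and incremented by the tasks
        Group group = runjob.getCounters().getGroup(NNZ_GROUP);
        long[] nnz = new long[numOutputs];
        for (int i = 0; i < numOutputs; i++)
            nnz[i] = group.getCounter(Integer.toString(i));
        return nnz;
    }
}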
Use of org.apache.hadoop.mapred.Counters.Group in project incubator-systemml by apache.
Class MMCJMR, method runJob.
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String instructionsInMapper, String aggInstructionsInReducer, String aggBinInstrction, int numReducers, int replication, String output, OutputInfo outputinfo) throws Exception {
JobConf job = new JobConf(MMCJMR.class);
// TODO: check w/ yuanyuan. This job always runs in blocked mode, and hence derivation is not necessary.
boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);
// by default, assume that dimensions of MMCJ's output are known at compile time
byte resultDimsUnknown = (byte) 0;
MatrixCharacteristics[] stats = commonSetup(job, inBlockRepresentation, inputs, inputInfos, rlens, clens, brlens, bclens, instructionsInMapper, aggInstructionsInReducer, aggBinInstrction, numReducers, replication, resultDimsUnknown, output, outputinfo);
// Print the complete instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(stats);
// There is always a single output
if (stats[0].getRows() == -1 || stats[0].getCols() == -1) {
resultDimsUnknown = (byte) 1;
// if the dimensions are unknown, then setup done in commonSetup() must be updated
byte[] resultIndexes = new byte[] { MRInstructionParser.parseSingleInstruction(aggBinInstrction).output };
byte[] resultDimsUnknown_Array = new byte[] { resultDimsUnknown };
//set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown_Array, new String[] { output }, new OutputInfo[] { outputinfo }, inBlockRepresentation);
}
AggregateBinaryInstruction ins = (AggregateBinaryInstruction) MRInstructionParser.parseSingleInstruction(aggBinInstrction);
MatrixCharacteristics dim1 = MRJobConfiguration.getMatrixCharactristicsForBinAgg(job, ins.input1);
MatrixCharacteristics dim2 = MRJobConfiguration.getMatrixCharactristicsForBinAgg(job, ins.input2);
if (dim1.getRowsPerBlock() > dim1.getRows())
dim1.setRowsPerBlock((int) dim1.getRows());
if (dim1.getColsPerBlock() > dim1.getCols())
dim1.setColsPerBlock((int) dim1.getCols());
if (dim2.getRowsPerBlock() > dim2.getRows())
dim2.setRowsPerBlock((int) dim2.getRows());
if (dim2.getColsPerBlock() > dim2.getCols())
dim2.setColsPerBlock((int) dim2.getCols());
long blockSize1 = 77 + 8 * dim1.getRowsPerBlock() * dim1.getColsPerBlock();
long blockSize2 = 77 + 8 * dim2.getRowsPerBlock() * dim2.getColsPerBlock();
long blockSizeResult = 77 + 8 * dim1.getRowsPerBlock() * dim2.getColsPerBlock();
long cacheSize = -1;
//cache the first result
if (dim1.getRows() < dim2.getCols()) {
long numBlocks = (long) Math.ceil((double) dim1.getRows() / (double) dim1.getRowsPerBlock());
cacheSize = numBlocks * (20 + blockSize1) + 32;
} else //cache the second result
{
long numBlocks = (long) Math.ceil((double) dim2.getCols() / (double) dim2.getColsPerBlock());
cacheSize = numBlocks * (20 + blockSize2) + 32;
}
//add known memory consumption (will be subtracted from output buffer)
cacheSize += 2 * Math.max(blockSize1, blockSize2)  //the cached key-value pair (plus input instance)
    + blockSizeResult                              //the cached single result
    + MRJobConfiguration.getMiscMemRequired(job);  //misc memory requirement by hadoop
MRJobConfiguration.setMMCJCacheSize(job, (int) cacheSize);
//set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
//run mmcj job
RunningJob runjob = JobClient.runJob(job);
/* Process different counters */
// NOTE: MMCJ job always has only a single output.
// Hence, no need to scan resultIndexes[] like other jobs
int outputIndex = 0;
Byte outputMatrixID = MRInstructionParser.parseSingleInstruction(aggBinInstrction).output;
Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
// number of non-zeros
stats[outputIndex].setNonZeros(group.getCounter(Byte.toString(outputMatrixID)));
return new JobReturn(stats[outputIndex], outputinfo, runjob.isSuccessful());
}
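The cache-size arithmetic above is easier to follow with concrete numbers: each dense block is estimated at 77 header bytes plus 8 bytes per cell, and the job caches whichever side (rows of the first input or columns of the second) yields fewer blocks. A small worked example with assumed dimensions (the 1000 x 1000 block size and matrix shapes below are illustrative, not values taken from any job):
public class MMCJCacheSizeEstimate {
    public static void main(String[] args) {
        //assumed: both inputs use 1000 x 1000 blocks, first input has 10,000 rows, second has 2,000 columns
        int rowsPerBlock = 1000, colsPerBlock = 1000;
        long rows1 = 10000, cols2 = 2000;
        //per-block estimate: 77 bytes of header plus 8 bytes per double cell (about 8 MB here)
        long blockSize2 = 77 + 8L * rowsPerBlock * colsPerBlock;
        //rows1 (10,000) is not smaller than cols2 (2,000), so the second input's column blocks are cached
        long numBlocks = (long) Math.ceil((double) cols2 / (double) colsPerBlock); //2 blocks
        long cacheSize = numBlocks * (20 + blockSize2) + 32; //about 16 MB before the extra terms are added
        System.out.println("estimated MMCJ cache size: " + cacheSize + " bytes");
    }
}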
Use of org.apache.hadoop.mapred.Counters.Group in project incubator-systemml by apache.
Class CSVReblockMR, method runCSVReblockJob.
private static JobReturn runCSVReblockJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String reblockInstructions, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos, Path counterFile, String[] smallestFiles) throws Exception {
JobConf job;
job = new JobConf(ReblockMR.class);
job.setJobName("CSV-Reblock-MR");
byte[] realIndexes = new byte[inputs.length];
for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
//set up the input files and their format information
MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.CELL);
job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);
//set up the dimensions of input matrices
MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
//set up the block size
MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
//set up the csv reblock instructions that will happen in the mapper and reducer
MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);
//set up the instructions that will happen in the reducer, after the aggregation instructions
MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
//set up the replication factor for the results
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
//set up preferred custom serialization framework for binary block format
if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
MRJobConfiguration.addBinaryBlockSerializationFramework(job);
//set up custom map/reduce configurations
DMLConfig config = ConfigurationManager.getDMLConfig();
MRJobConfiguration.setupCustomMRConfigurations(job, config);
//set up which matrices need to be passed from the mapper to the reducer
HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null, reblockInstructions, null, otherInstructionsInReducer, resultIndexes);
MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null, reblockInstructions, null, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
MatrixCharacteristics[] stats = ret.stats;
//set up the number of reducers
int numRed = WriteCSVMR.determineNumReducers(rlens, clens, config.getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups);
job.setNumReduceTasks(numRed);
// Print the complete instruction
//if (LOG.isTraceEnabled())
//  inst.printCompleteMRJobInstruction(stats);
// Update resultDimsUnknown based on computed "stats"
byte[] resultDimsUnknown = new byte[resultIndexes.length];
for (int i = 0; i < resultIndexes.length; i++) {
if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
resultDimsUnknown[i] = (byte) 1;
} else {
resultDimsUnknown[i] = (byte) 0;
}
}
//set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, true);
// configure mapper and the mapper output key value pairs
job.setMapperClass(CSVReblockMapper.class);
job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
job.setMapOutputValueClass(BlockRow.class);
//configure reducer
job.setReducerClass(CSVReblockReducer.class);
//turn off adaptivemr
job.setBoolean("adaptivemr.map.enable", false);
//set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
Path cachefile = new Path(counterFile, "part-00000");
DistributedCache.addCacheFile(cachefile.toUri(), job);
DistributedCache.createSymlink(job);
job.set(ROWID_FILE_NAME, cachefile.toString());
RunningJob runjob = JobClient.runJob(job);
MapReduceTool.deleteFileIfExistOnHDFS(counterFile, job);
/* Process different counters */
Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
for (int i = 0; i < resultIndexes.length; i++) {
// number of non-zeros
stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
// System.out.println("result #"+resultIndexes[i]+" ===>\n"+stats[i]);
}
return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Use of org.apache.hadoop.mapred.Counters.Group in project incubator-systemml by apache.
Class CSVReblockMR, method runAssignRowIDMRJob.
public static AssignRowIDMRReturn runAssignRowIDMRJob(String[] inputs, InputInfo[] inputInfos, int[] brlens, int[] bclens, String reblockInstructions, int replication, String[] smallestFiles, boolean transform, String naStrings, String spec) throws Exception {
AssignRowIDMRReturn ret = new AssignRowIDMRReturn();
JobConf job;
job = new JobConf(CSVReblockMR.class);
job.setJobName("Assign-RowID-MR");
byte[] realIndexes = new byte[inputs.length];
for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
//set up the input files and their format information
MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.CELL);
job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);
//set up the csv reblock instructions that will happen in the mapper and reducer
MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);
//set up the replication factor for the results
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
//set up custom map/reduce configurations
DMLConfig config = ConfigurationManager.getDMLConfig();
MRJobConfiguration.setupCustomMRConfigurations(job, config);
//set up the number of reducers
job.setNumReduceTasks(1);
// Print the complete instruction
//if (LOG.isTraceEnabled())
//  inst.printCompleteMRJobInstruction();
// configure mapper and the mapper output key value pairs
job.setMapperClass(CSVAssignRowIDMapper.class);
job.setMapOutputKeyClass(ByteWritable.class);
job.setMapOutputValueClass(OffsetCount.class);
//configure reducer
job.setReducerClass(CSVAssignRowIDReducer.class);
//turn off adaptivemr
job.setBoolean("adaptivemr.map.enable", false);
//set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
//set up the output file
ret.counterFile = new Path(MRJobConfiguration.constructTempOutputFilename());
job.setOutputFormat(SequenceFileOutputFormat.class);
FileOutputFormat.setOutputPath(job, ret.counterFile);
job.setOutputKeyClass(ByteWritable.class);
job.setOutputValueClass(OffsetCount.class);
// setup properties relevant to transform
job.setBoolean(MRJobConfiguration.TF_TRANSFORM, transform);
if (transform) {
if (naStrings != null)
// Adding "dummy" string to handle the case of na_strings = ""
job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(naStrings));
job.set(MRJobConfiguration.TF_SPEC, spec);
}
RunningJob runjob = JobClient.runJob(job);
/* Process different counters */
Group rgroup = runjob.getCounters().getGroup(NUM_ROWS_IN_MATRIX);
Group cgroup = runjob.getCounters().getGroup(NUM_COLS_IN_MATRIX);
ret.rlens = new long[inputs.length];
ret.clens = new long[inputs.length];
for (int i = 0; i < inputs.length; i++) {
// number of rows and columns per input
ret.rlens[i] = rgroup.getCounter(Integer.toString(i));
ret.clens[i] = cgroup.getCounter(Integer.toString(i));
}
return ret;
}
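The row and column groups read back here are populated on the task side through the Reporter handed to each map call. A minimal producer-side sketch of that half of the pattern (the group names, the fixed input index "0", and the column handling are illustrative assumptions; SystemML's CSVAssignRowIDMapper uses its own constants and tracks the maximum column count rather than a sum):
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;

public class RowColCountingMapper extends MapReduceBase implements Mapper<LongWritable, Text, LongWritable, Text> {

    @Override
    public void map(LongWritable offset, Text line, OutputCollector<LongWritable, Text> out, Reporter reporter) throws IOException {
        //one counter per input index, keyed by its string form, mirroring the driver-side getCounter(Integer.toString(i))
        reporter.incrCounter("rows.in.matrix", "0", 1);
        //count the fields of this csv line; summing is only illustrative, a real mapper would track the maximum
        long cols = line.toString().split(",", -1).length;
        reporter.incrCounter("cols.in.matrix", "0", cols);
        out.collect(offset, line);
    }
}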
Use of org.apache.hadoop.mapred.Counters.Group in project hive by apache.
Class HiveHistoryImpl, method setTaskCounters.
@Override
public void setTaskCounters(String queryId, String taskId, Counters ctrs) {
String id = queryId + ":" + taskId;
QueryInfo ji = queryInfoMap.get(queryId);
StringBuilder sb1 = new StringBuilder("");
TaskInfo ti = taskInfoMap.get(id);
if ((ti == null) || (ctrs == null)) {
return;
}
StringBuilder sb = new StringBuilder("");
try {
boolean first = true;
for (Group group : ctrs) {
for (Counter counter : group) {
if (first) {
first = false;
} else {
sb.append(',');
}
sb.append(group.getDisplayName());
sb.append('.');
sb.append(counter.getDisplayName());
sb.append(':');
sb.append(counter.getCounter());
String tab = getRowCountTableName(counter.getDisplayName());
if (tab != null) {
if (sb1.length() > 0) {
sb1.append(",");
}
sb1.append(tab);
sb1.append('~');
sb1.append(counter.getCounter());
ji.rowCountMap.put(tab, counter.getCounter());
}
}
}
} catch (Exception e) {
LOG.warn(org.apache.hadoop.util.StringUtils.stringifyException(e));
}
if (sb1.length() > 0) {
taskInfoMap.get(id).hm.put(Keys.ROWS_INSERTED.name(), sb1.toString());
queryInfoMap.get(queryId).hm.put(Keys.ROWS_INSERTED.name(), sb1.toString());
}
if (sb.length() > 0) {
taskInfoMap.get(id).hm.put(Keys.TASK_COUNTERS.name(), sb.toString());
}
}
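The Hive example relies on the old mapred Counters object being iterable over its groups, and each group over its counters. A minimal standalone sketch of the same flattening into "group.counter:value" pairs (the class name is illustrative; the formatting mirrors what setTaskCounters builds above):
import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.Counters.Counter;
import org.apache.hadoop.mapred.Counters.Group;

public class CounterFormatter {
    public static String format(Counters ctrs) {
        StringBuilder sb = new StringBuilder();
        for (Group group : ctrs) { //Counters iterates over its groups
            for (Counter counter : group) { //each group iterates over its counters
                if (sb.length() > 0)
                    sb.append(',');
                sb.append(group.getDisplayName()).append('.');
                sb.append(counter.getDisplayName()).append(':');
                sb.append(counter.getCounter()); //getCounter() returns the long value
            }
        }
        return sb.toString();
    }
}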