Use of org.apache.hadoop.mapred.RunningJob in project incubator-systemml by apache.
The class MMRJMR, method runJob:
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String instructionsInMapper, String aggInstructionsInReducer, String aggBinInstrctions, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception {
JobConf job = new JobConf(MMRJMR.class);
job.setJobName("MMRJ-MR");
if (numReducers <= 0)
throw new Exception("MMRJ-MR has to have at least one reduce task!");
// TODO: check w/ yuanyuan. This job always runs in blocked mode, and hence derivation is not necessary.
boolean inBlockRepresentation = MRJobConfiguration.deriveRepresentation(inputInfos);
// whether to use block or cell representation
MRJobConfiguration.setMatrixValueClass(job, inBlockRepresentation);
byte[] realIndexes = new byte[inputs.length];
for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
// set up the input files and their format information
MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, true, inBlockRepresentation ? ConvertTarget.BLOCK : ConvertTarget.CELL);
// set up the dimensions of input matrices
MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
// set up the block size
MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
// set up the unary instructions that will be performed in the mapper
MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
// set up the aggregate instructions that will happen in the combiner and reducer
MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
// set up the aggregate binary operation for the mmrj job
MRJobConfiguration.setAggregateBinaryInstructions(job, aggBinInstrctions);
// set up the instructions that will happen in the reducer, after the aggregation instructions
MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
// set up the replication factor for the results
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
// set up map/reduce memory configurations (if in AM context)
DMLConfig config = ConfigurationManager.getDMLConfig();
DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
// set up custom map/reduce configurations
MRJobConfiguration.setupCustomMRConfigurations(job, config);
// byte[] resultIndexes=new byte[]{AggregateBinaryInstruction.parseMRInstruction(aggBinInstrction).output};
// set up which matrices need to be passed from the mapper to the reducer
HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, aggBinInstrctions, resultIndexes);
MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, aggInstructionsInReducer, aggBinInstrctions, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
MatrixCharacteristics[] stats = ret.stats;
// set up the number of reducers
MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
// Print the complete instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(stats);
byte[] dimsUnknown = new byte[resultIndexes.length];
for (int i = 0; i < resultIndexes.length; i++) {
if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
dimsUnknown[i] = (byte) 1;
} else {
dimsUnknown[i] = (byte) 0;
}
}
// set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, dimsUnknown, outputs, outputInfos, inBlockRepresentation);
// configure mapper
job.setMapperClass(MMRJMRMapper.class);
job.setMapOutputKeyClass(TripleIndexes.class);
if (inBlockRepresentation)
job.setMapOutputValueClass(TaggedMatrixBlock.class);
else
job.setMapOutputValueClass(TaggedMatrixCell.class);
job.setOutputKeyComparatorClass(TripleIndexes.Comparator.class);
job.setPartitionerClass(TripleIndexes.FirstTwoIndexesPartitioner.class);
// configure combiner
// TODO: cannot set up combiner, because it will destroy the stable numerical algorithms
// for sum or for central moments
// if(aggInstructionsInReducer!=null && !aggInstructionsInReducer.isEmpty())
// job.setCombinerClass(MMCJMRCombiner.class);
// configure reducer
job.setReducerClass(MMRJMRReducer.class);
// By default, the job executes in "cluster" mode.
// Determine if we can optimize and run it in "local" mode.
MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
for (int i = 0; i < inputs.length; i++) {
inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
}
// set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
RunningJob runjob = JobClient.runJob(job);
/* Process different counters */
Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
for (int i = 0; i < resultIndexes.length; i++) {
// number of non-zeros
stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
}
return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
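Each runJob variant on this page follows the same submit-and-inspect pattern: build a JobConf, submit it synchronously with JobClient.runJob, then read per-output counters off the returned RunningJob. A condensed, self-contained sketch of just that pattern is shown below; the class name CounterReadExample and the counter group name "nonzero.cells" are illustrative assumptions, not SystemML code.

import org.apache.hadoop.mapred.Counters;
import org.apache.hadoop.mapred.Counters.Group;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;

public class CounterReadExample {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf(CounterReadExample.class);
        job.setJobName("counter-read-example");
        // ... input/output formats, mapper/reducer classes, and paths would be configured here ...

        // runJob blocks until the job finishes and returns a RunningJob handle
        RunningJob runjob = JobClient.runJob(job);

        // counters are organized in named groups; SystemML keeps one counter per result index
        Counters counters = runjob.getCounters();
        Group group = counters.getGroup("nonzero.cells"); // illustrative group name
        for (int i = 0; i < 2; i++) {
            long nnz = group.getCounter(Integer.toString(i));
            System.out.println("output " + i + ": " + nnz + " non-zeros");
        }
        System.out.println("job successful: " + runjob.isSuccessful());
    }
}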
Use of org.apache.hadoop.mapred.RunningJob in project incubator-systemml by apache.
The class ReblockMR, method runJob:
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, long[] nnz, String instructionsInMapper, String reblockInstructions, String otherInstructionsInReducer, int numReducers, int replication, boolean jvmReuse, byte[] resultIndexes, String[] outputs, OutputInfo[] outputInfos) throws Exception {
JobConf job = new JobConf(ReblockMR.class);
job.setJobName("Reblock-MR");
byte[] realIndexes = new byte[inputs.length];
for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
// set up the input files and their format information
// (internally used input converters: text2bc for text, identity for binary inputs)
MRJobConfiguration.setUpMultipleInputsReblock(job, realIndexes, inputs, inputInfos, brlens, bclens);
// set up the dimensions of input matrices
MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens, nnz);
// set up the block size
MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
// set up the unary instructions that will be performed in the mapper
MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
// set up the reblock instructions that will happen in the reducer
MRJobConfiguration.setReblockInstructions(job, reblockInstructions);
// set up the instructions that will happen in the reducer, after the aggregation instructions
MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
// set up the replication factor for the results
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
// disable automatic tasks timeouts and speculative task exec
job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
job.setMapSpeculativeExecution(false);
// set up preferred custom serialization framework for binary block format
if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
MRJobConfiguration.addBinaryBlockSerializationFramework(job);
// set up custom map/reduce configurations
DMLConfig config = ConfigurationManager.getDMLConfig();
MRJobConfiguration.setupCustomMRConfigurations(job, config);
// enable jvm reuse (based on SystemML configuration)
if (jvmReuse)
job.setNumTasksToExecutePerJvm(-1);
// set up which matrices need to be passed from the mapper to the reducer
HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, instructionsInMapper, reblockInstructions, null, otherInstructionsInReducer, resultIndexes);
MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, instructionsInMapper, reblockInstructions, null, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
MatrixCharacteristics[] stats = ret.stats;
// set up the number of reducers (according to output size)
int numRed = determineNumReducers(rlens, clens, nnz, config.getIntValue(DMLConfig.NUM_REDUCERS), ret.numReducerGroups);
job.setNumReduceTasks(numRed);
// Print the complete instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(stats);
// Update resultDimsUnknown based on computed "stats"
byte[] resultDimsUnknown = new byte[resultIndexes.length];
for (int i = 0; i < resultIndexes.length; i++) {
if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
resultDimsUnknown[i] = (byte) 1;
} else {
resultDimsUnknown[i] = (byte) 0;
}
}
// set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, true);
// configure mapper and the mapper output key value pairs
job.setMapperClass(ReblockMapper.class);
// represent key offsets for block
job.setMapOutputKeyClass(MatrixIndexes.class);
// binary cell/block
job.setMapOutputValueClass(TaggedAdaptivePartialBlock.class);
// configure reducer
job.setReducerClass(ReblockReducer.class);
// By default, the job executes in "cluster" mode.
// Determine if we can optimize and run it in "local" mode.
// at this point, both reblock_binary and reblock_text are similar
MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
for (int i = 0; i < inputs.length; i++) {
inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
}
// set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
RunningJob runjob = JobClient.runJob(job);
/* Process different counters */
Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
for (int i = 0; i < resultIndexes.length; i++) {
stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
}
return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
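The non-zero counters read by the driver above are populated on the task side. In the old mapred API that is done through the Reporter handle passed to every map/reduce call; below is a minimal sketch of a reducer that bumps one counter per key. The group name, key/value types, and counting logic are illustrative assumptions and not the actual MMRJMRReducer or ReblockReducer code.

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class CountingReducer extends MapReduceBase
        implements Reducer<Text, LongWritable, Text, LongWritable> {

    @Override
    public void reduce(Text key, Iterator<LongWritable> values,
            OutputCollector<Text, LongWritable> out, Reporter reporter) throws IOException {
        long sum = 0;
        while (values.hasNext())
            sum += values.next().get();
        out.collect(key, new LongWritable(sum));
        // one counter per logical output; the driver later reads it via
        // runjob.getCounters().getGroup("nonzero.cells").getCounter(...)
        if (sum != 0)
            reporter.incrCounter("nonzero.cells", key.toString(), 1);
    }
}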
Use of org.apache.hadoop.mapred.RunningJob in project incubator-systemml by apache.
The class SortMR, method runJob:
@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String input, InputInfo inputInfo, long rlen, long clen, int brlen, int bclen, String combineInst, String sortInst, int numReducers, int replication, String output, OutputInfo outputInfo, boolean valueIsWeight) throws Exception {
boolean sortIndexes = getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes;
String tmpOutput = sortIndexes ? MRJobConfiguration.constructTempOutputFilename() : output;
JobConf job = new JobConf(SortMR.class);
job.setJobName("SortMR");
// setup partition file
String pfname = MRJobConfiguration.setUpSortPartitionFilename(job);
Path partitionFile = new Path(pfname);
URI partitionUri = new URI(partitionFile.toString());
// setup input/output paths
Path inputDir = new Path(input);
inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
FileInputFormat.setInputPaths(job, inputDir);
Path outpath = new Path(tmpOutput);
FileOutputFormat.setOutputPath(job, outpath);
MapReduceTool.deleteFileIfExistOnHDFS(outpath, job);
// set number of reducers (1 if local mode)
if (!InfrastructureAnalyzer.isLocalMode(job)) {
MRJobConfiguration.setNumReducers(job, numReducers, numReducers);
// on cp-side qpick instructions for quantile/iqm/median (~128MB)
if (!(getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes))
job.setNumReduceTasks((int) Math.max(job.getNumReduceTasks(), rlen / 10000000));
} else
// in case of local mode
job.setNumReduceTasks(1);
// setup input/output format
job.setInputFormat(SamplingSortMRInputFormat.class);
SamplingSortMRInputFormat.setTargetKeyValueClasses(job, (Class<? extends WritableComparable>) outputInfo.outputKeyClass, outputInfo.outputValueClass);
// setup instructions and meta information
if (combineInst != null && !combineInst.trim().isEmpty())
job.set(COMBINE_INSTRUCTION, combineInst);
job.set(SORT_INSTRUCTION, sortInst);
job.setBoolean(VALUE_IS_WEIGHT, valueIsWeight);
boolean desc = getSortInstructionDescending(sortInst);
job.setBoolean(SORT_DECREASING, desc);
MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen);
MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
int partitionWith0 = SamplingSortMRInputFormat.writePartitionFile(job, partitionFile);
// setup mapper/reducer/partitioner/output classes
if (getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes) {
MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
job.setOutputFormat(OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
job.setMapperClass(IndexSortMapper.class);
job.setReducerClass(IndexSortReducer.class);
job.setMapOutputKeyClass(!desc ? IndexSortComparable.class : IndexSortComparableDesc.class);
job.setMapOutputValueClass(LongWritable.class);
job.setOutputKeyClass(MatrixIndexes.class);
job.setOutputValueClass(MatrixBlock.class);
} else {
// default case: SORT w/wo weights
MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
job.setOutputFormat(CompactOutputFormat.class);
job.setMapperClass(ValueSortMapper.class);
job.setReducerClass(ValueSortReducer.class);
// double
job.setOutputKeyClass(outputInfo.outputKeyClass);
// int
job.setOutputValueClass(outputInfo.outputValueClass);
}
job.setPartitionerClass(TotalOrderPartitioner.class);
// setup distributed cache
DistributedCache.addCacheFile(partitionUri, job);
DistributedCache.createSymlink(job);
// setup replication factor
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
// set up custom map/reduce configurations
DMLConfig config = ConfigurationManager.getDMLConfig();
MRJobConfiguration.setupCustomMRConfigurations(job, config);
MatrixCharacteristics[] s = new MatrixCharacteristics[1];
s[0] = new MatrixCharacteristics(rlen, clen, brlen, bclen);
// Print the complete instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(s);
// set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
// run mr job
RunningJob runjob = JobClient.runJob(job);
Group group = runjob.getCounters().getGroup(NUM_VALUES_PREFIX);
numReducers = job.getNumReduceTasks();
// process final meta data
long[] counts = new long[numReducers];
long total = 0;
for (int i = 0; i < numReducers; i++) {
counts[i] = group.getCounter(Integer.toString(i));
total += counts[i];
}
// add missing 0s back to the results
long missing0s = 0;
if (total < rlen * clen) {
if (partitionWith0 < 0)
throw new RuntimeException("no partition contains 0, which is wrong!");
missing0s = rlen * clen - total;
counts[partitionWith0] += missing0s;
} else
partitionWith0 = -1;
if (sortIndexes) {
// run builtin job for shifting partially sorted blocks according to global offsets
// we do this in this custom form since it would not fit into the current structure
// of systemml to output two intermediates (partially sorted data, offsets) out of a
// single SortKeys lop
boolean success = runjob.isSuccessful();
if (success) {
success = runStitchupJob(tmpOutput, rlen, clen, brlen, bclen, counts, numReducers, replication, output);
}
MapReduceTool.deleteFileIfExistOnHDFS(tmpOutput);
MapReduceTool.deleteFileIfExistOnHDFS(pfname);
return new JobReturn(s[0], OutputInfo.BinaryBlockOutputInfo, success);
} else {
MapReduceTool.deleteFileIfExistOnHDFS(pfname);
return new JobReturn(s[0], counts, partitionWith0, missing0s, runjob.isSuccessful());
}
}
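SortMR relies on total-order partitioning: a partition file of split points is sampled up front (here via SamplingSortMRInputFormat.writePartitionFile), shipped to every task through the distributed cache, and consumed by a TotalOrderPartitioner so that reducer i only receives keys from its assigned range. The stock old-API classes provide the same wiring; the generic sketch below uses them directly, with illustrative sampler settings, paths, and key/value types. Whether SortMR uses Hadoop's TotalOrderPartitioner or its own variant is not visible from this listing.

import java.net.URI;

import org.apache.hadoop.filecache.DistributedCache;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.lib.InputSampler;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

public class TotalOrderSetup {
    public static void configure(JobConf job) throws Exception {
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // write split points derived from a random sample of the input keys
        Path partitionFile = new Path("/tmp/sortmr_example_partitions"); // illustrative path
        TotalOrderPartitioner.setPartitionFile(job, partitionFile);
        InputSampler.Sampler<Text, NullWritable> sampler =
            new InputSampler.RandomSampler<Text, NullWritable>(0.1, 1000, 10);
        InputSampler.writePartitionFile(job, sampler);

        // make the partition file available to every task via the distributed cache
        URI partitionUri = new URI(partitionFile.toString() + "#_partition.lst");
        DistributedCache.addCacheFile(partitionUri, job);
        DistributedCache.createSymlink(job);
    }
}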
Use of org.apache.hadoop.mapred.RunningJob in project incubator-systemml by apache.
The class WriteCSVMR, method runJob:
public static JobReturn runJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos, long[] rlens, long[] clens, int[] brlens, int[] bclens, String csvWriteInstructions, int numReducers, int replication, byte[] resultIndexes, String[] outputs) throws Exception {
JobConf job = new JobConf(WriteCSVMR.class);
job.setJobName("WriteCSV-MR");
// check for valid output dimensions
for (int i = 0; i < rlens.length; i++) if (rlens[i] == 0 || clens[i] == 0)
throw new IOException("Write of matrices with zero" + " rows or columns not supported (" + rlens[i] + "x" + clens[i] + ").");
byte[] realIndexes = new byte[inputs.length];
for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
// set up the input files and their format information
MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, true, ConvertTarget.CSVWRITE);
// set up the dimensions of input matrices
MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
// set up the block size
MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
MRJobConfiguration.setCSVWriteInstructions(job, csvWriteInstructions);
// set up the replication factor for the results
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
// set up preferred custom serialization framework for binary block format
if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
MRJobConfiguration.addBinaryBlockSerializationFramework(job);
// set up custom map/reduce configurations
DMLConfig config = ConfigurationManager.getDMLConfig();
MRJobConfiguration.setupCustomMRConfigurations(job, config);
long maxRlen = 0;
for (long rlen : rlens) if (rlen > maxRlen)
maxRlen = rlen;
// set up the number of reducers (according to output size)
int numRed = determineNumReducers(rlens, clens, config.getIntValue(DMLConfig.NUM_REDUCERS), (int) maxRlen);
job.setNumReduceTasks(numRed);
byte[] resultDimsUnknown = new byte[resultIndexes.length];
MatrixCharacteristics[] stats = new MatrixCharacteristics[resultIndexes.length];
OutputInfo[] outputInfos = new OutputInfo[outputs.length];
HashMap<Byte, Integer> indexmap = new HashMap<>();
for (int i = 0; i < stats.length; i++) {
indexmap.put(resultIndexes[i], i);
resultDimsUnknown[i] = (byte) 0;
stats[i] = new MatrixCharacteristics();
outputInfos[i] = OutputInfo.CSVOutputInfo;
}
CSVWriteInstruction[] ins = MRInstructionParser.parseCSVWriteInstructions(csvWriteInstructions);
for (CSVWriteInstruction in : ins) stats[indexmap.get(in.output)].set(rlens[in.input], clens[in.input], -1, -1);
// Print the complete instruction
if (LOG.isTraceEnabled())
inst.printCompleteMRJobInstruction(stats);
// set up which matrices need to be passed from the mapper to the reducer
MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, "", "", csvWriteInstructions, resultIndexes);
// set up the multiple output files, and their format information
MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, true);
// configure mapper and the mapper output key value pairs
job.setMapperClass(CSVWriteMapper.class);
job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
job.setMapOutputValueClass(MatrixBlock.class);
// configure reducer
job.setReducerClass(CSVWriteReducer.class);
job.setOutputKeyComparatorClass(TaggedFirstSecondIndexes.Comparator.class);
job.setPartitionerClass(TaggedFirstSecondIndexes.FirstIndexRangePartitioner.class);
// job.setOutputFormat(UnPaddedOutputFormat.class);
MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
for (int i = 0; i < inputs.length; i++) {
inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
}
// set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
RunningJob runjob = JobClient.runJob(job);
/* Process different counters */
Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
for (int i = 0; i < resultIndexes.length; i++) {
// number of non-zeros
stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
}
return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
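MRJobConfiguration.setUpMultipleOutputs is a SystemML helper whose implementation is not shown here. In the plain old mapred API, writing several result matrices from one job is typically wired up with MultipleOutputs, roughly as in the following sketch; the output names, formats, and key/value types are illustrative assumptions, and whether SystemML's helper uses MultipleOutputs internally is not confirmed by this listing.

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.MultipleOutputs;

public class MultiOutputSetup {
    public static void configure(JobConf job, String[] outputs) {
        // one named output per result index; reducers would write to them via
        // new MultipleOutputs(job).getCollector("result" + i, reporter)
        for (int i = 0; i < outputs.length; i++) {
            MultipleOutputs.addNamedOutput(job, "result" + i,
                TextOutputFormat.class, NullWritable.class, Text.class);
        }
    }
}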
Use of org.apache.hadoop.mapred.RunningJob in project systemml by apache.
The class RemoteDPParForMR, method runJob:
public static RemoteParForJobReturn runJob(long pfid, String itervar, String matrixvar, String program, // config params
String resultFile, // config params
MatrixObject input, // config params
PartitionFormat dpf, // config params
OutputInfo oi, // config params
boolean tSparseCol, // opt params
boolean enableCPCaching, // opt params
int numReducers, // opt params
int replication) {
RemoteParForJobReturn ret = null;
String jobname = "ParFor-DPEMR";
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
JobConf job;
job = new JobConf(RemoteDPParForMR.class);
job.setJobName(jobname + pfid);
// maintain dml script counters
Statistics.incrementNoOfCompiledMRJobs();
try {
// ///
// configure the MR job
// set the arbitrary CP program blocks that will be executed in the reducers
MRJobConfiguration.setProgramBlocks(job, program);
// enable/disable caching
MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);
// setup input matrix
Path path = new Path(input.getFileName());
long rlen = input.getNumRows();
long clen = input.getNumColumns();
int brlen = (int) input.getNumRowsPerBlock();
int bclen = (int) input.getNumColumnsPerBlock();
MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, InputInfo.BinaryBlockInputInfo, oi, dpf._dpf, dpf._N, input.getFileName(), itervar, matrixvar, tSparseCol);
job.setInputFormat(InputInfo.BinaryBlockInputInfo.inputFormatClass);
FileInputFormat.setInputPaths(job, path);
// set mapper and reducers classes
job.setMapperClass(DataPartitionerRemoteMapper.class);
job.setReducerClass(RemoteDPParWorkerReducer.class);
// set output format
job.setOutputFormat(SequenceFileOutputFormat.class);
// set output path
MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
FileOutputFormat.setOutputPath(job, new Path(resultFile));
// set the output key, value schema
// parfor partitioning outputs (intermediates)
job.setMapOutputKeyClass(LongWritable.class);
if (oi == OutputInfo.BinaryBlockOutputInfo)
job.setMapOutputValueClass(PairWritableBlock.class);
else if (oi == OutputInfo.BinaryCellOutputInfo)
job.setMapOutputValueClass(PairWritableCell.class);
else
throw new DMLRuntimeException("Unsupported intermrediate output info: " + oi);
// parfor exec output
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Text.class);
// ////
// set optimization parameters
// set the number of mappers and reducers
job.setNumReduceTasks(numReducers);
// disable automatic tasks timeouts and speculative task exec
job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
job.setMapSpeculativeExecution(false);
// set up preferred custom serialization framework for binary block format
if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
MRJobConfiguration.addBinaryBlockSerializationFramework(job);
// set up map/reduce memory configurations (if in AM context)
DMLConfig config = ConfigurationManager.getDMLConfig();
DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
// set up custom map/reduce configurations
MRJobConfiguration.setupCustomMRConfigurations(job, config);
// disable JVM reuse (1 task per JVM; a value of -1 would mean unlimited reuse)
job.setNumTasksToExecutePerJvm(1);
// set the replication factor for the results
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
// set the max number of retries per map task
// note: currently disabled to use cluster config
// job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, max_retry);
// set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
// ///
// execute the MR job
RunningJob runjob = JobClient.runJob(job);
// Process different counters
Statistics.incrementNoOfExecutedMRJobs();
Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
CacheStatistics.incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
CacheStatistics.incrementFSBuffHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
CacheStatistics.incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
CacheStatistics.incrementHDFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
CacheStatistics.incrementFSBuffWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
CacheStatistics.incrementFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
CacheStatistics.incrementHDFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
CacheStatistics.incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
CacheStatistics.incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
CacheStatistics.incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
CacheStatistics.incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
}
// read all files of result variables and prepare for return
LocalVariableMap[] results = readResultFile(job, resultFile);
ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
} catch (Exception ex) {
throw new DMLRuntimeException(ex);
} finally {
// remove created files
try {
MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
} catch (IOException ex) {
throw new DMLRuntimeException(ex);
}
}
if (DMLScript.STATISTICS) {
long t1 = System.nanoTime();
Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
}
return ret;
}
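readResultFile is another SystemML helper not shown in this listing; conceptually it reads back the LongWritable/Text pairs that RemoteDPParWorkerReducer wrote through SequenceFileOutputFormat. A minimal sketch of scanning such a result directory is given below; the path handling is an illustrative assumption, and the parsing of the Text payload into result variables is SystemML-internal and therefore not reproduced.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class ResultFileReader {
    public static void read(Configuration conf, String resultFile) throws Exception {
        FileSystem fs = FileSystem.get(conf);
        for (FileStatus part : fs.listStatus(new Path(resultFile))) {
            if (part.getPath().getName().startsWith("_"))
                continue; // skip _SUCCESS / _logs
            SequenceFile.Reader reader = new SequenceFile.Reader(fs, part.getPath(), conf);
            try {
                LongWritable key = new LongWritable();
                Text value = new Text();
                while (reader.next(key, value)) {
                    // key: result id, value: serialized result variable (format is SystemML-internal)
                    System.out.println(key.get() + " -> " + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}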