Use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by Apache.
Class CSVReblockMR, method runAssignRowIDMRJob.
/**
 * Runs the Assign-RowID MR job: mappers scan the CSV inputs and emit per-split
 * offset counts; a single reducer aggregates them into a global row-id
 * assignment. Returns the path of the produced counter file together with the
 * row/column lengths discovered per input.
 *
 * @param inputs input file names, one per input matrix
 * @param inputInfos input format information per input
 * @param brlens block row sizes per input
 * @param bclens block column sizes per input
 * @param reblockInstructions CSV reblock instructions for combiner/reducer
 * @param replication HDFS replication factor for the job results
 * @param smallestFiles smallest file name per input (anchor for offset counting)
 * @param transform true if transform-related job properties must be configured
 * @param naStrings NA strings for transform (may be null)
 * @param spec transform specification (only consulted when transform is true)
 * @return result holder with counter file path and per-input rlens/clens
 * @throws Exception if the MR job submission or execution fails
 */
public static AssignRowIDMRReturn runAssignRowIDMRJob(String[] inputs, InputInfo[] inputInfos, int[] brlens, int[] bclens, String reblockInstructions, int replication, String[] smallestFiles, boolean transform, String naStrings, String spec) throws Exception {
    AssignRowIDMRReturn ret = new AssignRowIDMRReturn();
    JobConf job;
    job = new JobConf(CSVReblockMR.class);
    job.setJobName("Assign-RowID-MR");
    // one byte index per input: 0..n-1
    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.CELL);
    job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);
    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);
    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
    //set up custom map/reduce configurations
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);
    //set up the number of reducers: a single reducer is required so that one
    //task sees all offset counts and can assign globally consistent row ids
    job.setNumReduceTasks(1);
    // Print the complete instruction
    //if (LOG.isTraceEnabled())
    //inst.printCompleteMRJobInstruction();
    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVAssignRowIDMapper.class);
    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(OffsetCount.class);
    //configure reducer
    job.setReducerClass(CSVAssignRowIDReducer.class);
    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);
    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    //set up the output file: a temp sequence file holding the offset counts
    ret.counterFile = new Path(MRJobConfiguration.constructTempOutputFilename());
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, ret.counterFile);
    job.setOutputKeyClass(ByteWritable.class);
    job.setOutputValueClass(OffsetCount.class);
    // setup properties relevant to transform
    job.setBoolean(MRJobConfiguration.TF_TRANSFORM, transform);
    if (transform) {
        if (naStrings != null)
            // Adding "dummy" string to handle the case of na_strings = ""
            job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(naStrings));
        job.set(MRJobConfiguration.TF_SPEC, spec);
    }
    RunningJob runjob = JobClient.runJob(job);
    /* Process different counters */
    Group rgroup = runjob.getCounters().getGroup(NUM_ROWS_IN_MATRIX);
    Group cgroup = runjob.getCounters().getGroup(NUM_COLS_IN_MATRIX);
    ret.rlens = new long[inputs.length];
    ret.clens = new long[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        // row and column counts per input, read back from the job counters
        ret.rlens[i] = rgroup.getCounter(Integer.toString(i));
        ret.clens[i] = cgroup.getCounter(Integer.toString(i));
    }
    return ret;
}
Use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by Apache.
Class ParForDependencyAnalysisTest, method runTest.
/**
 * Parses and validates the given DML script, checking whether a
 * LanguageException (as raised by the parfor dependency analysis during
 * parse-tree validation) occurs as expected.
 *
 * @param scriptFilename name of the DML script (relative to HOME)
 * @param expectedException true iff a LanguageException is expected
 */
private void runTest(String scriptFilename, boolean expectedException) {
    boolean raisedException = false;
    try {
        // Tell the superclass about the name of this test, so that the superclass can
        // create temporary directories.
        int index = scriptFilename.lastIndexOf(".dml");
        String testName = scriptFilename.substring(0, index > 0 ? index : scriptFilename.length());
        TestConfiguration testConfig = new TestConfiguration(TEST_CLASS_DIR, testName, new String[] {});
        addTestConfiguration(testName, testConfig);
        loadTestConfiguration(testConfig);
        DMLConfig conf = new DMLConfig(getCurConfigFile().getPath());
        ConfigurationManager.setLocalConfig(conf);
        HashMap<String, String> argVals = new HashMap<>();
        //read script; StringBuilder avoids O(n^2) string concatenation per line
        StringBuilder dmlScript = new StringBuilder();
        try (BufferedReader in = new BufferedReader(new FileReader(HOME + scriptFilename))) {
            String line = null;
            while ((line = in.readLine()) != null)
                dmlScript.append(line).append('\n');
        }
        //parsing and dependency analysis
        ParserWrapper parser = ParserFactory.createParser(org.apache.sysml.api.mlcontext.ScriptType.DML);
        DMLProgram prog = parser.parse(DMLScript.DML_FILE_PATH_ANTLR_PARSER, dmlScript.toString(), argVals);
        DMLTranslator dmlt = new DMLTranslator(prog);
        dmlt.validateParseTree(prog);
    } catch (LanguageException ex) {
        //expected failure mode for scripts with parfor dependency problems
        raisedException = true;
        if (raisedException != expectedException)
            ex.printStackTrace();
    } catch (Exception ex2) {
        //any other exception is a hard test failure (cause preserved)
        ex2.printStackTrace();
        throw new RuntimeException(ex2);
        //Assert.fail( "Unexpected exception occured during test run." );
    }
    //check correctness
    Assert.assertEquals(expectedException, raisedException);
}
Use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by Apache.
Class RemoteDPParForMR, method runJob.
/**
 * Runs the ParFor data-partition-execute MR job (DPEMR): mappers partition the
 * input matrix according to the given partition format and the reducers execute
 * the provided CP program blocks over those partitions. Maintains compile/exec
 * statistics and cache counters, and cleans up the result file on failure.
 *
 * @param pfid parfor program block id (used to derive a unique job name)
 * @param itervar iteration variable name
 * @param matrixvar partitioned matrix variable name
 * @param program serialized CP program blocks executed in the reducers
 * @param resultFile HDFS file for serialized result variables
 * @param input partitioned input matrix
 * @param dpf data partition format
 * @param oi output info of the partitioned intermediates
 * @param tSparseCol true for transposed sparse column representation
 * @param enableCPCaching true to enable CP caching in the reducers
 * @param numReducers number of reducers
 * @param replication replication factor for job results
 * @return job return with success flag, task/iteration counts, and results
 * @throws DMLRuntimeException if job configuration, execution, or cleanup fails
 */
public static RemoteParForJobReturn runJob(long pfid, String itervar, String matrixvar, String program, //config params
String resultFile, //config params
MatrixObject input, //config params
PartitionFormat dpf, //config params
OutputInfo oi, //config params
boolean tSparseCol, //opt params
boolean enableCPCaching, //opt params
int numReducers, //opt params
int replication) throws DMLRuntimeException {
    RemoteParForJobReturn ret = null;
    String jobname = "ParFor-DPEMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    JobConf job;
    job = new JobConf(RemoteDPParForMR.class);
    job.setJobName(jobname + pfid);
    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();
    try {
        /////
        //configure the MR job
        //set arbitrary CP program blocks that will perform in the reducers
        MRJobConfiguration.setProgramBlocks(job, program);
        //enable/disable caching
        MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);
        //setup input matrix
        Path path = new Path(input.getFileName());
        long rlen = input.getNumRows();
        long clen = input.getNumColumns();
        int brlen = (int) input.getNumRowsPerBlock();
        int bclen = (int) input.getNumColumnsPerBlock();
        MRJobConfiguration.setPartitioningInfo(job, rlen, clen, brlen, bclen, InputInfo.BinaryBlockInputInfo, oi, dpf._dpf, dpf._N, input.getFileName(), itervar, matrixvar, tSparseCol);
        job.setInputFormat(InputInfo.BinaryBlockInputInfo.inputFormatClass);
        FileInputFormat.setInputPaths(job, path);
        //set mapper and reducers classes
        job.setMapperClass(DataPartitionerRemoteMapper.class);
        job.setReducerClass(RemoteDPParWorkerReducer.class);
        //set output format
        job.setOutputFormat(SequenceFileOutputFormat.class);
        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
        FileOutputFormat.setOutputPath(job, new Path(resultFile));
        //set the output key, value schema
        //parfor partitioning outputs (intermediates)
        job.setMapOutputKeyClass(LongWritable.class);
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            job.setMapOutputValueClass(PairWritableBlock.class);
        else if (oi == OutputInfo.BinaryCellOutputInfo)
            job.setMapOutputValueClass(PairWritableCell.class);
        else
            throw new DMLRuntimeException("Unsupported intermediate output info: " + oi);
        //parfor exec output
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(Text.class);
        //////
        //set optimization parameters
        //set the number of mappers and reducers
        job.setNumReduceTasks(numReducers);
        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);
        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);
        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
        //set up custom map/reduce configurations
        MRJobConfiguration.setupCustomMRConfigurations(job, config);
        //disable JVM reuse
        //-1 for unlimited
        job.setNumTasksToExecutePerJvm(1);
        //set the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
        //set the max number of retries per map task
        //note: currently disabled to use cluster config
        //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, max_retry);
        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);
        /////
        // execute the MR job
        RunningJob runjob = JobClient.runJob(job);
        // Process different counters
        Statistics.incrementNoOfExecutedMRJobs();
        Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
        int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
        int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
        //aggregate remote JIT/GC and caching statistics (only meaningful for
        //true distributed execution, hence the local-mode guard)
        if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
            Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
            Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
            Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
            Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
            CacheStatistics.incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
            CacheStatistics.incrementFSBuffHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
            CacheStatistics.incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
            CacheStatistics.incrementHDFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
            CacheStatistics.incrementFSBuffWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
            CacheStatistics.incrementFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
            CacheStatistics.incrementHDFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
            CacheStatistics.incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
            CacheStatistics.incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
            CacheStatistics.incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
            CacheStatistics.incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
        }
        // read all files of result variables and prepare for return
        LocalVariableMap[] results = readResultFile(job, resultFile);
        ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    } finally {
        // remove created files
        try {
            MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
        } catch (IOException ex) {
            throw new DMLRuntimeException(ex);
        }
    }
    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
    return ret;
}
Use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by Apache.
Class ResultMergeRemoteMR, method executeMerge.
/**
 * Executes a remote (MR-based) result merge: merges the given source files
 * (optionally against a compare file fname) into fnameNew, using an MR job
 * whose key/value classes and partitioning depend on the output format.
 *
 * @param fname compare file name, or null if no compare is required
 * @param fnameNew output file name of the merged result
 * @param srcFnames source file names to merge
 * @param ii input info of the source files
 * @param oi output info of the merged result
 * @param rlen number of rows
 * @param clen number of columns
 * @param brlen number of rows per block
 * @param bclen number of columns per block
 * @throws DMLRuntimeException if job configuration or execution fails
 */
@SuppressWarnings({ "unused", "deprecation" })
protected void executeMerge(String fname, String fnameNew, String[] srcFnames, InputInfo ii, OutputInfo oi, long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException {
    String jobname = "ParFor-RMMR";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    JobConf job = new JobConf(ResultMergeRemoteMR.class);
    job.setJobName(jobname + _pfid);
    //maintain dml script counters
    Statistics.incrementNoOfCompiledMRJobs();
    //warning for textcell/binarycell without compare
    boolean withCompare = (fname != null);
    if ((oi == OutputInfo.TextCellOutputInfo || oi == OutputInfo.BinaryCellOutputInfo) && !withCompare && ResultMergeLocalFile.ALLOW_COPY_CELLFILES)
        LOG.warn("Result merge for " + OutputInfo.outputInfoToString(oi) + " without compare can be realized more efficiently with LOCAL_FILE than REMOTE_MR.");
    try {
        Path pathCompare = null;
        Path pathNew = new Path(fnameNew);
        //configure the MR job
        if (withCompare) {
            FileSystem fs = IOUtilFunctions.getFileSystem(pathNew, job);
            pathCompare = new Path(fname).makeQualified(fs);
            MRJobConfiguration.setResultMergeInfo(job, pathCompare.toString(), ii, LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen, bclen);
        } else
            //note: fixed to pass brlen (not bclen twice) as the block row size
            MRJobConfiguration.setResultMergeInfo(job, "null", ii, LocalFileUtils.getWorkingDir(LocalFileUtils.CATEGORY_RESULTMERGE), rlen, clen, brlen, bclen);
        //set mappers, reducers, combiners
        job.setMapperClass(ResultMergeRemoteMapper.class);
        job.setReducerClass(ResultMergeRemoteReducer.class);
        if (oi == OutputInfo.TextCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(NullWritable.class);
            job.setOutputValueClass(Text.class);
        } else if (oi == OutputInfo.BinaryCellOutputInfo) {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixCell.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixCell.class);
        } else if (oi == OutputInfo.BinaryBlockOutputInfo) {
            //setup partitioning, grouping, sorting for composite key (old API)
            //partitioning
            job.setPartitionerClass(ResultMergeRemotePartitioning.class);
            //grouping
            job.setOutputValueGroupingComparator(ResultMergeRemoteGrouping.class);
            //sorting
            job.setOutputKeyComparatorClass(ResultMergeRemoteSorting.class);
            job.setMapOutputKeyClass(ResultMergeTaggedMatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
            job.setOutputKeyClass(MatrixIndexes.class);
            job.setOutputValueClass(MatrixBlock.class);
        }
        //set input format
        job.setInputFormat(ii.inputFormatClass);
        //set the input path: with compare, the compare file is prepended as input 0
        Path[] paths = null;
        if (withCompare) {
            paths = new Path[srcFnames.length + 1];
            paths[0] = pathCompare;
            for (int i = 1; i < paths.length; i++) paths[i] = new Path(srcFnames[i - 1]);
        } else {
            paths = new Path[srcFnames.length];
            for (int i = 0; i < paths.length; i++) paths[i] = new Path(srcFnames[i]);
        }
        FileInputFormat.setInputPaths(job, paths);
        //set output format
        job.setOutputFormat(oi.outputFormatClass);
        //set output path
        MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
        FileOutputFormat.setOutputPath(job, pathNew);
        //////
        //set optimization parameters
        //set the number of mappers and reducers
        //job.setNumMapTasks( _numMappers ); //use default num mappers
        long reducerGroups = _numReducers;
        if (oi == OutputInfo.BinaryBlockOutputInfo)
            reducerGroups = Math.max(rlen / brlen, 1) * Math.max(clen / bclen, 1);
        else
            //textcell/binarycell
            reducerGroups = Math.max((rlen * clen) / StagingFileUtils.CELL_BUFFER_SIZE, 1);
        job.setNumReduceTasks((int) Math.min(_numReducers, reducerGroups));
        //disable automatic tasks timeouts and speculative task exec
        job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
        job.setMapSpeculativeExecution(false);
        //set up preferred custom serialization framework for binary block format
        if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
            MRJobConfiguration.addBinaryBlockSerializationFramework(job);
        //set up custom map/reduce configurations
        DMLConfig config = ConfigurationManager.getDMLConfig();
        MRJobConfiguration.setupCustomMRConfigurations(job, config);
        //enables the reuse of JVMs (multiple tasks per MR task)
        if (_jvmReuse)
            //unlimited
            job.setNumTasksToExecutePerJvm(-1);
        //enables compression - not conclusive for different codecs (empirically good compression ratio, but significantly slower)
        //job.set(MRConfigurationNames.MR_MAP_OUTPUT_COMPRESS, "true");
        //job.set(MRConfigurationNames.MR_MAP_OUTPUT_COMPRESS_CODEC, "org.apache.hadoop.io.compress.GzipCodec");
        //set the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, _replication);
        //set the max number of retries per map task
        // disabled job-level configuration to respect cluster configuration
        // note: this refers to hadoop2, hence it never had effect on mr1
        //job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, _max_retry);
        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);
        /////
        // execute the MR job
        JobClient.runJob(job);
        //maintain dml script counters
        Statistics.incrementNoOfExecutedMRJobs();
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    if (DMLScript.STATISTICS) {
        long t1 = System.nanoTime();
        Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
    }
}
Use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by Apache.
Class DataGenMR, method runJob.
/**
 * <p>Starts a Rand MapReduce job which will produce one or more random objects.</p>
 *
 * For each data-gen instruction, a small text "seed/descriptor" file is written
 * to HDFS (one line per output block); the mappers then generate the actual
 * block contents from these descriptors. Seed files are deleted on completion.
 *
 * @param inst MR job instruction
 * @param dataGenInstructions array of data gen instructions
 * @param instructionsInMapper instructions in mapper
 * @param aggInstructionsInReducer aggregate instructions in reducer
 * @param otherInstructionsInReducer other instructions in reducer
 * @param numReducers number of reducers
 * @param replication file replication
 * @param resultIndexes result indexes for each random object
 * @param dimsUnknownFilePrefix file path prefix when dimensions unknown
 * @param outputs output file for each random object
 * @param outputInfos output information for each random object
 * @return matrix characteristics for each random object
 * @throws Exception if Exception occurs
 */
public static JobReturn runJob(MRJobInstruction inst, String[] dataGenInstructions, String instructionsInMapper, String aggInstructionsInReducer, String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes, String dimsUnknownFilePrefix, String[] outputs, OutputInfo[] outputInfos) throws Exception {
    JobConf job = new JobConf(DataGenMR.class);
    job.setJobName("DataGen-MR");
    //whether use block representation or cell representation
    MRJobConfiguration.setMatrixValueClass(job, true);
    byte[] realIndexes = new byte[dataGenInstructions.length];
    for (byte b = 0; b < realIndexes.length; b++) realIndexes[b] = b;
    String[] inputs = new String[dataGenInstructions.length];
    InputInfo[] inputInfos = new InputInfo[dataGenInstructions.length];
    long[] rlens = new long[dataGenInstructions.length];
    long[] clens = new long[dataGenInstructions.length];
    int[] brlens = new int[dataGenInstructions.length];
    int[] bclens = new int[dataGenInstructions.length];
    FileSystem fs = FileSystem.get(job);
    String dataGenInsStr = "";
    int numblocks = 0;
    int maxbrlen = -1, maxbclen = -1;
    double maxsparsity = -1;
    for (int i = 0; i < dataGenInstructions.length; i++) {
        dataGenInsStr = dataGenInsStr + Lop.INSTRUCTION_DELIMITOR + dataGenInstructions[i];
        MRInstruction mrins = MRInstructionParser.parseSingleInstruction(dataGenInstructions[i]);
        MRINSTRUCTION_TYPE mrtype = mrins.getMRInstructionType();
        DataGenMRInstruction genInst = (DataGenMRInstruction) mrins;
        rlens[i] = genInst.getRows();
        clens[i] = genInst.getCols();
        brlens[i] = genInst.getRowsInBlock();
        bclens[i] = genInst.getColsInBlock();
        maxbrlen = Math.max(maxbrlen, brlens[i]);
        maxbclen = Math.max(maxbclen, bclens[i]);
        if (mrtype == MRINSTRUCTION_TYPE.Rand) {
            //write one descriptor line per block: block indexes, block dims,
            //expected nnz, and a per-block seed derived from the global seed
            RandInstruction randInst = (RandInstruction) mrins;
            inputs[i] = LibMatrixDatagen.generateUniqueSeedPath(genInst.getBaseDir());
            maxsparsity = Math.max(maxsparsity, randInst.getSparsity());
            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));
                //for obj reuse and preventing repeated buffer re-allocations
                StringBuilder sb = new StringBuilder();
                //seed generation
                Well1024a bigrand = LibMatrixDatagen.setupSeedsForRand(randInst.getSeed());
                LongStream nnz = LibMatrixDatagen.computeNNZperBlock(rlens[i], clens[i], brlens[i], bclens[i], randInst.getSparsity());
                PrimitiveIterator.OfLong nnzIter = nnz.iterator();
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    for (long c = 0; c < clens[i]; c += bclens[i]) {
                        long curBlockColSize = Math.min(bclens[i], (clens[i] - c));
                        sb.append((r / brlens[i]) + 1);
                        sb.append(',');
                        sb.append((c / bclens[i]) + 1);
                        sb.append(',');
                        sb.append(curBlockRowSize);
                        sb.append(',');
                        sb.append(curBlockColSize);
                        sb.append(',');
                        sb.append(nnzIter.nextLong());
                        sb.append(',');
                        sb.append(bigrand.nextLong());
                        pw.println(sb.toString());
                        sb.setLength(0);
                        numblocks++;
                    }
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else if (mrtype == MRINSTRUCTION_TYPE.Seq) {
            SeqInstruction seqInst = (SeqInstruction) mrins;
            inputs[i] = genInst.getBaseDir() + System.currentTimeMillis() + ".seqinput";
            //always dense
            maxsparsity = 1.0;
            double from = seqInst.fromValue;
            double to = seqInst.toValue;
            double incr = seqInst.incrValue;
            //handle default 1 to -1 for special case of from>to
            incr = LibMatrixDatagen.updateSeqIncr(from, to, incr);
            // Correctness checks on (from, to, incr)
            boolean neg = (from > to);
            if (incr == 0)
                throw new DMLRuntimeException("Invalid value for \"increment\" in seq().");
            if (neg != (incr < 0))
                throw new DMLRuntimeException("Wrong sign for the increment in a call to seq()");
            // Compute the number of rows in the sequence
            long numrows = 1 + (long) Math.floor((to - from) / incr);
            if (rlens[i] > 0) {
                if (numrows != rlens[i])
                    throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Expected number of rows does not match given number: " + rlens[i] + " != " + numrows);
            } else {
                rlens[i] = numrows;
            }
            if (clens[i] > 0 && clens[i] != 1)
                throw new DMLRuntimeException("Unexpected error while processing sequence instruction. Number of columns (" + clens[i] + ") must be equal to 1.");
            else
                clens[i] = 1;
            PrintWriter pw = null;
            try {
                pw = new PrintWriter(fs.create(new Path(inputs[i])));
                StringBuilder sb = new StringBuilder();
                double temp = from;
                double block_from, block_to;
                for (long r = 0; r < rlens[i]; r += brlens[i]) {
                    long curBlockRowSize = Math.min(brlens[i], (rlens[i] - r));
                    // block (bid_i,bid_j) generates a sequence from the interval [block_from, block_to] (inclusive of both end points of the interval)
                    long bid_i = ((r / brlens[i]) + 1);
                    long bid_j = 1;
                    block_from = temp;
                    block_to = temp + (curBlockRowSize - 1) * incr;
                    // next block starts from here
                    temp = block_to + incr;
                    sb.append(bid_i);
                    sb.append(',');
                    sb.append(bid_j);
                    sb.append(',');
                    sb.append(block_from);
                    sb.append(',');
                    sb.append(block_to);
                    sb.append(',');
                    sb.append(incr);
                    pw.println(sb.toString());
                    sb.setLength(0);
                    numblocks++;
                }
            } finally {
                IOUtilFunctions.closeSilently(pw);
            }
            inputInfos[i] = InputInfo.TextCellInputInfo;
        } else {
            throw new DMLRuntimeException("Unexpected Data Generation Instruction Type: " + mrtype);
        }
    }
    //remove the first ","
    dataGenInsStr = dataGenInsStr.substring(1);
    RunningJob runjob;
    MatrixCharacteristics[] stats;
    try {
        //set up the block size (note: previously configured twice; once suffices)
        MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);
        //set up the input files and their format information
        MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false, ConvertTarget.BLOCK);
        //set up the dimensions of input matrices
        MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
        MRJobConfiguration.setDimsUnknownFilePrefix(job, dimsUnknownFilePrefix);
        //set up the rand Instructions
        MRJobConfiguration.setRandInstructions(job, dataGenInsStr);
        //set up unary instructions that will perform in the mapper
        MRJobConfiguration.setInstructionsInMapper(job, instructionsInMapper);
        //set up the aggregate instructions that will happen in the combiner and reducer
        MRJobConfiguration.setAggregateInstructions(job, aggInstructionsInReducer);
        //set up the instructions that will happen in the reducer, after the aggregation instructions
        MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);
        //set up the replication factor for the results
        job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
        //set up map/reduce memory configurations (if in AM context)
        DMLConfig config = ConfigurationManager.getDMLConfig();
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
        //set up custom map/reduce configurations
        MRJobConfiguration.setupCustomMRConfigurations(job, config);
        //determine degree of parallelism (nmappers: 1<=n<=capacity)
        //TODO use maxsparsity whenever we have a way of generating sparse rand data
        int capacity = InfrastructureAnalyzer.getRemoteParallelMapTasks();
        long dfsblocksize = InfrastructureAnalyzer.getHDFSBlockSize();
        //correction max number of mappers on yarn clusters
        if (InfrastructureAnalyzer.isYarnEnabled())
            capacity = (int) Math.max(capacity, YarnClusterAnalyzer.getNumCores());
        int nmapers = Math.max(Math.min((int) (8 * maxbrlen * maxbclen * (long) numblocks / dfsblocksize), capacity), 1);
        job.setNumMapTasks(nmapers);
        //set up what matrices are needed to pass from the mapper to reducer
        HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, otherInstructionsInReducer, resultIndexes);
        MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, dataGenInsStr, instructionsInMapper, null, aggInstructionsInReducer, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes, false);
        stats = ret.stats;
        //set up the number of reducers
        MRJobConfiguration.setNumReducers(job, ret.numReducerGroups, numReducers);
        // print the complete MRJob instruction
        if (LOG.isTraceEnabled())
            inst.printCompleteMRJobInstruction(stats);
        // Update resultDimsUnknown based on computed "stats"
        byte[] resultDimsUnknown = new byte[resultIndexes.length];
        for (int i = 0; i < resultIndexes.length; i++) {
            if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
                resultDimsUnknown[i] = (byte) 1;
            } else {
                resultDimsUnknown[i] = (byte) 0;
            }
        }
        boolean mayContainCtable = instructionsInMapper.contains("ctabletransform") || instructionsInMapper.contains("groupedagg");
        //set up the multiple output files, and their format information
        MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true, mayContainCtable);
        // configure mapper and the mapper output key value pairs
        job.setMapperClass(DataGenMapper.class);
        if (numReducers == 0) {
            job.setMapOutputKeyClass(Writable.class);
            job.setMapOutputValueClass(Writable.class);
        } else {
            job.setMapOutputKeyClass(MatrixIndexes.class);
            job.setMapOutputValueClass(TaggedMatrixBlock.class);
        }
        //set up combiner
        if (numReducers != 0 && aggInstructionsInReducer != null && !aggInstructionsInReducer.isEmpty())
            job.setCombinerClass(GMRCombiner.class);
        //configure reducer
        job.setReducerClass(GMRReducer.class);
        //job.setReducerClass(PassThroughReducer.class);
        // By default, the job executes in "cluster" mode.
        // Determine if we can optimize and run it in "local" mode.
        MatrixCharacteristics[] inputStats = new MatrixCharacteristics[inputs.length];
        for (int i = 0; i < inputs.length; i++) {
            inputStats[i] = new MatrixCharacteristics(rlens[i], clens[i], brlens[i], bclens[i]);
        }
        //set unique working dir
        MRJobConfiguration.setUniqueWorkingDir(job);
        runjob = JobClient.runJob(job);
        /* Process different counters */
        Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
        for (int i = 0; i < resultIndexes.length; i++) {
            // number of non-zeros
            stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        }
        String dir = dimsUnknownFilePrefix + "/" + runjob.getID().toString() + "_dimsFile";
        stats = MapReduceTool.processDimsFiles(dir, stats);
        MapReduceTool.deleteFileIfExistOnHDFS(dir);
    } finally {
        //clean up the temporary seed/descriptor input files in any case
        for (String input : inputs) MapReduceTool.deleteFileIfExistOnHDFS(new Path(input), job);
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}
Aggregations