use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by apache.
the class RemoteParForMR method runJob.
public static RemoteParForJobReturn runJob(long pfid, String program, String taskFile, String resultFile, //inputs
MatrixObject colocatedDPMatrixObj, boolean enableCPCaching, int numMappers, int replication, //opt params
int max_retry, long minMem, boolean jvmReuse) //opt params
throws DMLRuntimeException {
RemoteParForJobReturn ret = null;
String jobname = "ParFor-EMR";
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
JobConf job = new JobConf(RemoteParForMR.class);
job.setJobName(jobname + pfid);
//maintain dml script counters
Statistics.incrementNoOfCompiledMRJobs();
try {
/////
//configure the MR job
//set arbitrary CP program blocks that will be executed in the mapper
MRJobConfiguration.setProgramBlocks(job, program);
//enable/disable caching
MRJobConfiguration.setParforCachingConfig(job, enableCPCaching);
//set mappers, reducers, combiners
//map-only
job.setMapperClass(RemoteParWorkerMapper.class);
//set input format (one split per row, NLineInputFormat default N=1)
if (ParForProgramBlock.ALLOW_DATA_COLOCATION && colocatedDPMatrixObj != null) {
job.setInputFormat(RemoteParForColocatedNLineInputFormat.class);
MRJobConfiguration.setPartitioningFormat(job, colocatedDPMatrixObj.getPartitionFormat());
MatrixCharacteristics mc = colocatedDPMatrixObj.getMatrixCharacteristics();
MRJobConfiguration.setPartitioningBlockNumRows(job, mc.getRowsPerBlock());
MRJobConfiguration.setPartitioningBlockNumCols(job, mc.getColsPerBlock());
MRJobConfiguration.setPartitioningFilename(job, colocatedDPMatrixObj.getFileName());
} else { //default case
job.setInputFormat(NLineInputFormat.class);
}
//set the input path and output path
FileInputFormat.setInputPaths(job, new Path(taskFile));
//set output format
job.setOutputFormat(SequenceFileOutputFormat.class);
//set output path
MapReduceTool.deleteFileIfExistOnHDFS(resultFile);
FileOutputFormat.setOutputPath(job, new Path(resultFile));
//set the output key, value schema
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Text.class);
//////
//set optimization parameters
//set the number of mappers and reducers
//numMappers
job.setNumMapTasks(numMappers);
job.setNumReduceTasks(0);
//job.setInt("mapred.map.tasks.maximum", 1); //system property
//job.setInt("mapred.tasktracker.tasks.maximum",1); //system property
//job.setInt("mapred.jobtracker.maxtasks.per.job",1); //system property
//set jvm memory size (if required)
String memKey = MRConfigurationNames.MR_CHILD_JAVA_OPTS;
if (minMem > 0 && minMem > InfrastructureAnalyzer.extractMaxMemoryOpt(job.get(memKey))) {
InfrastructureAnalyzer.setMaxMemoryOpt(job, memKey, minMem);
LOG.warn("Forcing '" + memKey + "' to -Xmx" + minMem / (1024 * 1024) + "M.");
}
//disable automatic tasks timeouts and speculative task exec
job.setInt(MRConfigurationNames.MR_TASK_TIMEOUT, 0);
job.setMapSpeculativeExecution(false);
//set up map/reduce memory configurations (if in AM context)
DMLConfig config = ConfigurationManager.getDMLConfig();
DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
//set up custom map/reduce configurations
MRJobConfiguration.setupCustomMRConfigurations(job, config);
//enables the reuse of JVMs (multiple tasks per MR task)
if (jvmReuse)
job.setNumTasksToExecutePerJvm(-1); //unlimited
//set sort io buffer to 8MB (avoid unnecessarily large io buffers, guarantee bounded memory consumption)
job.setInt(MRConfigurationNames.MR_TASK_IO_SORT_MB, 8);
//set the replication factor for the results
job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);
//set the max number of retries per map task
// disabled job-level configuration to respect cluster configuration
// note: this is a hadoop2 property, hence it never had an effect on mr1
//job.setInt(MRConfigurationNames.MR_MAP_MAXATTEMPTS, max_retry);
//set unique working dir
MRJobConfiguration.setUniqueWorkingDir(job);
/////
// execute the MR job
RunningJob runjob = JobClient.runJob(job);
// Process different counters
Statistics.incrementNoOfExecutedMRJobs();
Group pgroup = runjob.getCounters().getGroup(ParForProgramBlock.PARFOR_COUNTER_GROUP_NAME);
int numTasks = (int) pgroup.getCounter(Stat.PARFOR_NUMTASKS.toString());
int numIters = (int) pgroup.getCounter(Stat.PARFOR_NUMITERS.toString());
if (DMLScript.STATISTICS && !InfrastructureAnalyzer.isLocalMode()) {
Statistics.incrementJITCompileTime(pgroup.getCounter(Stat.PARFOR_JITCOMPILE.toString()));
Statistics.incrementJVMgcCount(pgroup.getCounter(Stat.PARFOR_JVMGC_COUNT.toString()));
Statistics.incrementJVMgcTime(pgroup.getCounter(Stat.PARFOR_JVMGC_TIME.toString()));
Group cgroup = runjob.getCounters().getGroup(CacheableData.CACHING_COUNTER_GROUP_NAME.toString());
CacheStatistics.incrementMemHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_MEM.toString()));
CacheStatistics.incrementFSBuffHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FSBUFF.toString()));
CacheStatistics.incrementFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_FS.toString()));
CacheStatistics.incrementHDFSHits((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_HITS_HDFS.toString()));
CacheStatistics.incrementFSBuffWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FSBUFF.toString()));
CacheStatistics.incrementFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_FS.toString()));
CacheStatistics.incrementHDFSWrites((int) cgroup.getCounter(CacheStatistics.Stat.CACHE_WRITES_HDFS.toString()));
CacheStatistics.incrementAcquireRTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQR.toString()));
CacheStatistics.incrementAcquireMTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_ACQM.toString()));
CacheStatistics.incrementReleaseTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_RLS.toString()));
CacheStatistics.incrementExportTime(cgroup.getCounter(CacheStatistics.Stat.CACHE_TIME_EXP.toString()));
}
// read all files of result variables and prepare for return
LocalVariableMap[] results = readResultFile(job, resultFile);
ret = new RemoteParForJobReturn(runjob.isSuccessful(), numTasks, numIters, results);
} catch (Exception ex) {
throw new DMLRuntimeException(ex);
} finally {
// remove created files
try {
MapReduceTool.deleteFileIfExistOnHDFS(new Path(taskFile), job);
MapReduceTool.deleteFileIfExistOnHDFS(new Path(resultFile), job);
} catch (IOException ex) {
throw new DMLRuntimeException(ex);
}
}
if (DMLScript.STATISTICS) {
long t1 = System.nanoTime();
Statistics.maintainCPHeavyHitters("MR-Job_" + jobname, t1 - t0);
}
return ret;
}
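The DMLConfig usage in this job setup reduces to two calls made against the cached global configuration while the JobConf is being populated. A minimal sketch of just that wiring, assuming the SystemML runtime classes are on the classpath; the wrapper class ParForJobConfigSketch is hypothetical and the import package names are taken from the usual SystemML source layout:

import org.apache.hadoop.mapred.JobConf;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration;
import org.apache.sysml.yarn.DMLAppMasterUtils;

public class ParForJobConfigSketch {
    //applies the DMLConfig-dependent steps of runJob to a fresh JobConf
    public static void applyDMLConfig(JobConf job) throws Exception {
        DMLConfig config = ConfigurationManager.getDMLConfig();
        //adjust remote map/reduce memory if running under a SystemML YARN application master
        DMLAppMasterUtils.setupMRJobRemoteMaxMemory(job, config);
        //propagate user-defined MR settings from the SystemML config into the JobConf
        MRJobConfiguration.setupCustomMRConfigurations(job, config);
    }
}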
use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by apache.
the class DataTransform method mrDataTransform.
/**
* Main method to create and/or apply transformation metadata using MapReduce.
*
* @param jobinst MR job instruction
* @param inputs array of input matrices
* @param shuffleInst shuffle instructions
* @param otherInst other instructions
* @param resultIndices byte array of result indices
* @param outputs array of output matrices
* @param numReducers number of reducers
* @param replication file replication factor
* @return MR job result
* @throws Exception if IOException occurs
*/
public static JobReturn mrDataTransform(MRJobInstruction jobinst, MatrixObject[] inputs, String shuffleInst, String otherInst, byte[] resultIndices, MatrixObject[] outputs, int numReducers, int replication) throws Exception {
String[] insts = shuffleInst.split(Instruction.INSTRUCTION_DELIM);
// Parse transform instruction (the first instruction) to obtain relevant fields
TransformOperands oprnds = new TransformOperands(insts[0], inputs[0]);
JobConf job = new JobConf(ConfigurationManager.getCachedJobConf());
// find the first file in alphabetical ordering of part files in directory inputPath
String smallestFile = CSVReblockMR.findSmallestFile(job, oprnds.inputPath);
// find column names
FileSystem fs = IOUtilFunctions.getFileSystem(smallestFile);
String headerLine = readHeaderLine(fs, oprnds.inputCSVProperties, smallestFile);
HashMap<String, Integer> colNamesToIds = processColumnNames(fs, oprnds.inputCSVProperties, headerLine, smallestFile);
String outHeader = getOutputHeader(fs, headerLine, oprnds);
int numColumns = colNamesToIds.size();
int numColumnsTf = 0;
long numRowsTf = 0;
ArrayList<Integer> csvoutputs = new ArrayList<Integer>();
ArrayList<Integer> bboutputs = new ArrayList<Integer>();
// divide output objects based on output format (CSV or BinaryBlock)
for (int i = 0; i < outputs.length; i++) {
if (outputs[i].getFileFormatProperties() != null && outputs[i].getFileFormatProperties().getFileFormat() == FileFormatProperties.FileFormat.CSV)
csvoutputs.add(i);
else
bboutputs.add(i);
}
boolean isCSV = (csvoutputs.size() > 0);
boolean isBB = (bboutputs.size() > 0);
String tmpPath = MRJobConfiguration.constructTempOutputFilename();
checkIfOutputOverlapsWithTxMtd(outputs, oprnds, isCSV, isBB, csvoutputs, bboutputs, fs);
JobReturn retCSV = null, retBB = null;
if (!oprnds.isApply) {
// build specification file with column IDs instead of column names
String specWithIDs = processSpecFile(fs, oprnds.inputPath, smallestFile, colNamesToIds, oprnds.inputCSVProperties, oprnds.spec);
// enable GC on colNamesToIds
colNamesToIds = null;
// Build transformation metadata, including recode maps, bin definitions, etc.
// Also, generate part offsets file (counters file), which is to be used in csv-reblock
String partOffsetsFile = MRJobConfiguration.constructTempOutputFilename();
numRowsTf = GenTfMtdMR.runJob(oprnds.inputPath, oprnds.txMtdPath, specWithIDs, smallestFile, partOffsetsFile, oprnds.inputCSVProperties, numColumns, replication, outHeader);
if (numRowsTf == 0)
throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
// store the spec with IDs as transformation metadata
MapReduceTool.writeStringToHDFS(specWithIDs, oprnds.txMtdPath + "/" + "spec.json");
numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
// Apply transformation metadata, and perform actual transformation
if (isCSV)
retCSV = ApplyTfCSVMR.runJob(oprnds.inputPath, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[csvoutputs.get(0)].getFileName(), partOffsetsFile, oprnds.inputCSVProperties, numColumns, replication, outHeader);
if (isBB) {
DMLConfig conf = ConfigurationManager.getDMLConfig();
int blockSize = conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE);
CSVReblockInstruction rblk = prepDummyReblockInstruction(oprnds.inputCSVProperties, blockSize);
AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { blockSize }, new int[] { blockSize }, rblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
if (ret1.rlens[0] == 0)
throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
retBB = ApplyTfBBMR.runJob(oprnds.inputPath, insts[1], otherInst, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[bboutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, numRowsTf, numColumns, numColumnsTf, replication, outHeader);
}
MapReduceTool.deleteFileIfExistOnHDFS(new Path(partOffsetsFile), job);
} else {
// enable GC on colNamesToIds
colNamesToIds = null;
// copy given transform metadata (applyTxPath) to specified location (txMtdPath)
MapReduceTool.deleteFileIfExistOnHDFS(new Path(oprnds.txMtdPath), job);
MapReduceTool.copyFileOnHDFS(oprnds.applyTxPath, oprnds.txMtdPath);
// path to specification file
String specWithIDs = (oprnds.spec != null) ? oprnds.spec : MapReduceTool.readStringFromHDFSFile(oprnds.txMtdPath + "/" + "spec.json");
numColumnsTf = getNumColumnsTf(fs, outHeader, oprnds.inputCSVProperties.getDelim(), oprnds.txMtdPath);
if (isCSV) {
DMLConfig conf = ConfigurationManager.getDMLConfig();
int blockSize = conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE);
CSVReblockInstruction rblk = prepDummyReblockInstruction(oprnds.inputCSVProperties, blockSize);
AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { blockSize }, new int[] { blockSize }, rblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
numRowsTf = ret1.rlens[0];
if (ret1.rlens[0] == 0)
throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
// Apply transformation metadata, and perform actual transformation
retCSV = ApplyTfCSVMR.runJob(oprnds.inputPath, specWithIDs, oprnds.applyTxPath, tmpPath, outputs[csvoutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, numColumns, replication, outHeader);
}
if (isBB) {
// compute part offsets file
CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(insts[1]);
CSVReblockInstruction newrblk = (CSVReblockInstruction) rblk.clone((byte) 0);
AssignRowIDMRReturn ret1 = CSVReblockMR.runAssignRowIDMRJob(new String[] { oprnds.inputPath }, new InputInfo[] { InputInfo.CSVInputInfo }, new int[] { newrblk.brlen }, new int[] { newrblk.bclen }, newrblk.toString(), replication, new String[] { smallestFile }, true, oprnds.inputCSVProperties.getNAStrings(), specWithIDs);
numRowsTf = ret1.rlens[0];
if (ret1.rlens[0] == 0)
throw new DMLRuntimeException(ERROR_MSG_ZERO_ROWS);
// apply transformation metadata, as well as reblock the resulting data
retBB = ApplyTfBBMR.runJob(oprnds.inputPath, insts[1], otherInst, specWithIDs, oprnds.txMtdPath, tmpPath, outputs[bboutputs.get(0)].getFileName(), ret1.counterFile.toString(), oprnds.inputCSVProperties, ret1.rlens[0], ret1.clens[0], numColumnsTf, replication, outHeader);
}
}
// copy auxiliary data (old and new header lines) from temporary location to txMtdPath
moveFilesFromTmp(fs, tmpPath, oprnds.txMtdPath);
// generate matrix metadata file for outputs
if (retCSV != null) {
retCSV.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf);
// use the same header as the input
CSVFileFormatProperties prop = new CSVFileFormatProperties(false, oprnds.inputCSVProperties.getDelim(), false, Double.NaN, null);
MapReduceTool.writeMetaDataFile(outputs[csvoutputs.get(0)].getFileName() + ".mtd", ValueType.DOUBLE, retCSV.getMatrixCharacteristics(0), OutputInfo.CSVOutputInfo, prop);
return retCSV;
}
if (retBB != null) {
retBB.getMatrixCharacteristics(0).setDimension(numRowsTf, numColumnsTf);
MapReduceTool.writeMetaDataFile(outputs[bboutputs.get(0)].getFileName() + ".mtd", ValueType.DOUBLE, retBB.getMatrixCharacteristics(0), OutputInfo.BinaryBlockOutputInfo);
return retBB;
}
return null;
}
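Both branches above read the binary-block size from the DMLConfig in the same way before building the dummy CSV reblock instruction. Isolated as a small sketch; the helper class BlockSizeLookup is hypothetical, and only the calls already shown above are used:

import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;

public class BlockSizeLookup {
    //mirrors the block-size lookup performed before prepDummyReblockInstruction
    public static int configuredBlockSize() {
        DMLConfig conf = ConfigurationManager.getDMLConfig();
        //DEFAULT_BLOCK_SIZE is the configured rows/cols-per-block for binary-block matrices
        return conf.getIntValue(DMLConfig.DEFAULT_BLOCK_SIZE);
    }
}

The returned value then parameterizes the dummy reblock instruction and the per-input brlen/bclen arrays passed to CSVReblockMR.runAssignRowIDMRJob.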
use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by apache.
the class DMLScript method launchDebugger.
/**
* Launcher for DML debugger. This method should be called after
* execution and debug properties have been correctly set, and customized parameters have been put into argVals.
*
* @param dmlScriptStr DML script contents (including new lines)
* @param fnameOptConfig Full path of configuration file for SystemML
* @param argVals Key-value pairs defining arguments of DML script
* @param scriptType type of script (DML or PyDML)
* @throws ParseException if ParseException occurs
* @throws IOException if IOException occurs
* @throws DMLRuntimeException if DMLRuntimeException occurs
* @throws DMLDebuggerException if DMLDebuggerException occurs
* @throws LanguageException if LanguageException occurs
* @throws HopsException if HopsException occurs
* @throws LopsException if LopsException occurs
*/
private static void launchDebugger(String dmlScriptStr, String fnameOptConfig, Map<String, String> argVals, ScriptType scriptType) throws ParseException, IOException, DMLRuntimeException, DMLDebuggerException, LanguageException, HopsException, LopsException {
DMLDebuggerProgramInfo dbprog = new DMLDebuggerProgramInfo();
//Step 1: parse configuration files
DMLConfig conf = DMLConfig.readConfigurationFile(fnameOptConfig);
ConfigurationManager.setGlobalConfig(conf);
//Step 2: parse dml script
ParserWrapper parser = ParserFactory.createParser(scriptType);
DMLProgram prog = parser.parse(DML_FILE_PATH_ANTLR_PARSER, dmlScriptStr, argVals);
//Step 3: construct HOP DAGs (incl LVA and validate)
DMLTranslator dmlt = new DMLTranslator(prog);
dmlt.liveVariableAnalysis(prog);
dmlt.validateParseTree(prog);
dmlt.constructHops(prog);
//Step 4: rewrite HOP DAGs (incl IPA and memory estimates)
dmlt.rewriteHopsDAG(prog);
//Step 5: construct LOP DAGs
dmlt.constructLops(prog);
//Step 6: generate runtime program
dbprog.rtprog = prog.getRuntimeProgram(conf);
try {
//set execution environment
initHadoopExecution(conf);
//initialize an instance of SystemML debugger
DMLDebugger SystemMLdb = new DMLDebugger(dbprog, dmlScriptStr);
//run SystemML debugger
SystemMLdb.runSystemMLDebugger();
} finally {
//cleanup scratch_space and all working dirs
cleanupHadoopExecution(conf);
}
}
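Step 1 above is the standard SystemML configuration bootstrap. A minimal sketch of just that part, assuming fnameOptConfig points to a valid SystemML-config.xml; the wrapper class DebuggerConfigBootstrap is hypothetical:

import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;

public class DebuggerConfigBootstrap {
    //parse the configuration file and make it globally visible, as in Step 1 of launchDebugger
    public static DMLConfig bootstrap(String fnameOptConfig) throws Exception {
        DMLConfig conf = DMLConfig.readConfigurationFile(fnameOptConfig);
        ConfigurationManager.setGlobalConfig(conf);
        return conf;
    }
}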
use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by apache.
the class ScriptExecutorUtils method executeRuntimeProgram.
/**
* Execute the runtime program. This involves execution of the program
* blocks that make up the runtime program and may involve dynamic
* recompilation.
*
* @param se
* script executor
* @param statisticsMaxHeavyHitters
* maximum number of statistics to print
* @throws DMLRuntimeException
* if exception occurs
*/
public static void executeRuntimeProgram(ScriptExecutor se, int statisticsMaxHeavyHitters) throws DMLRuntimeException {
Program prog = se.getRuntimeProgram();
ExecutionContext ec = se.getExecutionContext();
DMLConfig config = se.getConfig();
executeRuntimeProgram(prog, ec, config, statisticsMaxHeavyHitters);
}
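This overload simply unpacks the runtime program, execution context, and DMLConfig held by the ScriptExecutor and delegates to the lower-level overload. A hedged usage sketch; the ScriptExecutor is assumed to have been created and compiled elsewhere (e.g. through the MLContext API), the wrapper class RunCompiledScript is hypothetical, and the import package names are assumptions based on the SystemML source layout:

import org.apache.sysml.api.ScriptExecutorUtils;
import org.apache.sysml.api.mlcontext.ScriptExecutor;

public class RunCompiledScript {
    //se must already hold a compiled runtime program, an execution context, and its DMLConfig
    public static void run(ScriptExecutor se) throws Exception {
        ScriptExecutorUtils.executeRuntimeProgram(se, 10); //print at most 10 heavy-hitter statistics
    }
}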
use of org.apache.sysml.conf.DMLConfig in project incubator-systemml by apache.
the class ProgramConverter method parseParForBody.
////////////////////////////////
// PARSING
////////////////////////////////
public static ParForBody parseParForBody(String in, int id) throws DMLRuntimeException {
ParForBody body = new ParForBody();
//header elimination
String tmpin = in.replaceAll(NEWLINE, ""); //normalization
tmpin = tmpin.substring(PARFORBODY_BEGIN.length(), tmpin.length() - PARFORBODY_END.length()); //remove start/end
HierarchyAwareStringTokenizer st = new HierarchyAwareStringTokenizer(tmpin, COMPONENTS_DELIM);
//handle DMLScript UUID (NOTE: set directly in DMLScript)
//(master UUID is used for all nodes in order to simplify cleanup)
DMLScript.setUUID(st.nextToken());
//handle DML config (NOTE: set directly in ConfigurationManager)
String confStr = st.nextToken();
JobConf job = ConfigurationManager.getCachedJobConf();
if (!InfrastructureAnalyzer.isLocalMode(job)) {
if (confStr != null && !confStr.trim().isEmpty()) {
DMLConfig dmlconf = DMLConfig.parseDMLConfig(confStr);
CompilerConfig cconf = OptimizerUtils.constructCompilerConfig(dmlconf);
ConfigurationManager.setLocalConfig(dmlconf);
ConfigurationManager.setLocalConfig(cconf);
}
//init internal configuration w/ parsed or default config
ParForProgramBlock.initInternalConfigurations(ConfigurationManager.getDMLConfig());
}
//handle additional configs
String aconfs = st.nextToken();
parseAndSetAdditionalConfigurations(aconfs);
//handle program
String progStr = st.nextToken();
Program prog = parseProgram(progStr, id);
//handle result variable names
String rvarStr = st.nextToken();
ArrayList<String> rvars = parseStringArrayList(rvarStr);
body.setResultVarNames(rvars);
//handle execution context
String ecStr = st.nextToken();
ExecutionContext ec = parseExecutionContext(ecStr, prog);
//handle program blocks
String spbs = st.nextToken();
ArrayList<ProgramBlock> pbs = rParseProgramBlocks(spbs, prog, id);
body.setChildBlocks(pbs);
body.setEc(ec);
return body;
}
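The DMLConfig handling in the non-local branch above can be isolated as follows. A sketch assuming confStr is the serialized configuration token taken from the tokenizer; the wrapper class RemoteWorkerConfigInstall is hypothetical, the import package names are assumptions, and only calls already shown above are used:

import org.apache.sysml.conf.CompilerConfig;
import org.apache.sysml.conf.ConfigurationManager;
import org.apache.sysml.conf.DMLConfig;
import org.apache.sysml.hops.OptimizerUtils;

public class RemoteWorkerConfigInstall {
    //rebuild the worker-side DMLConfig and derived CompilerConfig from the serialized string
    public static void install(String confStr) throws Exception {
        if (confStr != null && !confStr.trim().isEmpty()) {
            DMLConfig dmlconf = DMLConfig.parseDMLConfig(confStr);
            CompilerConfig cconf = OptimizerUtils.constructCompilerConfig(dmlconf);
            ConfigurationManager.setLocalConfig(dmlconf);
            ConfigurationManager.setLocalConfig(cconf);
        }
    }
}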