Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.
The class PlusMultSPInstruction, method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) throws DMLRuntimeException {
SparkExecutionContext sec = (SparkExecutionContext) ec;
//pass the scalar
ScalarObject constant = (ScalarObject) ec.getScalarInput(input3.getName(), input3.getValueType(), input3.isLiteral());
((ValueFunctionWithConstant) ((BinaryOperator) _optr).fn).setConstant(constant.getDoubleValue());
super.processMatrixMatrixBinaryInstruction(sec);
}
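This is the simplest of the usages below: the generic ExecutionContext handed to processInstruction is downcast to SparkExecutionContext before any Spark-specific work is delegated. A minimal sketch of that pattern, assuming the incubator-systemml jar on the classpath (the helper class and method name here are hypothetical):
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;

public class SparkContextAccess {
    //hypothetical helper: the same downcast as above, with an explicit type check instead of a bare cast
    public static SparkExecutionContext asSparkContext(ExecutionContext ec) throws DMLRuntimeException {
        if (!(ec instanceof SparkExecutionContext))
            throw new DMLRuntimeException("Expected a SparkExecutionContext but got: " + ec.getClass().getName());
        return (SparkExecutionContext) ec;
    }
}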
Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.
The class ResultMergeRemoteSpark, method executeMerge.
@SuppressWarnings("unchecked")
protected RDDObject executeMerge(MatrixObject compare, MatrixObject[] inputs, String varname, long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException {
String jobname = "ParFor-RMSP";
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
SparkExecutionContext sec = (SparkExecutionContext) _ec;
boolean withCompare = (compare != null);
RDDObject ret = null;
//determine degree of parallelism
int numRed = (int) determineNumReducers(rlen, clen, brlen, bclen, _numReducers);
//sanity check for empty src files
if (inputs == null || inputs.length == 0)
throw new DMLRuntimeException("Execute merge should never be called with no inputs.");
try {
//note: initial implementation via union over all result rdds discarded due to
//stack overflow errors with many parfor tasks, and thus many rdds
//Step 1: construct input rdd from all result files of parfor workers
//a) construct job conf with all files
InputInfo ii = InputInfo.BinaryBlockInputInfo;
JobConf job = new JobConf(ResultMergeRemoteMR.class);
job.setJobName(jobname);
job.setInputFormat(ii.inputFormatClass);
Path[] paths = new Path[inputs.length];
for (int i = 0; i < paths.length; i++) {
//ensure input exists on hdfs (e.g., if in-memory or RDD)
inputs[i].exportData();
paths[i] = new Path(inputs[i].getFileName());
//update rdd handle to allow lazy evaluation by guarding
//against cleanup of temporary result files
setRDDHandleForMerge(inputs[i], sec);
}
FileInputFormat.setInputPaths(job, paths);
//b) create rdd from input files w/ deep copy of keys and blocks
JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = sec.getSparkContext().hadoopRDD(job, ii.inputFormatClass, ii.inputKeyClass, ii.inputValueClass).mapPartitionsToPair(new CopyBlockPairFunction(true), true);
//Step 2a: merge with compare
JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
if (withCompare) {
JavaPairRDD<MatrixIndexes, MatrixBlock> compareRdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sec.getRDDHandleForMatrixObject(compare, InputInfo.BinaryBlockInputInfo);
//merge values which differ from compare values
ResultMergeRemoteSparkWCompare cfun = new ResultMergeRemoteSparkWCompare();
out = rdd.groupByKey(numRed) //group all result blocks per key
.join(compareRdd) //join compare block and result blocks
.mapToPair(cfun); //merge result blocks w/ compare
} else { //Step 2b: merge without compare
//direct merge in any order (disjointness guaranteed)
out = RDDAggregateUtils.mergeByKey(rdd, false);
}
//Step 3: create output rdd handle w/ lineage
ret = new RDDObject(out, varname);
for (int i = 0; i < paths.length; i++) ret.addLineageChild(inputs[i].getRDDHandle());
if (withCompare)
ret.addLineageChild(compare.getRDDHandle());
} catch (Exception ex) {
throw new DMLRuntimeException(ex);
}
//maintain statistics
Statistics.incrementNoOfCompiledSPInst();
Statistics.incrementNoOfExecutedSPInst();
if (DMLScript.STATISTICS) {
Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
}
return ret;
}
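Both executeMerge above and partitionMatrix further below retrieve their matrix inputs as binary-block pair RDDs via getRDDHandleForMatrixObject. A minimal sketch of that lookup, assuming the package paths used in incubator-systemml and an existing MatrixObject from the symbol table:
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.sysml.runtime.DMLRuntimeException;
import org.apache.sysml.runtime.controlprogram.caching.MatrixObject;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;

public class RDDHandleSketch {
    @SuppressWarnings("unchecked")
    public static JavaPairRDD<MatrixIndexes, MatrixBlock> toBinaryBlockRDD(SparkExecutionContext sec, MatrixObject mo)
            throws DMLRuntimeException {
        //unchecked cast mirrors the compareRdd lookup in Step 2a above
        return (JavaPairRDD<MatrixIndexes, MatrixBlock>) sec.getRDDHandleForMatrixObject(mo, InputInfo.BinaryBlockInputInfo);
    }
}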
Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.
The class DMLScript, method execute.
// /////////////////////////////
// private internal interface
// (core compilation and execute)
// //////
/**
* The running body of DMLScript execution. This method should be called after execution properties have been correctly set,
* and customized parameters have been put into _argVals
*
* @param dmlScriptStr DML script string
* @param fnameOptConfig configuration file
* @param argVals map of argument values
* @param allArgs arguments
* @param scriptType type of script (DML or PyDML)
* @throws IOException if IOException occurs
*/
private static void execute(String dmlScriptStr, String fnameOptConfig, Map<String, String> argVals, String[] allArgs, ScriptType scriptType) throws IOException {
SCRIPT_TYPE = scriptType;
// print basic time and environment info
printStartExecInfo(dmlScriptStr);
// Step 1: parse configuration files & write any configuration specific global variables
DMLConfig dmlconf = DMLConfig.readConfigurationFile(fnameOptConfig);
ConfigurationManager.setGlobalConfig(dmlconf);
CompilerConfig cconf = OptimizerUtils.constructCompilerConfig(dmlconf);
ConfigurationManager.setGlobalConfig(cconf);
LOG.debug("\nDML config: \n" + dmlconf.getConfigInfo());
// Sets the GPUs to use for this process (a range, all GPUs, comma separated list or a specific GPU)
GPUContextPool.AVAILABLE_GPUS = dmlconf.getTextValue(DMLConfig.AVAILABLE_GPUS);
String evictionPolicy = dmlconf.getTextValue(DMLConfig.GPU_EVICTION_POLICY).toUpperCase();
try {
DMLScript.GPU_EVICTION_POLICY = EvictionPolicy.valueOf(evictionPolicy);
} catch (IllegalArgumentException e) {
throw new RuntimeException("Unsupported eviction policy:" + evictionPolicy);
}
// Step 2: set local/remote memory if requested (for compile in AM context)
if (dmlconf.getBooleanValue(DMLConfig.YARN_APPMASTER)) {
DMLAppMasterUtils.setupConfigRemoteMaxMemory(dmlconf);
}
// Step 3: parse dml script
Statistics.startCompileTimer();
ParserWrapper parser = ParserFactory.createParser(scriptType);
DMLProgram prog = parser.parse(DML_FILE_PATH_ANTLR_PARSER, dmlScriptStr, argVals);
// Step 4: construct HOP DAGs (incl LVA, validate, and setup)
DMLTranslator dmlt = new DMLTranslator(prog);
dmlt.liveVariableAnalysis(prog);
dmlt.validateParseTree(prog);
dmlt.constructHops(prog);
// init working directories (before usage by following compilation steps)
initHadoopExecution(dmlconf);
// Step 5: rewrite HOP DAGs (incl IPA and memory estimates)
dmlt.rewriteHopsDAG(prog);
// Step 6: construct lops (incl exec type and op selection)
dmlt.constructLops(prog);
if (LOG.isDebugEnabled()) {
LOG.debug("\n********************** LOPS DAG *******************");
dmlt.printLops(prog);
dmlt.resetLopsDAGVisitStatus(prog);
}
// Step 7: generate runtime program, incl codegen
Program rtprog = dmlt.getRuntimeProgram(prog, dmlconf);
// launch SystemML appmaster (if requested and not already in launched AM)
if (dmlconf.getBooleanValue(DMLConfig.YARN_APPMASTER)) {
if (!isActiveAM() && DMLYarnClientProxy.launchDMLYarnAppmaster(dmlScriptStr, dmlconf, allArgs, rtprog))
return; //if AM launch unsuccessful, fall back to normal execute
if (isActiveAM()) //in AM context (not failed AM launch)
DMLAppMasterUtils.setupProgramMappingRemoteMaxMemory(rtprog);
}
// Step 9: prepare statistics [and optional explain output]
// count number compiled MR jobs / SP instructions
ExplainCounts counts = Explain.countDistributedOperations(rtprog);
Statistics.resetNoOfCompiledJobs(counts.numJobs);
// explain plan of program (hops or runtime)
if (EXPLAIN != ExplainType.NONE)
LOG.info(Explain.display(prog, rtprog, EXPLAIN, counts));
Statistics.stopCompileTimer();
// double costs = CostEstimationWrapper.getTimeEstimate(rtprog, ExecutionContextFactory.createContext());
// System.out.println("Estimated costs: "+costs);
// Step 10: execute runtime program
ExecutionContext ec = null;
try {
ec = ExecutionContextFactory.createContext(rtprog);
ScriptExecutorUtils.executeRuntimeProgram(rtprog, ec, dmlconf, STATISTICS ? STATISTICS_COUNT : 0);
} finally {
if (ec != null && ec instanceof SparkExecutionContext)
((SparkExecutionContext) ec).close();
LOG.info("END DML run " + getDateTime());
// cleanup scratch_space and all working dirs
cleanupHadoopExecution(dmlconf);
}
}
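The try/finally in Step 10 is the canonical lifecycle for a SparkExecutionContext: it is created through the factory for the compiled runtime program and closed again so the Spark-specific resources are released. A minimal sketch of that lifecycle, assuming a compiled Program rtprog; rtprog.execute(ec) stands in here for the ScriptExecutorUtils call used above:
import org.apache.sysml.runtime.controlprogram.Program;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContext;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;

public class RunProgramSketch {
    public static void run(Program rtprog) throws Exception {
        ExecutionContext ec = null;
        try {
            ec = ExecutionContextFactory.createContext(rtprog);
            rtprog.execute(ec); //simplified stand-in for ScriptExecutorUtils.executeRuntimeProgram
        } finally {
            if (ec instanceof SparkExecutionContext)
                ((SparkExecutionContext) ec).close(); //close Spark-specific resources, as in the finally block above
        }
    }
}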
Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.
The class FrameConverterTest, method runConverter.
@SuppressWarnings("unchecked")
private static void runConverter(ConvType type, MatrixCharacteristics mc, MatrixCharacteristics mcMatrix, List<ValueType> schema, String fnameIn, String fnameOut) throws IOException {
SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
JavaSparkContext sc = sec.getSparkContext();
ValueType[] lschema = schema.toArray(new ValueType[0]);
MapReduceTool.deleteFileIfExistOnHDFS(fnameOut);
switch(type) {
case CSV2BIN:
{
InputInfo iinfo = InputInfo.CSVInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.csvToBinaryBlock(sc, rddIn, mc, null, false, separator, false, 0).mapToPair(new LongFrameToLongWritableFrameFunction());
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
case BIN2CSV:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
CSVFileFormatProperties fprop = new CSVFileFormatProperties();
JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToCsv(rddIn2, mc, fprop, true);
rddOut.saveAsTextFile(fnameOut);
break;
}
case TXTCELL2BIN:
{
InputInfo iinfo = InputInfo.TextCellInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<LongWritable, Text> rddIn = (JavaPairRDD<LongWritable, Text>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.textCellToBinaryBlock(sc, rddIn, mc, lschema).mapToPair(new LongFrameToLongWritableFrameFunction());
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
case BIN2TXTCELL:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
JavaPairRDD<LongWritable, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class);
JavaPairRDD<Long, FrameBlock> rddIn2 = rddIn.mapToPair(new CopyFrameBlockPairFunction(false));
JavaRDD<String> rddOut = FrameRDDConverterUtils.binaryBlockToTextCell(rddIn2, mc);
rddOut.saveAsTextFile(fnameOut);
break;
}
case MAT2BIN:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<MatrixIndexes, MatrixBlock> rddIn = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sc.hadoopFile(fnameIn, iinfo.inputFormatClass, iinfo.inputKeyClass, iinfo.inputValueClass);
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.matrixBlockToBinaryBlock(sc, rddIn, mcMatrix);
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
case BIN2MAT:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
JavaPairRDD<MatrixIndexes, MatrixBlock> rddOut = FrameRDDConverterUtils.binaryBlockToMatrixBlock(rddIn, mc, mcMatrix);
rddOut.saveAsHadoopFile(fnameOut, MatrixIndexes.class, MatrixBlock.class, oinfo.outputFormatClass);
break;
}
case DFRM2BIN:
{
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
// Create DataFrame
SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
StructType dfSchema = FrameRDDConverterUtils.convertFrameSchemaToDFSchema(lschema, false);
JavaRDD<Row> rowRDD = FrameRDDConverterUtils.csvToRowRDD(sc, fnameIn, separator, lschema);
Dataset<Row> df = sparkSession.createDataFrame(rowRDD, dfSchema);
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, false).mapToPair(new LongFrameToLongWritableFrameFunction());
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
case BIN2DFRM:
{
InputInfo iinfo = InputInfo.BinaryBlockInputInfo;
OutputInfo oinfo = OutputInfo.BinaryBlockOutputInfo;
JavaPairRDD<Long, FrameBlock> rddIn = sc.hadoopFile(fnameIn, iinfo.inputFormatClass, LongWritable.class, FrameBlock.class).mapToPair(new LongWritableFrameToLongFrameFunction());
SparkSession sparkSession = SparkSession.builder().sparkContext(sc.sc()).getOrCreate();
Dataset<Row> df = FrameRDDConverterUtils.binaryBlockToDataFrame(sparkSession, rddIn, mc, lschema);
// Convert back DataFrame to binary block for comparison using original binary to converted DF and back to binary
JavaPairRDD<LongWritable, FrameBlock> rddOut = FrameRDDConverterUtils.dataFrameToBinaryBlock(sc, df, mc, true).mapToPair(new LongFrameToLongWritableFrameFunction());
rddOut.saveAsHadoopFile(fnameOut, LongWritable.class, FrameBlock.class, oinfo.outputFormatClass);
break;
}
default:
throw new RuntimeException("Unsuported converter type: " + type.toString());
}
sec.close();
}
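The test follows the same pattern at a smaller scale: a SparkExecutionContext is obtained from ExecutionContextFactory.createContext(), its JavaSparkContext is borrowed for the RDD-level converter calls, and the context is closed at the end. A minimal sketch of that setup/teardown, assuming SystemML is configured for Spark execution (otherwise the cast below would fail):
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.sysml.runtime.controlprogram.context.ExecutionContextFactory;
import org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext;

public class ConverterContextSketch {
    public static void withSparkContext() {
        SparkExecutionContext sec = (SparkExecutionContext) ExecutionContextFactory.createContext();
        try {
            JavaSparkContext sc = sec.getSparkContext();
            //... run FrameRDDConverterUtils conversions against sc here, as in runConverter above ...
            System.out.println("default parallelism: " + sc.defaultParallelism());
        } finally {
            sec.close(); //release the backing Spark context, mirroring sec.close() in the test
        }
    }
}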
Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.
The class DataPartitionerRemoteSpark, method partitionMatrix.
@Override
@SuppressWarnings("unchecked")
protected void partitionMatrix(MatrixObject in, String fnameNew, InputInfo ii, OutputInfo oi, long rlen, long clen, int brlen, int bclen) {
String jobname = "ParFor-DPSP";
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
SparkExecutionContext sec = (SparkExecutionContext) _ec;
try {
// cleanup existing output files
MapReduceTool.deleteFileIfExistOnHDFS(fnameNew);
// get input rdd
JavaPairRDD<MatrixIndexes, MatrixBlock> inRdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sec.getRDDHandleForMatrixObject(in, InputInfo.BinaryBlockInputInfo);
// determine degree of parallelism
MatrixCharacteristics mc = in.getMatrixCharacteristics();
int numRed = (int) determineNumReducers(inRdd, mc, _numRed);
// run spark remote data partition job
DataPartitionerRemoteSparkMapper dpfun = new DataPartitionerRemoteSparkMapper(mc, ii, oi, _format, _n);
DataPartitionerRemoteSparkReducer wfun = new DataPartitionerRemoteSparkReducer(fnameNew, oi, _replication);
// partition the input blocks
inRdd.flatMapToPair(dpfun)
.groupByKey(numRed) //group partition blocks
.foreach(wfun); //write partitions to hdfs
} catch (Exception ex) {
throw new DMLRuntimeException(ex);
}
// maintain statistics
Statistics.incrementNoOfCompiledSPInst();
Statistics.incrementNoOfExecutedSPInst();
if (DMLScript.STATISTICS) {
Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
}
}
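Both ParFor jobs above ("ParFor-RMSP" and "ParFor-DPSP") close with the same statistics bookkeeping: the Spark job is counted as compiled and executed, and its wall-clock time is recorded as a heavy hitter under the job name when statistics are enabled. A minimal sketch of that shared pattern (the helper method is hypothetical):
import org.apache.sysml.api.DMLScript;
import org.apache.sysml.utils.Statistics;

public class ParForStatsSketch {
    public static void recordSparkJob(String jobname, Runnable job) {
        long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
        job.run(); //the actual Spark job, e.g. the merge or partition pipeline above
        Statistics.incrementNoOfCompiledSPInst();
        Statistics.incrementNoOfExecutedSPInst();
        if (DMLScript.STATISTICS)
            Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
    }
}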