Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.
The class RemoteDPParForSpark, method runJob.
public static RemoteParForJobReturn runJob(long pfid, String itervar, String matrixvar,
    String program, HashMap<String, byte[]> clsMap, String resultFile, MatrixObject input,
    ExecutionContext ec, PartitionFormat dpf, OutputInfo oi, boolean tSparseCol,
    boolean enableCPCaching, int numReducers) {
String jobname = "ParFor-DPESP";
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
SparkExecutionContext sec = (SparkExecutionContext) ec;
JavaSparkContext sc = sec.getSparkContext();
// prepare input parameters
MatrixObject mo = sec.getMatrixObject(matrixvar);
MatrixCharacteristics mc = mo.getMatrixCharacteristics();
// initialize accumulators for tasks/iterations, and inputs
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
LongAccumulator aTasks = sc.sc().longAccumulator("tasks");
LongAccumulator aIters = sc.sc().longAccumulator("iterations");
// compute number of reducers (to avoid OOMs and reduce memory pressure)
int numParts = SparkUtils.getNumPreferredPartitions(mc, in);
int numReducers2 = Math.max(numReducers, Math.min(numParts, (int) dpf.getNumParts(mc)));
// core parfor datapartition-execute (w/ or w/o shuffle, depending on data characteristics)
RemoteDPParForSparkWorker efun = new RemoteDPParForSparkWorker(program, clsMap, matrixvar, itervar, enableCPCaching, mc, tSparseCol, dpf, oi, aTasks, aIters);
JavaPairRDD<Long, Writable> tmp = getPartitionedInput(sec, matrixvar, oi, dpf);
// execute parfor tasks, incl. cleanup, over the grouped (or pseudo-grouped) partitions
List<Tuple2<Long, String>> out = (requiresGrouping(dpf, mo) ?
    tmp.groupByKey(numReducers2) : tmp.map(new PseudoGrouping()))
    .mapPartitionsToPair(efun).collect();
// de-serialize results
LocalVariableMap[] results = RemoteParForUtils.getResults(out, LOG);
// get accumulator values
int numTasks = aTasks.value().intValue();
int numIters = aIters.value().intValue();
// create output symbol table entries
RemoteParForJobReturn ret = new RemoteParForJobReturn(true, numTasks, numIters, results);
// maintain statistics
Statistics.incrementNoOfCompiledSPInst();
Statistics.incrementNoOfExecutedSPInst();
if (DMLScript.STATISTICS) {
Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
}
return ret;
}
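The pattern to note above is the pair of named LongAccumulators: executors increment them while processing partitions, and the driver reads the totals only after collect() has completed. A minimal, self-contained sketch of that pattern (the class name AccumulatorDemo and the toy data are hypothetical, not part of SystemML):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.util.LongAccumulator;

public class AccumulatorDemo {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
            new SparkConf().setAppName("AccumulatorDemo").setMaster("local[2]"));
        // named accumulators, analogous to aTasks/aIters above
        LongAccumulator aTasks = sc.sc().longAccumulator("tasks");
        LongAccumulator aIters = sc.sc().longAccumulator("iterations");
        sc.parallelize(Arrays.asList(1, 2, 3, 4)).foreach(i -> {
            aTasks.add(1); // one task per element in this toy setup
            aIters.add(i); // pretend element i runs i iterations
        });
        // accumulator values are only reliable after the action has finished
        System.out.println(aTasks.value() + " tasks, " + aIters.value() + " iterations");
        sc.stop();
    }
}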
Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.
The class RemoteParForSpark, method runJob.
public static RemoteParForJobReturn runJob(long pfid, String prog, HashMap<String, byte[]> clsMap,
    List<Task> tasks, ExecutionContext ec, boolean cpCaching, int numMappers) {
String jobname = "ParFor-ESP";
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
SparkExecutionContext sec = (SparkExecutionContext) ec;
JavaSparkContext sc = sec.getSparkContext();
// initialize accumulators for tasks/iterations
LongAccumulator aTasks = sc.sc().longAccumulator("tasks");
LongAccumulator aIters = sc.sc().longAccumulator("iterations");
// reset cached shared inputs for correctness in local mode
long jobid = _jobID.getNextID();
if (InfrastructureAnalyzer.isLocalMode())
RemoteParForSparkWorker.cleanupCachedVariables(jobid);
// run remote_spark parfor job
// (w/o lazy evaluation to fit existing parfor framework, e.g., result merge)
// create rdd of parfor tasks, execute them via the parfor worker, and collect the results
List<Tuple2<Long, String>> out = sc.parallelize(tasks, tasks.size())
    .flatMapToPair(new RemoteParForSparkWorker(jobid, prog, clsMap, cpCaching, aTasks, aIters))
    .collect();
// de-serialize results
LocalVariableMap[] results = RemoteParForUtils.getResults(out, LOG);
// get accumulator values
int numTasks = aTasks.value().intValue();
int numIters = aIters.value().intValue();
// create output symbol table entries
RemoteParForJobReturn ret = new RemoteParForJobReturn(true, numTasks, numIters, results);
// maintain statistics
Statistics.incrementNoOfCompiledSPInst();
Statistics.incrementNoOfExecutedSPInst();
if (DMLScript.STATISTICS)
Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
return ret;
}
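The distribution trick above is parallelizing the task list with numSlices equal to tasks.size(), so each parfor task gets its own partition and hence its own Spark task; flatMapToPair then emits (id, serialized result) pairs that are collected on the driver. A minimal sketch of that pattern, with hypothetical names and toy tasks:

import java.util.Arrays;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class ParTaskDemo {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
            new SparkConf().setAppName("ParTaskDemo").setMaster("local[2]"));
        List<Long> tasks = Arrays.asList(0L, 1L, 2L);
        // numSlices = tasks.size() forces one partition (one Spark task) per parfor task
        List<Tuple2<Long, String>> out = sc.parallelize(tasks, tasks.size())
            .flatMapToPair(t -> Arrays.asList(new Tuple2<>(t, "result-" + t)).iterator())
            .collect();
        out.forEach(r -> System.out.println(r._1() + " -> " + r._2()));
        sc.stop();
    }
}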
Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.
The class ResultMergeRemoteSpark, method executeMerge.
@SuppressWarnings("unchecked")
protected RDDObject executeMerge(MatrixObject compare, MatrixObject[] inputs, long rlen, long clen, int brlen, int bclen) {
String jobname = "ParFor-RMSP";
long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
SparkExecutionContext sec = (SparkExecutionContext) _ec;
boolean withCompare = (compare != null);
RDDObject ret = null;
// determine degree of parallelism
int numRed = (int) determineNumReducers(rlen, clen, brlen, bclen, _numReducers);
// sanity check for empty src files
if (inputs == null || inputs.length == 0)
throw new DMLRuntimeException("Execute merge should never be called with no inputs.");
try {
// note: initial implementation via union over all result rdds discarded due to
// stack overflow errors with many parfor tasks, and thus many rdds
// Step 1: construct input rdd from all result files of parfor workers
// a) construct job conf with all files
InputInfo ii = InputInfo.BinaryBlockInputInfo;
JobConf job = new JobConf(ResultMergeRemoteMR.class);
job.setJobName(jobname);
job.setInputFormat(ii.inputFormatClass);
Path[] paths = new Path[inputs.length];
for (int i = 0; i < paths.length; i++) {
// ensure input exists on hdfs (e.g., if in-memory or RDD)
inputs[i].exportData();
paths[i] = new Path(inputs[i].getFileName());
// update rdd handle to allow lazy evaluation by guarding
// against cleanup of temporary result files
setRDDHandleForMerge(inputs[i], sec);
}
FileInputFormat.setInputPaths(job, paths);
// b) create rdd from input files w/ deep copy of keys and blocks
JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = sec.getSparkContext()
    .hadoopRDD(job, ii.inputFormatClass, ii.inputKeyClass, ii.inputValueClass)
    .mapPartitionsToPair(new CopyBlockPairFunction(true), true);
// Step 2a: merge with compare
JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
if (withCompare) {
JavaPairRDD<MatrixIndexes, MatrixBlock> compareRdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sec.getRDDHandleForMatrixObject(compare, InputInfo.BinaryBlockInputInfo);
// merge values which differ from compare values
ResultMergeRemoteSparkWCompare cfun = new ResultMergeRemoteSparkWCompare(_isAccum);
out = rdd.groupByKey(numRed) // group all result blocks per key
    .join(compareRdd)        // join compare block and result blocks
    .mapToPair(cfun);        // merge result blocks w/ compare
} else { // Step 2b: merge without compare
// direct merge in any order (disjointness guaranteed)
out = _isAccum ? RDDAggregateUtils.sumByKeyStable(rdd, false) : RDDAggregateUtils.mergeByKey(rdd, false);
}
// Step 3: create output rdd handle w/ lineage
ret = new RDDObject(out);
for (int i = 0; i < paths.length; i++)
    ret.addLineageChild(inputs[i].getRDDHandle());
if (withCompare)
ret.addLineageChild(compare.getRDDHandle());
} catch (Exception ex) {
throw new DMLRuntimeException(ex);
}
// maintain statistics
Statistics.incrementNoOfCompiledSPInst();
Statistics.incrementNoOfExecutedSPInst();
if (DMLScript.STATISTICS) {
Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
}
return ret;
}
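As the note in Step 1 says, a union over one RDD per worker result file was discarded because a deep chain of union calls over many RDDs can overflow the stack; instead, all result files are registered in a single JobConf and read as one hadoopRDD. A minimal sketch of that single-RDD reading pattern, assuming SequenceFiles of LongWritable/Text records already exist at the (hypothetical) paths:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class MultiFileReadDemo {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
            new SparkConf().setAppName("MultiFileReadDemo").setMaster("local[2]"));
        JobConf job = new JobConf(MultiFileReadDemo.class);
        // register all result files in one job conf (paths are hypothetical)
        FileInputFormat.setInputPaths(job,
            new Path("/tmp/parfor/result0"), new Path("/tmp/parfor/result1"));
        // one RDD over all files, instead of rdd0.union(rdd1).union(rdd2)... per file
        JavaPairRDD<LongWritable, Text> rdd = sc.hadoopRDD(job,
            SequenceFileInputFormat.class, LongWritable.class, Text.class);
        System.out.println("merged records: " + rdd.count());
        sc.stop();
    }
}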
Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.
The class AggregateUnarySPInstruction, method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
SparkExecutionContext sec = (SparkExecutionContext) ec;
MatrixCharacteristics mc = sec.getMatrixCharacteristics(input1.getName());
// get input
JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
JavaPairRDD<MatrixIndexes, MatrixBlock> out = in;
// filter input blocks for trace
if (getOpcode().equalsIgnoreCase("uaktrace"))
out = out.filter(new FilterDiagBlocksFunction());
// execute unary aggregate operation
AggregateUnaryOperator auop = (AggregateUnaryOperator) _optr;
AggregateOperator aggop = _aop;
// perform aggregation if necessary and put output into symbol table
if (_aggtype == SparkAggType.SINGLE_BLOCK) {
JavaRDD<MatrixBlock> out2 = out.map(new RDDUAggFunction2(auop, mc.getRowsPerBlock(), mc.getColsPerBlock()));
MatrixBlock out3 = RDDAggregateUtils.aggStable(out2, aggop);
// drop correction after aggregation
out3.dropLastRowsOrColumns(aggop.correctionLocation);
// put output block into symbol table (no lineage because single block)
// this also includes implicit maintenance of matrix characteristics
sec.setMatrixOutput(output.getName(), out3, getExtendedOpcode());
} else { // MULTI_BLOCK or NONE
if (_aggtype == SparkAggType.NONE) {
// in case of no block aggregation, we always drop the correction as well as
// use a partitioning-preserving mapvalues
out = out.mapValues(new RDDUAggValueFunction(auop, mc.getRowsPerBlock(), mc.getColsPerBlock()));
} else if (_aggtype == SparkAggType.MULTI_BLOCK) {
// in case of multi-block aggregation, we always keep the correction
out = out.mapToPair(new RDDUAggFunction(auop, mc.getRowsPerBlock(), mc.getColsPerBlock()));
out = RDDAggregateUtils.aggByKeyStable(out, aggop, false);
// drop correction after aggregation if required (aggByKeyStable creates a new
// partitioning; drop correction via a partitioning-preserving mapValues)
if (auop.aggOp.correctionExists)
out = out.mapValues(new AggregateDropCorrectionFunction(aggop));
}
// put output RDD handle into symbol table
updateUnaryAggOutputMatrixCharacteristics(sec, auop.indexFn);
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
}
}
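The branch structure above separates a single-block aggregate, which folds everything into one block materialized on the driver (aggStable), from a multi-block aggregate, which combines blocks per key and keeps the result distributed (aggByKeyStable). A minimal sketch of the two shapes using plain doubles in place of MatrixBlocks (names and data are hypothetical):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class AggShapeDemo {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
            new SparkConf().setAppName("AggShapeDemo").setMaster("local[2]"));
        JavaPairRDD<Long, Double> blocks = sc.parallelizePairs(Arrays.asList(
            new Tuple2<>(1L, 1.0), new Tuple2<>(1L, 2.0), new Tuple2<>(2L, 3.0)));
        // SINGLE_BLOCK analogue: one scalar result, materialized on the driver
        double total = blocks.values().reduce(Double::sum);
        // MULTI_BLOCK analogue: per-key aggregates, result stays an RDD
        JavaPairRDD<Long, Double> perKey = blocks.reduceByKey(Double::sum);
        System.out.println("total=" + total + ", perKey=" + perKey.collectAsMap());
        sc.stop();
    }
}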
Use of org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext in project incubator-systemml by apache.
The class AppendGAlignedSPInstruction, method processInstruction.
@Override
public void processInstruction(ExecutionContext ec) {
// general case append (map-extend, aggregate)
SparkExecutionContext sec = (SparkExecutionContext) ec;
checkBinaryAppendInputCharacteristics(sec, _cbind, false, true);
MatrixCharacteristics mc1 = sec.getMatrixCharacteristics(input1.getName());
JavaPairRDD<MatrixIndexes, MatrixBlock> in1 = sec.getBinaryBlockRDDHandleForVariable(input1.getName());
JavaPairRDD<MatrixIndexes, MatrixBlock> in2 = sec.getBinaryBlockRDDHandleForVariable(input2.getName());
JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
// Simple changing of matrix indexes of RHS
long shiftBy = _cbind ? mc1.getNumColBlocks() : mc1.getNumRowBlocks();
out = in2.mapToPair(new ShiftColumnIndex(shiftBy, _cbind));
out = in1.union(out);
// put output RDD handle into symbol table
updateBinaryAppendOutputMatrixCharacteristics(sec, _cbind);
sec.setRDDHandleForVariable(output.getName(), out);
sec.addLineageRDD(output.getName(), input1.getName());
sec.addLineageRDD(output.getName(), input2.getName());
}
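Because the inputs are block-aligned, this append needs no shuffle: the right-hand side's block indexes are simply shifted past the left-hand side's last block, and the two RDDs are unioned. A minimal sketch of that shift-then-union pattern on (blockIndex, label) pairs (names and data are hypothetical):

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class ShiftUnionDemo {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext(
            new SparkConf().setAppName("ShiftUnionDemo").setMaster("local[2]"));
        JavaPairRDD<Long, String> lhs = sc.parallelizePairs(Arrays.asList(
            new Tuple2<>(1L, "A1"), new Tuple2<>(2L, "A2")));
        JavaPairRDD<Long, String> rhs = sc.parallelizePairs(Arrays.asList(
            new Tuple2<>(1L, "B1"), new Tuple2<>(2L, "B2")));
        long shiftBy = 2; // number of blocks in lhs (mc1.getNumColBlocks() above)
        // relocate rhs blocks after the last lhs block, then take the shuffle-free union
        JavaPairRDD<Long, String> out = lhs.union(
            rhs.mapToPair(kv -> new Tuple2<>(kv._1() + shiftBy, kv._2())));
        System.out.println(out.collectAsMap());
        sc.stop();
    }
}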