Use of org.apache.sysml.runtime.instructions.spark.functions.CopyBlockPairFunction in project incubator-systemml by apache.
The executeMerge method of class ResultMergeRemoteSpark merges the per-worker result matrices of a parallel for (parfor) loop into a single output RDD; when a compare matrix is given, only blocks that differ from the original values are merged.
@SuppressWarnings("unchecked")
protected RDDObject executeMerge(MatrixObject compare, MatrixObject[] inputs, String varname, long rlen, long clen, int brlen, int bclen)
    throws DMLRuntimeException
{
    String jobname = "ParFor-RMSP";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    SparkExecutionContext sec = (SparkExecutionContext) _ec;
    boolean withCompare = (compare != null);
    RDDObject ret = null;

    //determine degree of parallelism
    int numRed = (int) determineNumReducers(rlen, clen, brlen, bclen, _numReducers);
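    //note: numRed is used below as the shuffle parallelism of groupByKey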
    //sanity check for empty src files
    if (inputs == null || inputs.length == 0)
        throw new DMLRuntimeException("Execute merge should never be called with no inputs.");

    try {
        //note: initial implementation via union over all result rdds discarded due to
        //stack overflow errors with many parfor tasks, and thus many rdds

        //Step 1: construct input rdd from all result files of parfor workers
        //a) construct job conf with all files
        InputInfo ii = InputInfo.BinaryBlockInputInfo;
        JobConf job = new JobConf(ResultMergeRemoteMR.class);
        job.setJobName(jobname);
        job.setInputFormat(ii.inputFormatClass);
        Path[] paths = new Path[inputs.length];
        for (int i = 0; i < paths.length; i++) {
            //ensure input exists on hdfs (e.g., if in-memory or RDD)
            inputs[i].exportData();
            paths[i] = new Path(inputs[i].getFileName());

            //update rdd handle to allow lazy evaluation by guarding
            //against cleanup of temporary result files
            setRDDHandleForMerge(inputs[i], sec);
        }
        FileInputFormat.setInputPaths(job, paths);

        //b) create rdd from input files w/ deep copy of keys and blocks
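        //(deep copy is required because the Hadoop record reader reuses its
        //key/value objects, so blocks must be copied before caching or shuffling)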
        JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = sec.getSparkContext()
            .hadoopRDD(job, ii.inputFormatClass, ii.inputKeyClass, ii.inputValueClass)
            .mapPartitionsToPair(new CopyBlockPairFunction(true), true);
        //Step 2a: merge with compare
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
        if (withCompare) {
            JavaPairRDD<MatrixIndexes, MatrixBlock> compareRdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>)
                sec.getRDDHandleForMatrixObject(compare, InputInfo.BinaryBlockInputInfo);

            //merge values which differ from compare values
            ResultMergeRemoteSparkWCompare cfun = new ResultMergeRemoteSparkWCompare();
            out = rdd.groupByKey(numRed)  //group all result blocks per key
                .join(compareRdd)         //join compare block and result blocks
                .mapToPair(cfun);         //merge result blocks w/ compare
        }
        //Step 2b: merge without compare
        else {
            //direct merge in any order (disjointness guaranteed)
            out = RDDAggregateUtils.mergeByKey(rdd, false);
        }
        //Step 3: create output rdd handle w/ lineage
        ret = new RDDObject(out, varname);
        for (int i = 0; i < paths.length; i++)
            ret.addLineageChild(inputs[i].getRDDHandle());
        if (withCompare)
            ret.addLineageChild(compare.getRDDHandle());
    }
    catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }

    //maintain statistics
    Statistics.incrementNoOfCompiledSPInst();
    Statistics.incrementNoOfExecutedSPInst();
    if (DMLScript.STATISTICS) {
        Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
    }

    return ret;
}
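
For readers looking at this snippet for CopyBlockPairFunction itself: the pattern above (reading binary blocks through a Hadoop input format and immediately deep-copying each key/block pair) is how the function is typically applied. Below is a minimal, self-contained sketch of that pattern; the class name CopyBlockPairExample, the JavaSparkContext sc, and the file name fname are illustrative assumptions, not part of the project code.

//Minimal sketch: read a binary-block matrix from HDFS and deep-copy each
//key/block pair via CopyBlockPairFunction. The class name, 'sc', and
//'fname' are illustrative assumptions.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.sysml.runtime.instructions.spark.functions.CopyBlockPairFunction;
import org.apache.sysml.runtime.matrix.data.InputInfo;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;

public class CopyBlockPairExample {
    @SuppressWarnings("unchecked")
    public static JavaPairRDD<MatrixIndexes, MatrixBlock> readBinaryBlocks(JavaSparkContext sc, String fname) {
        InputInfo ii = InputInfo.BinaryBlockInputInfo;
        JobConf job = new JobConf();
        FileInputFormat.setInputPaths(job, new Path(fname));

        //read the blocks and deep-copy keys and values, because the Hadoop
        //record reader reuses its key/value objects across records
        return sc.hadoopRDD(job, ii.inputFormatClass, ii.inputKeyClass, ii.inputValueClass)
            .mapPartitionsToPair(new CopyBlockPairFunction(true), true);
    }
}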