Example 1 with CopyBlockPairFunction

Use of org.apache.sysml.runtime.instructions.spark.functions.CopyBlockPairFunction in the apache/incubator-systemml project.

The executeMerge method of class ResultMergeRemoteSpark:

@SuppressWarnings("unchecked")
protected RDDObject executeMerge(MatrixObject compare, MatrixObject[] inputs, String varname, long rlen, long clen, int brlen, int bclen) throws DMLRuntimeException {
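    //rlen/clen: dimensions of the result matrix; brlen/bclen: block sizes;
    //varname: variable name under which the merged RDD handle is returned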
    String jobname = "ParFor-RMSP";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    SparkExecutionContext sec = (SparkExecutionContext) _ec;
    boolean withCompare = (compare != null);
    RDDObject ret = null;
    //determine degree of parallelism
    int numRed = (int) determineNumReducers(rlen, clen, brlen, bclen, _numReducers);
    //sanity check for empty src files
    if (inputs == null || inputs.length == 0)
        throw new DMLRuntimeException("Execute merge should never be called with no inputs.");
    try {
        //note: initial implementation via union over all result rdds discarded due to 
        //stack overflow errors with many parfor tasks, and thus many rdds
        //Step 1: construct input rdd from all result files of parfor workers
        //a) construct job conf with all files
        InputInfo ii = InputInfo.BinaryBlockInputInfo;
        JobConf job = new JobConf(ResultMergeRemoteMR.class);
        job.setJobName(jobname);
        job.setInputFormat(ii.inputFormatClass);
        Path[] paths = new Path[inputs.length];
        for (int i = 0; i < paths.length; i++) {
            //ensure input exists on hdfs (e.g., if in-memory or RDD)
            inputs[i].exportData();
            paths[i] = new Path(inputs[i].getFileName());
            //update rdd handle to allow lazy evaluation by guarding 
            //against cleanup of temporary result files
            setRDDHandleForMerge(inputs[i], sec);
        }
        FileInputFormat.setInputPaths(job, paths);
        //b) create rdd from input files w/ deep copy of keys and blocks
        JavaPairRDD<MatrixIndexes, MatrixBlock> rdd = sec.getSparkContext()
            .hadoopRDD(job, ii.inputFormatClass, ii.inputKeyClass, ii.inputValueClass)
            .mapPartitionsToPair(new CopyBlockPairFunction(true), true);
        //Step 2a: merge with compare
        JavaPairRDD<MatrixIndexes, MatrixBlock> out = null;
        if (withCompare) {
            JavaPairRDD<MatrixIndexes, MatrixBlock> compareRdd = (JavaPairRDD<MatrixIndexes, MatrixBlock>) sec.getRDDHandleForMatrixObject(compare, InputInfo.BinaryBlockInputInfo);
            //merge values which differ from compare values
            ResultMergeRemoteSparkWCompare cfun = new ResultMergeRemoteSparkWCompare();
            out = rdd.groupByKey(numRed)   //group all result blocks per key
                     .join(compareRdd)     //join compare block and result blocks
                     .mapToPair(cfun);     //merge result blocks w/ compare
        } else { //Step 2b: merge without compare
            //direct merge in any order (disjointness guaranteed)
            out = RDDAggregateUtils.mergeByKey(rdd, false);
        }
        //Step 3: create output rdd handle w/ lineage
        ret = new RDDObject(out, varname);
        for (int i = 0; i < paths.length; i++) ret.addLineageChild(inputs[i].getRDDHandle());
        if (withCompare)
            ret.addLineageChild(compare.getRDDHandle());
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    //maintain statistics
    Statistics.incrementNoOfCompiledSPInst();
    Statistics.incrementNoOfExecutedSPInst();
    if (DMLScript.STATISTICS) {
        Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
    }
    return ret;
}
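
For reference, ResultMergeRemoteSparkWCompare itself is not shown on this page. The cell-wise rule it applies in Step 2a can be sketched as below: a worker's value is taken only where it differs from the corresponding cell of the compare (pre-parfor) block, so untouched regions can never overwrite another worker's update. This is a minimal dense-only sketch, assuming MatrixBlock's quickGetValue/quickSetValue accessors; the real function also handles sparse blocks and operates on the grouped-and-joined tuples produced above.

import org.apache.sysml.runtime.matrix.data.MatrixBlock;

//Illustrative sketch only, not the actual ResultMergeRemoteSparkWCompare code.
public class CompareMergeSketch {
    public static MatrixBlock mergeWithCompare(Iterable<MatrixBlock> results, MatrixBlock compare) {
        MatrixBlock out = new MatrixBlock(compare.getNumRows(), compare.getNumColumns(), false);
        //start from the original (compare) block
        out.copy(compare);
        for (MatrixBlock res : results)
            for (int i = 0; i < res.getNumRows(); i++)
                for (int j = 0; j < res.getNumColumns(); j++) {
                    double v = res.quickGetValue(i, j);
                    //only cells a worker actually changed win over the original
                    if (v != compare.quickGetValue(i, j))
                        out.quickSetValue(i, j, v);
                }
        return out;
    }
}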
Also used :

Path (org.apache.hadoop.fs.Path)
JobConf (org.apache.hadoop.mapred.JobConf)
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)
RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject)
CopyBlockPairFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyBlockPairFunction)
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo)
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)

Aggregations

Path (org.apache.hadoop.fs.Path): 1
JobConf (org.apache.hadoop.mapred.JobConf): 1
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 1
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 1
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 1
RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 1
CopyBlockPairFunction (org.apache.sysml.runtime.instructions.spark.functions.CopyBlockPairFunction): 1
InputInfo (org.apache.sysml.runtime.matrix.data.InputInfo): 1
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 1
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 1
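
A closing note on the class this page indexes: Spark's hadoopRDD reuses the same Writable key/value instances for every record within a partition, so pairs read from HDFS must be deep-copied before they are shuffled, grouped, or cached; that is why Step 1b applies CopyBlockPairFunction with deep copy enabled. Below is a minimal per-record sketch of the pattern. The real function works at partition granularity via mapPartitionsToPair; DeepCopySketch and its deepCopy method are illustrative names, not SystemML APIs.

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.sysml.runtime.matrix.data.MatrixBlock;
import org.apache.sysml.runtime.matrix.data.MatrixIndexes;
import scala.Tuple2;

public class DeepCopySketch {
    //copy every (key, block) pair read from HDFS into fresh objects,
    //because Hadoop record readers reuse the same Writable instances
    public static JavaPairRDD<MatrixIndexes, MatrixBlock> deepCopy(
            JavaPairRDD<MatrixIndexes, MatrixBlock> in) {
        return in.mapToPair(kv -> {
            MatrixIndexes key = new MatrixIndexes(kv._1().getRowIndex(), kv._1().getColumnIndex());
            MatrixBlock block = new MatrixBlock();
            block.copy(kv._2()); //deep copy of the block payload
            return new Tuple2<>(key, block);
        });
    }
}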