
Example 6 with LongAccumulator

Use of org.apache.spark.util.LongAccumulator in the apache/incubator-systemml project.

The class RemoteDPParForSpark, method runJob:

public static RemoteParForJobReturn runJob(long pfid, String itervar, String matrixvar, String program, HashMap<String, byte[]> clsMap, String resultFile, MatrixObject input, ExecutionContext ec, PartitionFormat dpf, OutputInfo oi, boolean tSparseCol, boolean enableCPCaching, int numReducers) {
    String jobname = "ParFor-DPESP";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    JavaSparkContext sc = sec.getSparkContext();
    // prepare input parameters
    MatrixObject mo = sec.getMatrixObject(matrixvar);
    MatrixCharacteristics mc = mo.getMatrixCharacteristics();
    // initialize accumulators for tasks/iterations, and inputs
    JavaPairRDD<MatrixIndexes, MatrixBlock> in = sec.getBinaryBlockRDDHandleForVariable(matrixvar);
    LongAccumulator aTasks = sc.sc().longAccumulator("tasks");
    LongAccumulator aIters = sc.sc().longAccumulator("iterations");
    // compute number of reducers (to avoid OOMs and reduce memory pressure)
    int numParts = SparkUtils.getNumPreferredPartitions(mc, in);
    int numReducers2 = Math.max(numReducers, Math.min(numParts, (int) dpf.getNumParts(mc)));
    // core parfor datapartition-execute (w/ or w/o shuffle, depending on data characteristics)
    RemoteDPParForSparkWorker efun = new RemoteDPParForSparkWorker(program, clsMap, matrixvar, itervar, enableCPCaching, mc, tSparseCol, dpf, oi, aTasks, aIters);
    JavaPairRDD<Long, Writable> tmp = getPartitionedInput(sec, matrixvar, oi, dpf);
    // execute parfor tasks, incl. cleanup
    List<Tuple2<Long, String>> out = (requiresGrouping(dpf, mo) ?
        tmp.groupByKey(numReducers2) : tmp.map(new PseudoGrouping()))
        .mapPartitionsToPair(efun).collect();
    // de-serialize results
    LocalVariableMap[] results = RemoteParForUtils.getResults(out, LOG);
    // get accumulator values
    int numTasks = aTasks.value().intValue();
    int numIters = aIters.value().intValue();
    // create output symbol table entries
    RemoteParForJobReturn ret = new RemoteParForJobReturn(true, numTasks, numIters, results);
    // maintain statistics
    Statistics.incrementNoOfCompiledSPInst();
    Statistics.incrementNoOfExecutedSPInst();
    if (DMLScript.STATISTICS) {
        Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
    }
    return ret;
}
Also used: MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock), MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject), MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes), Writable (org.apache.hadoop.io.Writable), MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics), LongAccumulator (org.apache.spark.util.LongAccumulator), Tuple2 (scala.Tuple2), LocalVariableMap (org.apache.sysml.runtime.controlprogram.LocalVariableMap), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)
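
Both runJob variants follow the same driver-side accumulator pattern: named LongAccumulators are created on the driver, shipped to the executors inside the worker function's closure, and read back only after an action has forced execution. A minimal, self-contained sketch of that pattern, assuming a local Spark session (the class and variable names AccumulatorSketch and aIters are illustrative, not part of SystemML):

import java.util.Arrays;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;

public class AccumulatorSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .appName("accumulator-sketch").master("local[*]").getOrCreate();
        JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
        // create a named accumulator on the driver, as runJob does for tasks/iterations
        LongAccumulator aIters = sc.sc().longAccumulator("iterations");
        // increment inside a transformation; updates flow back as tasks finish
        int sum = sc.parallelize(Arrays.asList(1, 2, 3, 4), 2)
            .map(x -> { aIters.add(1); return x; })
            .reduce(Integer::sum);
        // safe to read on the driver only after the action has completed
        System.out.println("iterations: " + aIters.value() + ", sum: " + sum);
        spark.stop();
    }
}

One caveat worth noting: Spark guarantees exactly-once accumulator updates only inside actions; updates made in transformations, as above and in the parfor workers, can be re-applied if a task is retried. That is presumably acceptable here since the counts feed statistics rather than computed results.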

Example 7 with LongAccumulator

Use of org.apache.spark.util.LongAccumulator in the apache/incubator-systemml project.

The class RemoteParForSpark, method runJob:

public static RemoteParForJobReturn runJob(long pfid, String prog, HashMap<String, byte[]> clsMap, List<Task> tasks, ExecutionContext ec, boolean cpCaching, int numMappers) {
    String jobname = "ParFor-ESP";
    long t0 = DMLScript.STATISTICS ? System.nanoTime() : 0;
    SparkExecutionContext sec = (SparkExecutionContext) ec;
    JavaSparkContext sc = sec.getSparkContext();
    // initialize accumulators for tasks/iterations
    LongAccumulator aTasks = sc.sc().longAccumulator("tasks");
    LongAccumulator aIters = sc.sc().longAccumulator("iterations");
    // reset cached shared inputs for correctness in local mode
    long jobid = _jobID.getNextID();
    if (InfrastructureAnalyzer.isLocalMode())
        RemoteParForSparkWorker.cleanupCachedVariables(jobid);
    // run remote_spark parfor job
    // (w/o lazy evaluation to fit existing parfor framework, e.g., result merge)
    // create rdd of parfor tasks (one task per partition) and execute them
    List<Tuple2<Long, String>> out = sc.parallelize(tasks, tasks.size())
        .flatMapToPair(new RemoteParForSparkWorker(jobid, prog, clsMap, cpCaching, aTasks, aIters))
        .collect();
    // de-serialize results
    LocalVariableMap[] results = RemoteParForUtils.getResults(out, LOG);
    // get accumulator values
    int numTasks = aTasks.value().intValue();
    int numIters = aIters.value().intValue();
    // create output symbol table entries
    RemoteParForJobReturn ret = new RemoteParForJobReturn(true, numTasks, numIters, results);
    // maintain statistics
    Statistics.incrementNoOfCompiledSPInst();
    Statistics.incrementNoOfExecutedSPInst();
    if (DMLScript.STATISTICS)
        Statistics.maintainCPHeavyHitters(jobname, System.nanoTime() - t0);
    return ret;
}
Also used: LongAccumulator (org.apache.spark.util.LongAccumulator), Tuple2 (scala.Tuple2), LocalVariableMap (org.apache.sysml.runtime.controlprogram.LocalVariableMap), SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)
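
The executor-side half of the pattern lives in the worker classes: RemoteParForSparkWorker receives both accumulators through its constructor and increments them as tasks and iterations execute. A hedged sketch of that counterpart, with the actual program execution elided (the class name CountingWorker and its trivial body are illustrative, not the real SystemML worker):

import java.util.Collections;
import java.util.Iterator;
import org.apache.spark.api.java.function.PairFlatMapFunction;
import org.apache.spark.util.LongAccumulator;
import scala.Tuple2;

public class CountingWorker implements PairFlatMapFunction<String, Long, String> {
    private static final long serialVersionUID = 1L;
    private final LongAccumulator aTasks;
    private final LongAccumulator aIters;

    public CountingWorker(LongAccumulator aTasks, LongAccumulator aIters) {
        // LongAccumulator is serializable, so it ships to executors with the closure
        this.aTasks = aTasks;
        this.aIters = aIters;
    }

    @Override
    public Iterator<Tuple2<Long, String>> call(String task) {
        aTasks.add(1); // one parfor task processed
        aIters.add(1); // the real worker would add one per executed iteration
        // emit a (result-id, serialized-result) pair, as the real worker does
        return Collections.singletonList(new Tuple2<>(1L, task)).iterator();
    }
}

It would be wired up the same way as the call above: sc.parallelize(tasks, tasks.size()).flatMapToPair(new CountingWorker(aTasks, aIters)).collect().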

Aggregations

LongAccumulator (org.apache.spark.util.LongAccumulator): 7 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 6 usages
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 6 usages
LabeledPoint (org.apache.spark.ml.feature.LabeledPoint): 3 usages
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 3 usages
IOException (java.io.IOException): 2 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 2 usages
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 2 usages
LocalVariableMap (org.apache.sysml.runtime.controlprogram.LocalVariableMap): 2 usages
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 2 usages
ComputeBinaryBlockNnzFunction (org.apache.sysml.runtime.instructions.spark.functions.ComputeBinaryBlockNnzFunction): 2 usages
Tuple2 (scala.Tuple2): 2 usages
ArrayList (java.util.ArrayList): 1 usage
Text (org.apache.hadoop.io.Text): 1 usage
Writable (org.apache.hadoop.io.Writable): 1 usage
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 1 usage
JavaRDD (org.apache.spark.api.java.JavaRDD): 1 usage
Row (org.apache.spark.sql.Row): 1 usage
CompressedMatrixBlock (org.apache.sysml.runtime.compress.CompressedMatrixBlock): 1 usage
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 1 usage