Search in sources :

Example 36 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.

In class ParForProgramBlock, method consolidateAndCheckResults.

/**
 * Merges the partial results of all result variables into the given execution
 * context and validates the executed task/iteration counters.
 *
 * <p>Only result variables that are bound to a {@code MatrixObject} in
 * {@code ec} are merged; all others are skipped. Depending on
 * {@code checkParallelRemoteResultMerge()}, the merge is either delegated to
 * {@code ResultMergeWorker} instances that consume a task queue, or executed
 * sequentially, one result variable at a time. Afterwards, empty unscoped
 * variables are created (if enabled and a statement block is available) and
 * the counters are compared against their expected values.
 *
 * @param ec execution context that holds the result variables
 * @param expIters expected number of iterations
 * @param expTasks expected number of tasks
 * @param numIters number of executed iterations
 * @param numTasks number of executed tasks
 * @param results variable maps that provide the merge inputs (one input per entry)
 * @throws DMLRuntimeException if the parallel merge fails or is interrupted, or
 *         if the executed counters do not match the expected counters
 */
private void consolidateAndCheckResults(ExecutionContext ec, long expIters, long expTasks, long numIters, long numTasks, LocalVariableMap[] results) {
    Timing time = new Timing(true);
    // result merge
    if (checkParallelRemoteResultMerge()) {
        // execute result merge in parallel for all result vars
        int par = Math.min(_resultVars.size(), InfrastructureAnalyzer.getLocalParallelism());
        if (InfrastructureAnalyzer.isLocalMode()) {
            int parmem = (int) Math.floor(OptimizerUtils.getLocalMemBudget() / InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer());
            // reduce the degree of parallelism if necessary, but keep at least one worker
            par = Math.min(par, Math.max(parmem, 1));
        }
        try {
            // enqueue all matrix result vars as tasks (non-matrix vars are skipped for robustness)
            LocalTaskQueue<ResultVar> q = new LocalTaskQueue<>();
            for (ResultVar rvar : _resultVars) {
                if (ec.getVariable(rvar._name) instanceof MatrixObject)
                    q.enqueueTask(rvar);
            }
            q.closeInput();
            // create, start, and wait for all result merge workers
            ResultMergeWorker[] rmWorkers = new ResultMergeWorker[par];
            for (int i = 0; i < par; i++)
                rmWorkers[i] = new ResultMergeWorker(q, results, ec);
            for (int i = 0; i < par; i++)
                rmWorkers[i].start();
            for (int i = 0; i < par; i++) {
                rmWorkers[i].join();
                if (!rmWorkers[i].finishedNoError())
                    throw new DMLRuntimeException("Error occured in parallel result merge worker.");
            }
        } catch (Exception ex) {
            // keep the interrupt status visible to callers instead of silently clearing it
            if (ex instanceof InterruptedException)
                Thread.currentThread().interrupt();
            throw new DMLRuntimeException(ex);
        }
    } else {
        // execute result merge sequentially for all result vars
        for (ResultVar rvar : _resultVars) {
            Data dat = ec.getVariable(rvar._name);
            // skip non-matrix variables (robustness for scalars)
            if (dat instanceof MatrixObject) {
                MatrixObject out = (MatrixObject) dat;
                MatrixObject[] in = new MatrixObject[results.length];
                for (int i = 0; i < results.length; i++)
                    in[i] = (MatrixObject) results[i].get(rvar._name);
                String fname = constructResultMergeFileName();
                ResultMerge rm = createResultMerge(_resultMerge, out, in, fname, rvar._isAccum, ec);
                MatrixObject outNew = null;
                if (USE_PARALLEL_RESULT_MERGE)
                    outNew = rm.executeParallelMerge(_numThreads);
                else
                    outNew = rm.executeSerialMerge();
                // cleanup existing var (unless it is the merged output itself)
                Data exdata = ec.removeVariable(rvar._name);
                if (exdata != null && exdata != outNew && exdata instanceof MatrixObject)
                    ec.cleanupCacheableData((MatrixObject) exdata);
                // cleanup of intermediate result variables
                cleanWorkerResultVariables(ec, out, in);
                // set merged result variable
                ec.setVariable(rvar._name, outNew);
            }
        }
    }
    // handle unscoped variables (vars created in parfor, but potentially used afterwards);
    // sb might be null for nested parallelism
    ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock();
    if (CREATE_UNSCOPED_RESULTVARS && sb != null && ec.getVariables() != null)
        createEmptyUnscopedVariables(ec.getVariables(), sb);
    // consistency check of executed vs expected counters
    if (numTasks != expTasks || numIters != expIters)
        throw new DMLRuntimeException("PARFOR: Number of executed tasks does not match the number of created tasks: tasks " + numTasks + "/" + expTasks + ", iters " + numIters + "/" + expIters + ".");
    if (DMLScript.STATISTICS)
        Statistics.incrementParForMergeTime((long) time.stop());
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) Data(org.apache.sysml.runtime.instructions.cp.Data) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) LocalTaskQueue(org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue) ResultVar(org.apache.sysml.parser.ParForStatementBlock.ResultVar) ResultMerge(org.apache.sysml.runtime.controlprogram.parfor.ResultMerge) ParForStatementBlock(org.apache.sysml.parser.ParForStatementBlock) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)

Example 37 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.

In class ParForProgramBlock, method execute.

/**
 * Runs this parfor block in the given execution context.
 *
 * <p>The method evaluates the from/to/increment predicates once, returns early
 * if there are no iterations, optionally invokes the optimizer, prepares the
 * inputs (data partitioning, Spark repartitioning, eager caching), dispatches
 * to the executor that matches the configured execution mode, and finally
 * restores variables, recompiles the affected indexing operations, resets the
 * optimizer flags and runs the exit instructions.
 *
 * @param ec execution context in which the loop is executed
 * @throws DMLRuntimeException if the increment evaluates to zero or if the
 *         parallel execution fails (including an undefined execution mode)
 */
@Override
public void execute(ExecutionContext ec) {
    ParForStatementBlock pfsb = (ParForStatementBlock) getStatementBlock();

    // evaluate from, to, incr only once (assumption: known at for entry)
    IntObject from = executePredicateInstructions(1, _fromInstructions, ec);
    IntObject to = executePredicateInstructions(2, _toInstructions, ec);
    IntObject incr;
    if (_incrementInstructions == null || _incrementInstructions.isEmpty()) {
        // no explicit increment: step towards 'to'
        incr = new IntObject((from.getLongValue() <= to.getLongValue()) ? 1 : -1);
    } else {
        incr = executePredicateInstructions(3, _incrementInstructions, ec);
    }

    // a zero increment would produce an infinite loop
    if (incr.getLongValue() == 0) {
        throw new DMLRuntimeException(this.printBlockErrorLocation() + "Expression for increment " + "of variable '" + _iterPredVar + "' must evaluate to a non-zero value.");
    }

    // early exit on num iterations = zero (avoids unnecessary optimization/initialization)
    _numIterations = computeNumIterations(from, to, incr);
    if (_numIterations <= 0) {
        return;
    }

    // OPTIMIZATION of the parfor block (if an optimizer mode is configured)
    if (_optMode != POptMode.NONE) {
        // set optimizer log level
        OptimizationWrapper.setLogLevel(_optLogLevel);
        // core optimize
        OptimizationWrapper.optimize(_optMode, pfsb, this, ec, _monitor);
    }

    // DATA PARTITIONING of read-only parent variables of type (matrix,unpartitioned)
    Timing initTimer = null;
    if (_monitor) {
        initTimer = new Timing(true);
    }
    // partitioning on demand (note: for fused data partitioning and execute the optimizer set
    // the data partitioner to NONE in order to prevent any side effects)
    handleDataPartitioning(ec);
    // repartitioning of variables for spark cpmm/zipmm in order to prevent unnecessary shuffle
    handleSparkRepartitioning(ec);
    // eager rdd caching of variables for spark in order to prevent read/write contention
    handleSparkEagerCaching(ec);
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_DATA_T, initTimer.stop());
    }

    // initialize iteration variable to the FROM value
    IntObject iterVar = new IntObject(from.getLongValue());

    // begin PARALLEL EXECUTION of (PAR)FOR body
    LOG.trace("EXECUTE PARFOR ID = " + _ID + " with mode = " + _execMode + ", numThreads = " + _numThreads + ", taskpartitioner = " + _taskPartitioner);
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTHREADS, _numThreads);
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_TASKSIZE, _taskSize);
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_TASKPARTITIONER, _taskPartitioner.ordinal());
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_DATAPARTITIONER, _dataPartitioner.ordinal());
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_EXECMODE, _execMode.ordinal());
    }

    // pin shared input/result variables to protect them from cleanup during execution
    ArrayList<String> pinnedNames = ec.getVarList();
    boolean[] pinnedState = ec.pinVariables(pinnedNames);
    try {
        switch (_execMode) {
            case LOCAL:
                // create parworkers as local threads
                executeLocalParFor(ec, iterVar, from, to, incr);
                break;
            case REMOTE_MR:
                // create parworkers as MR tasks (one job per parfor)
                executeRemoteMRParFor(ec, iterVar, from, to, incr);
                break;
            case REMOTE_MR_DP:
                // create parworkers as MR tasks (one job per parfor)
                executeRemoteMRParForDP(ec, iterVar, from, to, incr);
                break;
            case REMOTE_SPARK:
                // create parworkers as Spark tasks (one job per parfor)
                executeRemoteSparkParFor(ec, iterVar, from, to, incr);
                break;
            case REMOTE_SPARK_DP:
                // create parworkers as Spark tasks (one job per parfor)
                executeRemoteSparkParForDP(ec, iterVar, from, to, incr);
                break;
            default:
                throw new DMLRuntimeException("Undefined execution mode: '" + _execMode + "'.");
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("PARFOR: Failed to execute loop in parallel.", ex);
    }

    // reset state of shared input/result variables
    ec.unpinVariables(pinnedNames, pinnedState);
    // cleanup unpinned shared variables
    cleanupSharedVariables(ec, pinnedState);

    // set iteration variable to the TO value (consistent with for)
    iterVar = new IntObject(to.getLongValue());
    ec.setVariable(_iterPredVar, iterVar);

    // restore original matrices; we can replace those variables, because
    // partitioning is only applied for read-only matrices
    for (String dpName : _variablesDPOriginal.keySet()) {
        // cleanup partitioned matrix (if not reused)
        if (!_variablesDPReuse.containsKey(dpName)) {
            VariableCPInstruction.processRemoveVariableInstruction(ec, dpName);
        }
        // reset to original matrix
        MatrixObject original = (MatrixObject) _variablesDPOriginal.get(dpName);
        ec.setVariable(dpName, original);
    }

    // print profiling report (only if top-level parfor because otherwise in parallel context)
    if (_monitorReport) {
        LOG.info("\n" + StatisticMonitor.createReport());
    }

    // TODO reset of hop parallelism constraint (e.g., ba+*)
    // release forced exectypes
    for (String dpName : _variablesDPOriginal.keySet()) {
        ProgramRecompiler.rFindAndRecompileIndexingHOP(pfsb, this, dpName, ec, false);
    }
    // release forced exectypes for fused dp/exec
    if (_execMode == PExecMode.REMOTE_MR_DP || _execMode == PExecMode.REMOTE_SPARK_DP) {
        ProgramRecompiler.rFindAndRecompileIndexingHOP(pfsb, this, _colocatedDPMatrix, ec, false);
    }
    // after release, deletes dp_varnames
    resetOptimizerFlags();

    // execute exit instructions (usually empty)
    executeInstructions(_exitInstructions, ec);
}
Also used : IntObject(org.apache.sysml.runtime.instructions.cp.IntObject) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ParForStatementBlock(org.apache.sysml.parser.ParForStatementBlock) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 38 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.

In class ParForProgramBlock, method handleDataPartitioning.

/**
 * Partitions the read-only parent matrix variables of this parfor block, if a
 * data partitioner is configured.
 *
 * <p>For every read-only parent variable that is bound to a
 * {@code MatrixObject} and has a partition format other than
 * {@code PartitionFormat.NONE}, the variable is replaced in {@code ec} by a
 * partitioned matrix (reused from a previous run if available), the parfor
 * body is recompiled for that variable, and the original matrix is remembered
 * in {@code _variablesDPOriginal}.
 *
 * @param ec execution context whose variables are inspected and replaced
 * @throws DMLRuntimeException if partitioning is requested but no statement
 *         block is available
 */
private void handleDataPartitioning(ExecutionContext ec) {
    PDataPartitioner partitioner = _dataPartitioner;
    if (partitioner == PDataPartitioner.NONE) {
        // nothing to do without a configured data partitioner
        return;
    }

    ParForStatementBlock pfsb = (ParForStatementBlock) getStatementBlock();
    if (pfsb == null) {
        throw new DMLRuntimeException("ParFor statement block required for reasoning about data partitioning.");
    }

    for (String varName : pfsb.getReadOnlyParentVars()) {
        Data dat = ec.getVariable(varName);
        // skip variables that do not exist or are not matrices
        if (!(dat instanceof MatrixObject)) {
            continue;
        }

        // unpartitioned input
        MatrixObject moVar = (MatrixObject) dat;
        PartitionFormat dpf = pfsb.determineDataPartitionFormat(varName);
        LOG.trace("PARFOR ID = " + _ID + ", Partitioning read-only input variable " + varName + " (format=" + dpf + ", mode=" + _dataPartitioner + ")");
        if (dpf == PartitionFormat.NONE) {
            continue;
        }

        // blockwise formats are switched to the REMOTE_SPARK partitioner;
        // the switch also applies to all subsequent variables of this call
        if (partitioner != PDataPartitioner.REMOTE_SPARK && dpf.isBlockwise()) {
            LOG.warn("PARFOR ID = " + _ID + ", Switching data partitioner from " + partitioner + " to " + PDataPartitioner.REMOTE_SPARK.name() + " for blockwise-n partitioning.");
            partitioner = PDataPartitioner.REMOTE_SPARK;
        }

        Timing ltime = new Timing(true);

        // input data partitioning (reuse if possible)
        Data partitioned = _variablesDPReuse.get(varName);
        if (partitioned == null) {
            // no reuse opportunity: partition now
            DataPartitioner dp = createDataPartitioner(dpf, partitioner, ec);
            // disable binary cell for sparse if consumed by MR jobs
            // TODO support for binarycell
            boolean binaryCellAllowed = OptimizerRuleBased.allowsBinaryCellPartitions(moVar, dpf);
            if (!binaryCellAllowed || OptimizerUtils.isSparkExecutionMode()) {
                dp.disableBinaryCell();
            }
            MatrixObject moVarNew = dp.createPartitionedMatrixObject(moVar, constructDataPartitionsFileName());
            // skip remaining partitioning logic if not partitioned (e.g., too small)
            if (moVarNew == moVar) {
                continue;
            }
            partitioned = moVarNew;
        }
        ec.setVariable(varName, partitioned);

        // recompile parfor body program
        ProgramRecompiler.rFindAndRecompileIndexingHOP(pfsb, this, varName, ec, true);

        // store original and partitioned matrix (for reuse if applicable)
        _variablesDPOriginal.put(varName, moVar);
        if (ALLOW_REUSE_PARTITION_VARS && ProgramRecompiler.isApplicableForReuseVariable(pfsb.getDMLProg(), pfsb, varName)) {
            _variablesDPReuse.put(varName, partitioned);
        }
        LOG.trace("Partitioning and recompilation done in " + ltime.stop() + "ms");
    }
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) DataPartitioner(org.apache.sysml.runtime.controlprogram.parfor.DataPartitioner) ParForStatementBlock(org.apache.sysml.parser.ParForStatementBlock) Data(org.apache.sysml.runtime.instructions.cp.Data) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 39 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.

In class LocalParWorker, method run.

/**
 * Worker loop: dequeues tasks from the task queue and executes them until the
 * worker is stopped, the queue signals {@code NO_MORE_TASKS}, or reading from
 * the queue fails.
 *
 * <p>A failing task is re-executed up to {@code _max_retry} additional times.
 * No exception is propagated out of this method; failures are only logged.
 * If monitoring is enabled, task/iteration counts and the execution time are
 * reported to the {@code StatisticMonitor} at the end.
 */
@Override
public void run() {
    // monitoring start
    Timing execTimer = null;
    if (_monitor) {
        execTimer = new Timing(true);
    }

    // setup the thread-local scheduler pool, but only if a spark context
    // already exists
    if (OptimizerUtils.isSparkExecutionMode() && SparkExecutionContext.isSparkContextCreated()) {
        SparkExecutionContext sec = (SparkExecutionContext) _ec;
        sec.setThreadLocalSchedulerPool("parforPool" + _workerID);
    }

    // initialize the GPU context for this thread
    if (DMLScript.USE_ACCELERATOR) {
        try {
            _ec.getGPUContext(0).initializeThread();
        } catch (DMLRuntimeException e) {
            LOG.error("Error executing task because of failure in GPU backend: ", e);
            LOG.error("Stopping LocalParWorker.");
            return;
        }
    }

    // setup compiler config for worker thread
    ConfigurationManager.setLocalConfig(_cconf);

    // continuous execution (execute tasks until (1) stopped or (2) no more tasks)
    Task task = null;
    while (!_stopped) {
        // dequeue the next task (abort on NO_MORE_TASKS or error)
        try {
            task = _taskQueue.dequeueTask();
            if (task == LocalTaskQueue.NO_MORE_TASKS) {
                // task queue closed: normal end of parallel worker
                break;
            }
        } catch (Exception ex) {
            // abort on taskqueue error; no exception thrown to prevent blocking on join
            LOG.warn("Error reading from task queue: " + ex.getMessage());
            LOG.warn("Stopping LocalParWorker.");
            break;
        }

        // execute the task sequentially (re-try on error)
        for (int retriesLeft = _max_retry; ; retriesLeft--) {
            try {
                // core execution (see ParWorker)
                executeTask(task);
                break;
            } catch (Exception ex) {
                LOG.error("Failed to execute " + task.toString() + ", retry:" + retriesLeft, ex);
                if (retriesLeft <= 0) {
                    // give up on this task; no exception thrown to prevent blocking on join
                    // NOTE(review): this only leaves the retry loop, so the worker goes on
                    // to dequeue the next task despite the log message -- confirm intent
                    LOG.error("Error executing task: ", ex);
                    LOG.error("Stopping LocalParWorker.");
                    break;
                }
            }
        }
    }

    // cleanup the thread-local scheduler pool of this worker thread
    if (OptimizerUtils.isSparkExecutionMode() && SparkExecutionContext.isSparkContextCreated()) {
        SparkExecutionContext sec = (SparkExecutionContext) _ec;
        sec.cleanupThreadLocalSchedulerPool();
    }

    // monitoring end
    if (_monitor) {
        StatisticMonitor.putPWStat(_workerID, Stat.PARWRK_NUMTASKS, _numTasks);
        StatisticMonitor.putPWStat(_workerID, Stat.PARWRK_NUMITERS, _numIters);
        StatisticMonitor.putPWStat(_workerID, Stat.PARWRK_EXEC_T, execTimer.stop());
    }
}
Also used : Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Example 40 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project systemml by apache.

In class CompressedMatrixBlock, method aggregateBinaryOperations.

/**
 * Matrix multiplication {@code m1 %*% m2} where this block is one of the inputs.
 *
 * <p>If this block is not compressed, the call is forwarded to the superclass.
 * If it consists of a single uncompressed column group, the multiplication is
 * delegated to that group's data block. Otherwise the product is computed via
 * the compressed matrix-vector kernels: directly for matrix-vector and
 * vector-matrix inputs, and row by row of the other input for general
 * matrix-matrix inputs.
 *
 * @param m1 left-hand-side input
 * @param m2 right-hand-side input
 * @param ret output block to reuse, or {@code null} to allocate a new one
 * @param op aggregate binary operator; its number of threads selects between
 *        the multi-threaded and single-threaded kernels
 * @return the dense result block with {@code m1.getNumRows()} rows and
 *         {@code m2.getNumColumns()} columns
 */
@Override
public MatrixBlock aggregateBinaryOperations(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, AggregateBinaryOperator op) {
    // call uncompressed matrix mult if necessary
    if (!isCompressed()) {
        return super.aggregateBinaryOperations(m1, m2, ret, op);
    }
    // multi-threaded mm of single uncompressed colgroup
    if (isSingleUncompressedGroup()) {
        MatrixBlock tmp = ((ColGroupUncompressed) _colGroups.get(0)).getData();
        return tmp.aggregateBinaryOperations(this == m1 ? tmp : m1, this == m2 ? tmp : m2, ret, op);
    }
    Timing time = LOG.isDebugEnabled() ? new Timing(true) : null;
    // setup meta data (dimensions, sparsity)
    int rl = m1.getNumRows();
    int cl = m2.getNumColumns();
    // number of output cells; computed in long to avoid int overflow for large outputs
    long estNnz = (long) rl * cl;
    // create output matrix block
    if (ret == null)
        ret = new MatrixBlock(rl, cl, false, estNnz);
    else
        ret.reset(rl, cl, false, estNnz);
    // compute matrix mult
    if (m1.getNumRows() > 1 && m2.getNumColumns() == 1) {
        // MV right
        CompressedMatrixBlock cmb = (CompressedMatrixBlock) m1;
        if (op.getNumThreads() > 1)
            cmb.rightMultByVector(m2, ret, op.getNumThreads());
        else
            cmb.rightMultByVector(m2, ret);
    } else if (m1.getNumRows() == 1 && m2.getNumColumns() > 1) {
        // MV left
        if (op.getNumThreads() > 1)
            leftMultByVectorTranspose(_colGroups, m1, ret, false, op.getNumThreads());
        else
            leftMultByVectorTranspose(_colGroups, m1, ret, false, true);
    } else {
        // MM
        // prepare the other input (including decompression if necessary)
        boolean right = (m1 == this);
        MatrixBlock that = right ? m2 : m1;
        if (that instanceof CompressedMatrixBlock) {
            that = ((CompressedMatrixBlock) that).isCompressed() ? ((CompressedMatrixBlock) that).decompress() : that;
        }
        // transpose for sequential repeated column access
        if (right) {
            that = LibMatrixReorg.transpose(that, new MatrixBlock(that.getNumColumns(), that.getNumRows(), that.isInSparseFormat()), op.getNumThreads());
        }
        // temporary blocks: one row of the other input, and one output column (right) or row (left)
        MatrixBlock tmpIn = new MatrixBlock(1, that.getNumColumns(), false).allocateBlock();
        MatrixBlock tmpOut = new MatrixBlock(right ? rl : 1, right ? 1 : cl, false).allocateBlock();
        if (right) {
            // MM right: one matrix-vector multiplication per column of m2
            // (the columns are the rows of 'that' due to the transpose above)
            for (int i = 0; i < that.getNumRows(); i++) {
                tmpIn = that.slice(i, i, 0, that.getNumColumns() - 1, tmpIn);
                // turn the sliced row into a column vector (meta data op)
                MatrixBlock tmpIn2 = LibMatrixReorg.transpose(tmpIn, new MatrixBlock(tmpIn.getNumColumns(), tmpIn.getNumRows(), false));
                tmpOut.reset(tmpOut.getNumRows(), tmpOut.getNumColumns());
                if (op.getNumThreads() > 1)
                    rightMultByVector(tmpIn2, tmpOut, op.getNumThreads());
                else
                    rightMultByVector(tmpIn2, tmpOut);
                // write the result vector into column i of the output
                ret.leftIndexingOperations(tmpOut, 0, ret.getNumRows() - 1, i, i, ret, UpdateType.INPLACE);
            }
        } else {
            // MM left: one vector-matrix multiplication per row of m1
            for (int i = 0; i < that.getNumRows(); i++) {
                tmpIn = that.slice(i, i, 0, that.getNumColumns() - 1, tmpIn);
                if (op.getNumThreads() > 1)
                    leftMultByVectorTranspose(_colGroups, tmpIn, tmpOut, false, op.getNumThreads());
                else
                    leftMultByVectorTranspose(_colGroups, tmpIn, tmpOut, false, true);
                // write the result vector into row i of the output
                ret.leftIndexingOperations(tmpOut, i, i, 0, ret.getNumColumns() - 1, ret, UpdateType.INPLACE);
            }
        }
    }
    // the timer only exists if debug logging was enabled on entry; guard against
    // a log level change during the call
    if (time != null && LOG.isDebugEnabled())
        LOG.debug("Compressed MM in " + time.stop());
    return ret;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)

Aggregations

Timing (org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)78 IOException (java.io.IOException)31 ArrayList (java.util.ArrayList)29 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)25 HashMap (java.util.HashMap)24 Connection (org.apache.sysml.api.jmlc.Connection)17 PreparedScript (org.apache.sysml.api.jmlc.PreparedScript)17 ResultVariables (org.apache.sysml.api.jmlc.ResultVariables)17 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)17 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)14 ParForStatementBlock (org.apache.sysml.parser.ParForStatementBlock)10 TaskPartitioner (org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner)10 ParForBody (org.apache.sysml.runtime.controlprogram.parfor.ParForBody)8 RemoteParForJobReturn (org.apache.sysml.runtime.controlprogram.parfor.RemoteParForJobReturn)8 LocalVariableMap (org.apache.sysml.runtime.controlprogram.LocalVariableMap)7 ProgramBlock (org.apache.sysml.runtime.controlprogram.ProgramBlock)7 ExecutorService (java.util.concurrent.ExecutorService)6 Future (java.util.concurrent.Future)6 LocalTaskQueue (org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue)6 Task (org.apache.sysml.runtime.controlprogram.parfor.Task)6