Search in sources :

Example 56 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.

Method chainMatrixMultOperations of the class CompressedMatrixBlock.

/**
 * Matrix-multiplication chain on a possibly compressed block.
 *
 * <p>Uncompressed blocks are delegated to the superclass, and a block that
 * consists of a single uncompressed column group is delegated to that
 * group's underlying data. Otherwise the chain is computed directly on the
 * column groups: a right multiplication by {@code v} into an
 * {@code rlen x 1} intermediate, an optional in-place multiply with
 * {@code w} (only for {@code ChainType.XtwXv}), and a transposed left
 * multiplication into the {@code clen x 1} result.
 *
 * @param v     vector passed to the right multiplication
 * @param w     operand of the in-place multiply; only read for {@code ChainType.XtwXv}
 * @param out   optional result block; reset and reused if non-null, otherwise allocated
 * @param ctype chain type
 * @param k     degree of parallelism handed to the multiplication routines
 * @return the result block ({@code out} if it was provided)
 */
@Override
public MatrixBlock chainMatrixMultOperations(MatrixBlock v, MatrixBlock w, MatrixBlock out, ChainType ctype, int k) {
    // call uncompressed matrix mult if necessary
    if (!isCompressed()) {
        return super.chainMatrixMultOperations(v, w, out, ctype, k);
    }
    // multi-threaded mmchain of single uncompressed colgroup
    if (isSingleUncompressedGroup()) {
        return ((ColGroupUncompressed) _colGroups.get(0)).getData().chainMatrixMultOperations(v, w, out, ctype, k);
    }
    // the timer only exists if debug logging was enabled at this point
    Timing time = LOG.isDebugEnabled() ? new Timing(true) : null;
    // prepare result
    if (out != null) {
        out.reset(clen, 1, false);
    } else {
        out = new MatrixBlock(clen, 1, false);
    }
    // empty block handling
    if (isEmptyBlock(false)) {
        return out;
    }
    // compute matrix mult
    MatrixBlock tmp = new MatrixBlock(rlen, 1, false);
    rightMultByVector(v, tmp, k);
    if (ctype == ChainType.XtwXv) {
        BinaryOperator bop = new BinaryOperator(Multiply.getMultiplyFnObject());
        LibMatrixBincell.bincellOpInPlace(tmp, w, bop);
    }
    leftMultByVectorTranspose(_colGroups, tmp, out, true, k);
    // guard on the timer itself: the log level may have been switched to
    // debug after the timer decision above, in which case time is null
    if (time != null && LOG.isDebugEnabled()) {
        LOG.debug("Compressed MMChain k=" + k + " in " + time.stop());
    }
    return out;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) AggregateBinaryOperator(org.apache.sysml.runtime.matrix.operators.AggregateBinaryOperator) BinaryOperator(org.apache.sysml.runtime.matrix.operators.BinaryOperator)

Example 57 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.

Method decompress of the class CompressedMatrixBlock.

/**
 * Produces an uncompressed version of this block.
 *
 * <p>A block that is not compressed is returned as a new
 * {@code MatrixBlock} constructed from this block. Otherwise a result
 * block is allocated and every column group is decompressed into it. For
 * sparse results the rows are allocated up front from per-row non-zero
 * counts, and the sparse rows are sorted once all groups have been written.
 *
 * @return a new uncompressed matrix block containing the contents of this
 *         block
 */
public MatrixBlock decompress() {
    // not compressed: nothing to decompress, return a new block built from this one
    if (!isCompressed()) {
        return new MatrixBlock(this);
    }

    Timing timer = new Timing(true);
    MatrixBlock result = new MatrixBlock(getNumRows(), getNumColumns(), isInSparseFormat(), getNonZeros());

    // sparse target: size each row once to avoid repeated reallocations
    if (result.isInSparseFormat()) {
        int[] nnzPerRow = new int[rlen];
        for (ColGroup group : _colGroups) {
            group.countNonZerosPerRow(nnzPerRow, 0, rlen);
        }
        result.allocateSparseRowsBlock();
        SparseBlock sparseRows = result.getSparseBlock();
        for (int row = 0; row < rlen; row++) {
            sparseRows.allocate(row, nnzPerRow[row]);
        }
    }

    // core decompression (append if sparse)
    for (ColGroup group : _colGroups) {
        group.decompressToBlock(result, 0, rlen);
    }

    // post-processing: restore the nnz count and order appended sparse rows
    result.setNonZeros(nonZeros);
    if (result.isInSparseFormat()) {
        result.sortSparseRows();
    }

    if (LOG.isDebugEnabled()) {
        LOG.debug("decompressed block in " + timer.stop() + "ms.");
    }
    return result;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) SparseBlock(org.apache.sysml.runtime.matrix.data.SparseBlock)

Example 58 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.

Method executeRemoteMRParFor of the class ParForProgramBlock.

/**
 * Executes the parfor body as one remote MR job.
 *
 * <p>Phases, in order:
 * <ol>
 *   <li>optionally check MR instructions and recompile to CP,</li>
 *   <li>serialize the child program blocks and result variables,</li>
 *   <li>create the tasks and write them to the task file,</li>
 *   <li>export matrices to HDFS, submit the MR job and wait for it,</li>
 *   <li>consolidate the workers' results into the execution context.</li>
 * </ol>
 * When {@code _monitor} is set, a timing statistic is recorded after each phase.
 *
 * @param ec      execution context
 * @param itervar iteration variable (not read in this method)
 * @param from    loop lower bound
 * @param to      loop upper bound
 * @param incr    loop increment
 * @throws IOException declared for the I/O-performing calls in the body
 */
private void executeRemoteMRParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws IOException {
    Timing timer = _monitor ? new Timing(true) : null;

    // Phase 0: check and compile to CP (if forced remote parfor)
    boolean forcedRecompile = false;
    boolean constrainedRemoteMR = _optMode == POptMode.CONSTRAINED && _execMode == PExecMode.REMOTE_MR;
    if (FORCE_CP_ON_REMOTE_MR && (_optMode == POptMode.NONE || constrainedRemoteMR)) {
        // tid = 0 because replaced in remote parworker
        forcedRecompile = checkMRAndRecompileToCP(0);
    }

    // Phase 1: init parallel workers (serialize PBs).
    // Each mapper changes filenames with regard to its ID as a single job is
    // submitted; the serialized string cannot be reused since variables are
    // serialized as well.
    ParForBody parForBody = new ParForBody(_childBlocks, _resultVars, ec);
    String serializedProgram = ProgramConverter.serializeParForBody(parForBody);
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, timer.stop());
    }

    // Phase 2: create tasks
    TaskPartitioner taskSplitter = createTaskPartitioner(from, to, incr);
    String taskFile = constructTaskFileName();
    String resultFile = constructResultFileName();
    long numIterations = taskSplitter.getNumIterations();
    // NOTE(review): Math.log10 yields NaN for a negative and -Infinity for a
    // zero argument, so this is only a digit count when 'to' is positive --
    // confirm how writeTasksToFile uses it for non-positive upper bounds.
    long upperBound = to.getLongValue();
    int maxDigits = 1 + (int) Math.log10(upperBound);
    long createdTasks;
    if (!USE_STREAMING_TASK_CREATION) {
        // sequentially create tasks and write to disk
        List<Task> taskList = taskSplitter.createTasks();
        createdTasks = taskList.size();
        taskFile = writeTasksToFile(taskFile, taskList, maxDigits);
    } else {
        // put tasks into queue and start writing to taskFile
        LocalTaskQueue<Task> taskQueue = new LocalTaskQueue<>();
        createdTasks = taskSplitter.createTasks(taskQueue);
        taskFile = writeTasksToFile(taskFile, taskQueue, maxDigits);
    }
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, timer.stop());
    }

    // write matrices to HDFS
    exportMatricesToHDFS(ec);

    // Phase 3: submit MR job (wait for finished work)
    MatrixObject colocatedDPMatrixObj = null;
    if (_colocatedDPMatrix != null) {
        colocatedDPMatrixObj = ec.getMatrixObject(_colocatedDPMatrix);
    }
    RemoteParForJobReturn jobResult = RemoteParForMR.runJob(
        _ID, serializedProgram, taskFile, resultFile, colocatedDPMatrixObj,
        _enableCPCaching, _numThreads, WRITE_REPLICATION_FACTOR, MAX_RETRYS_ON_ERROR,
        getMinMemory(ec), (ALLOW_REUSE_MR_JVMS && _jvmReuse));
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, timer.stop());
    }

    // Phase 4: collect results from each parallel worker
    int executedTasks = jobResult.getNumExecutedTasks();
    int executedIterations = jobResult.getNumExecutedIterations();
    // consolidate results into global symbol table
    consolidateAndCheckResults(ec, numIterations, createdTasks, executedIterations, executedTasks, jobResult.getVariables());

    // undo the forced recompile of phase 0
    if (forcedRecompile) {
        releaseForcedRecompile(0);
    }

    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, timer.stop());
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, executedTasks);
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, executedIterations);
    }
}
Also used : ParForBody(org.apache.sysml.runtime.controlprogram.parfor.ParForBody) Task(org.apache.sysml.runtime.controlprogram.parfor.Task) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) RemoteParForJobReturn(org.apache.sysml.runtime.controlprogram.parfor.RemoteParForJobReturn) LocalTaskQueue(org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) TaskPartitioner(org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner)

Example 59 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.

Method executeRemoteMRParForDP of the class ParForProgramBlock.

/**
 * Executes the parfor body as one remote MR job with fused data partitioning
 * of the colocated input matrix.
 *
 * <p>Phases, in order:
 * <ol>
 *   <li>check MR instructions and recompile to CP,</li>
 *   <li>mark the colocated input matrix as partitioned (must precede
 *       program serialization),</li>
 *   <li>serialize the child program blocks and result variables,</li>
 *   <li>create the task partitioner and derive iteration/task counts,</li>
 *   <li>export matrices to HDFS, submit the MR job and wait for it,</li>
 *   <li>consolidate results and unset the partitioned flag again.</li>
 * </ol>
 * When {@code _monitor} is set, a timing statistic is recorded after the
 * serialization, task-creation, job and consolidation phases.
 *
 * @param ec      execution context
 * @param itervar iteration variable (not read in this method)
 * @param from    loop lower bound
 * @param to      loop upper bound
 * @param incr    loop increment
 * @throws IOException declared for the I/O-performing calls in the body
 */
private void executeRemoteMRParForDP(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws IOException {
    Timing timer = _monitor ? new Timing(true) : null;

    // Phase 0: check and compile to CP (if forced remote parfor)
    boolean forcedRecompile = checkMRAndRecompileToCP(0);

    // Phase 1: prepare partitioned input matrix (needs to happen before serializing the program)
    ParForStatementBlock stmtBlock = (ParForStatementBlock) getStatementBlock();
    MatrixObject inputMatrix = ec.getMatrixObject(_colocatedDPMatrix);
    PartitionFormat inputDPF = stmtBlock.determineDataPartitionFormat(_colocatedDPMatrix);
    // mark matrix var as partitioned
    inputMatrix.setPartitioned(inputDPF._dpf, inputDPF._N);

    // Phase 2: init parallel workers (serialize PBs).
    // Each mapper changes filenames with regard to its ID as a single job is
    // submitted; the serialized string cannot be reused since variables are
    // serialized as well.
    ParForBody parForBody = new ParForBody(_childBlocks, _resultVars, ec);
    String serializedProgram = ProgramConverter.serializeParForBody(parForBody);
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, timer.stop());
    }

    // Phase 3: create tasks
    TaskPartitioner taskSplitter = createTaskPartitioner(from, to, incr);
    String resultFile = constructResultFileName();
    long numIterations = taskSplitter.getNumIterations();
    // the number of created tasks is taken to be the number of iterations
    long createdTasks = numIterations;
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, timer.stop());
    }

    // write matrices to HDFS
    exportMatricesToHDFS(ec);

    // Phase 4: submit MR job (wait for finished work).
    // Binary cell is chosen for sufficiently sparse column-wise or row-wise
    // partitioned inputs, binary block otherwise.
    // NOTE(review): the format is compared by reference -- confirm that
    // determineDataPartitionFormat returns the shared constants.
    OutputInfo inputOI;
    if (inputMatrix.getSparsity() < 0.1 && inputDPF == PartitionFormat.COLUMN_WISE) {
        inputOI = OutputInfo.BinaryCellOutputInfo;
    } else if (inputMatrix.getSparsity() < 0.001 && inputDPF == PartitionFormat.ROW_WISE) {
        inputOI = OutputInfo.BinaryCellOutputInfo;
    } else {
        inputOI = OutputInfo.BinaryBlockOutputInfo;
    }
    RemoteParForJobReturn jobResult = RemoteDPParForMR.runJob(
        _ID, _iterPredVar, _colocatedDPMatrix, serializedProgram, resultFile,
        inputMatrix, inputDPF, inputOI, _tSparseCol, _enableCPCaching, _numThreads, _replicationDP);
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, timer.stop());
    }

    // Phase 5: collect results from each parallel worker
    int executedTasks = jobResult.getNumExecutedTasks();
    int executedIterations = jobResult.getNumExecutedIterations();
    // consolidate results into global symbol table
    consolidateAndCheckResults(ec, numIterations, createdTasks, executedIterations, executedTasks, jobResult.getVariables());

    // undo the forced recompile of phase 0
    if (forcedRecompile) {
        releaseForcedRecompile(0);
    }
    inputMatrix.unsetPartitioned();

    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, timer.stop());
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, executedTasks);
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, executedIterations);
    }
}
Also used : OutputInfo(org.apache.sysml.runtime.matrix.data.OutputInfo) ParForBody(org.apache.sysml.runtime.controlprogram.parfor.ParForBody) RemoteParForJobReturn(org.apache.sysml.runtime.controlprogram.parfor.RemoteParForJobReturn) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ParForStatementBlock(org.apache.sysml.parser.ParForStatementBlock) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) TaskPartitioner(org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner)

Example 60 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.

Method execute of the class ParForProgramBlock.

/**
 * Executes this parfor program block.
 *
 * <p>The loop bounds and increment are evaluated once. A zero increment is
 * rejected, and a non-positive iteration count returns immediately. If an
 * optimizer mode is set, the optimizer is invoked. Then data partitioning,
 * Spark repartitioning and eager caching are handled, the body is
 * dispatched according to {@code _execMode}, and finally variables, forced
 * exec types and optimizer flags are reset before the exit instructions run.
 *
 * @param ec execution context
 * @throws DMLRuntimeException if the increment evaluates to zero, the
 *         execution mode is undefined, or the parallel execution fails
 */
@Override
public void execute(ExecutionContext ec) {
    ParForStatementBlock sb = (ParForStatementBlock) getStatementBlock();

    // evaluate from, to, incr only once (assumption: known at for entry)
    IntObject from = executePredicateInstructions(1, _fromInstructions, ec);
    IntObject to = executePredicateInstructions(2, _toInstructions, ec);
    IntObject incr;
    if (_incrementInstructions == null || _incrementInstructions.isEmpty()) {
        // no explicit increment: step towards 'to'
        int defaultStep = (from.getLongValue() <= to.getLongValue()) ? 1 : -1;
        incr = new IntObject(defaultStep);
    } else {
        incr = executePredicateInstructions(3, _incrementInstructions, ec);
    }

    // a zero increment would produce an infinite loop
    if (incr.getLongValue() == 0) {
        throw new DMLRuntimeException(this.printBlockErrorLocation() + "Expression for increment " + "of variable '" + _iterPredVar + "' must evaluate to a non-zero value.");
    }

    // early exit on num iterations = zero (avoids unnecessary optimization/initialization)
    _numIterations = computeNumIterations(from, to, incr);
    if (_numIterations <= 0) {
        return;
    }

    // OPTIMIZATION of the parfor, if an optimizer mode is configured
    if (_optMode != POptMode.NONE) {
        // set optimizer log level
        OptimizationWrapper.setLogLevel(_optLogLevel);
        // core optimize
        OptimizationWrapper.optimize(_optMode, sb, this, ec, _monitor);
    }

    // DATA PARTITIONING of read-only parent variables of type (matrix,unpartitioned)
    Timing timer = _monitor ? new Timing(true) : null;
    // partitioning on demand (note: for fused data partitioning and execute the optimizer
    // sets the data partitioner to NONE in order to prevent any side effects)
    handleDataPartitioning(ec);
    // repartitioning of variables for spark cpmm/zipmm in order to prevent unnecessary shuffle
    handleSparkRepartitioning(ec);
    // eager rdd caching of variables for spark in order to prevent read/write contention
    handleSparkEagerCaching(ec);
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_DATA_T, timer.stop());
    }

    // initialize iteration variable to the from value
    IntObject iterVar = new IntObject(from.getLongValue());

    // begin PARALLEL EXECUTION of (PAR)FOR body
    LOG.trace("EXECUTE PARFOR ID = " + _ID + " with mode = " + _execMode + ", numThreads = " + _numThreads + ", taskpartitioner = " + _taskPartitioner);
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTHREADS, _numThreads);
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_TASKSIZE, _taskSize);
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_TASKPARTITIONER, _taskPartitioner.ordinal());
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_DATAPARTITIONER, _dataPartitioner.ordinal());
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_EXECMODE, _execMode.ordinal());
    }

    // preserve shared input/result variables from cleanup
    ArrayList<String> pinnedVars = ec.getVarList();
    boolean[] pinnedState = ec.pinVariables(pinnedVars);

    try {
        switch (_execMode) {
            case LOCAL:
                // create parworkers as local threads
                executeLocalParFor(ec, iterVar, from, to, incr);
                break;
            case REMOTE_MR:
                // create parworkers as MR tasks (one job per parfor)
                executeRemoteMRParFor(ec, iterVar, from, to, incr);
                break;
            case REMOTE_MR_DP:
                // create parworkers as MR tasks (one job per parfor)
                executeRemoteMRParForDP(ec, iterVar, from, to, incr);
                break;
            case REMOTE_SPARK:
                // create parworkers as Spark tasks (one job per parfor)
                executeRemoteSparkParFor(ec, iterVar, from, to, incr);
                break;
            case REMOTE_SPARK_DP:
                // create parworkers as Spark tasks (one job per parfor)
                executeRemoteSparkParForDP(ec, iterVar, from, to, incr);
                break;
            default:
                throw new DMLRuntimeException("Undefined execution mode: '" + _execMode + "'.");
        }
    } catch (Exception ex) {
        throw new DMLRuntimeException("PARFOR: Failed to execute loop in parallel.", ex);
    }

    // reset state of shared input/result variables
    ec.unpinVariables(pinnedVars, pinnedState);
    // cleanup unpinned shared variables
    cleanupSharedVariables(ec, pinnedState);

    // set the iteration variable to the 'to' value after the loop
    ec.setVariable(_iterPredVar, new IntObject(to.getLongValue()));

    // restore original matrices; this is safe because partitioning is only
    // applied to read-only matrices
    for (String dpName : _variablesDPOriginal.keySet()) {
        // cleanup partitioned matrix (if not reused)
        if (!_variablesDPReuse.keySet().contains(dpName)) {
            VariableCPInstruction.processRemoveVariableInstruction(ec, dpName);
        }
        // reset to original matrix
        ec.setVariable(dpName, (MatrixObject) _variablesDPOriginal.get(dpName));
    }

    // print profiling report (only if top-level parfor because otherwise in parallel context)
    if (_monitorReport) {
        LOG.info("\n" + StatisticMonitor.createReport());
    }

    // TODO reset of hop parallelism constraint (e.g., ba+*)
    // release forced exectypes
    for (String dpName : _variablesDPOriginal.keySet()) {
        ProgramRecompiler.rFindAndRecompileIndexingHOP(sb, this, dpName, ec, false);
    }
    // release forced exectypes for fused dp/exec
    if (_execMode == PExecMode.REMOTE_MR_DP || _execMode == PExecMode.REMOTE_SPARK_DP) {
        ProgramRecompiler.rFindAndRecompileIndexingHOP(sb, this, _colocatedDPMatrix, ec, false);
    }

    // after release, deletes dp_varnames
    resetOptimizerFlags();

    // execute exit instructions (usually empty)
    executeInstructions(_exitInstructions, ec);
}
Also used : IntObject(org.apache.sysml.runtime.instructions.cp.IntObject) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) ParForStatementBlock(org.apache.sysml.parser.ParForStatementBlock) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException)

Aggregations

Timing (org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)78 IOException (java.io.IOException)31 ArrayList (java.util.ArrayList)29 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)25 HashMap (java.util.HashMap)24 Connection (org.apache.sysml.api.jmlc.Connection)17 PreparedScript (org.apache.sysml.api.jmlc.PreparedScript)17 ResultVariables (org.apache.sysml.api.jmlc.ResultVariables)17 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)17 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)14 ParForStatementBlock (org.apache.sysml.parser.ParForStatementBlock)10 TaskPartitioner (org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner)10 ParForBody (org.apache.sysml.runtime.controlprogram.parfor.ParForBody)8 RemoteParForJobReturn (org.apache.sysml.runtime.controlprogram.parfor.RemoteParForJobReturn)8 LocalVariableMap (org.apache.sysml.runtime.controlprogram.LocalVariableMap)7 ProgramBlock (org.apache.sysml.runtime.controlprogram.ProgramBlock)7 ExecutorService (java.util.concurrent.ExecutorService)6 Future (java.util.concurrent.Future)6 LocalTaskQueue (org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue)6 Task (org.apache.sysml.runtime.controlprogram.parfor.Task)6