Search in sources :

Example 11 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.

the class ParForProgramBlock method consolidateAndCheckResults.

/**
 * Merges the per-worker results of all result variables into the given execution context and
 * afterwards verifies that the executed task/iteration counters match the expected ones.
 *
 * Only result variables that are bound to a {@code MatrixObject} in {@code ec} are merged;
 * other entries (e.g., scalars) are skipped.
 *
 * @param ec execution context holding the result variables; merged outputs are written back into it
 * @param expIters expected number of iterations
 * @param expTasks expected number of tasks
 * @param numIters number of iterations reported as executed
 * @param numTasks number of tasks reported as executed
 * @param results one variable map per worker, providing the partial results to merge
 * @throws DMLRuntimeException if the parallel merge fails or the counters do not match
 */
private void consolidateAndCheckResults(ExecutionContext ec, long expIters, long expTasks, long numIters, long numTasks, LocalVariableMap[] results) {
    Timing mergeTimer = new Timing(true);
    if (!checkParallelRemoteResultMerge()) {
        // sequential result merge, one result variable after the other
        for (ResultVar rvar : _resultVars) {
            Data current = ec.getVariable(rvar._name);
            // robustness: scalars and other non-matrix data are not merged
            if (!(current instanceof MatrixObject))
                continue;
            MatrixObject target = (MatrixObject) current;
            MatrixObject[] partials = new MatrixObject[results.length];
            for (int w = 0; w < results.length; w++) {
                partials[w] = (MatrixObject) results[w].get(rvar._name);
            }
            String mergeFile = constructResultMergeFileName();
            ResultMerge merger = createResultMerge(_resultMerge, target, partials, mergeFile, rvar._isAccum, ec);
            MatrixObject merged = USE_PARALLEL_RESULT_MERGE ? merger.executeParallelMerge(_numThreads) : merger.executeSerialMerge();
            // drop the old binding and clean it up unless it is the merged output itself
            Data previous = ec.removeVariable(rvar._name);
            if (previous != null && previous != merged && previous instanceof MatrixObject) {
                ec.cleanupCacheableData((MatrixObject) previous);
            }
            // intermediate worker results are no longer needed
            cleanWorkerResultVariables(ec, target, partials);
            // bind the merged result under the original name
            ec.setVariable(rvar._name, merged);
        }
    } else {
        // parallel result merge across result variables
        int degree = Math.min(_resultVars.size(), InfrastructureAnalyzer.getLocalParallelism());
        if (InfrastructureAnalyzer.isLocalMode()) {
            // cap the degree of parallelism by the memory-derived limit, but never below one
            int memDegree = (int) Math.floor(OptimizerUtils.getLocalMemBudget() / InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer());
            degree = Math.min(degree, Math.max(memDegree, 1));
        }
        try {
            // every matrix-valued result variable becomes one merge task
            LocalTaskQueue<ResultVar> mergeQueue = new LocalTaskQueue<>();
            for (ResultVar rvar : _resultVars) {
                if (ec.getVariable(rvar._name) instanceof MatrixObject) {
                    mergeQueue.enqueueTask(rvar);
                }
            }
            mergeQueue.closeInput();
            // create all workers first, then start them
            ResultMergeWorker[] mergeWorkers = new ResultMergeWorker[degree];
            for (int w = 0; w < degree; w++) {
                mergeWorkers[w] = new ResultMergeWorker(mergeQueue, results, ec);
            }
            for (ResultMergeWorker worker : mergeWorkers) {
                worker.start();
            }
            // wait for each worker in turn and fail on the first one that reports an error
            for (ResultMergeWorker worker : mergeWorkers) {
                worker.join();
                if (!worker.finishedNoError()) {
                    throw new DMLRuntimeException("Error occured in parallel result merge worker.");
                }
            }
        } catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
    }
    // handle unscoped variables (vars created in parfor, but potentially used afterwards);
    // the statement block might be null for nested parallelism
    ParForStatementBlock stmtBlock = (ParForStatementBlock) getStatementBlock();
    if (CREATE_UNSCOPED_RESULTVARS && stmtBlock != null && ec.getVariables() != null) {
        createEmptyUnscopedVariables(ec.getVariables(), stmtBlock);
    }
    // consistency check of the expected versus executed counters
    boolean countersMatch = (numTasks == expTasks) && (numIters == expIters);
    if (!countersMatch) {
        throw new DMLRuntimeException("PARFOR: Number of executed tasks does not match the number of created tasks: tasks " + numTasks + "/" + expTasks + ", iters " + numIters + "/" + expIters + ".");
    }
    if (DMLScript.STATISTICS) {
        Statistics.incrementParForMergeTime((long) mergeTimer.stop());
    }
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) Data(org.apache.sysml.runtime.instructions.cp.Data) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) LocalTaskQueue(org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue) ResultVar(org.apache.sysml.parser.ParForStatementBlock.ResultVar) ResultMerge(org.apache.sysml.runtime.controlprogram.parfor.ResultMerge) ParForStatementBlock(org.apache.sysml.parser.ParForStatementBlock) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)

Example 12 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.

the class ParForProgramBlock method executeLocalParFor.

/**
 * Executes the parfor locally, i.e., the parfor is realized with numThreads local threads that drive execution.
 * This execution mode allows for arbitrary nested local parallelism and nested invocations of MR jobs.
 *
 * The method proceeds in the following steps:
 * <ol>
 * <li>create the task queue, the workers and their threads, and start the threads
 *     (from then on they wait for tasks)</li>
 * <li>create the tasks, put them into the queue and mark the end of the task input stream</li>
 * <li>join all threads (wait for finished work)</li>
 * <li>collect the results of each worker and consolidate them into {@code ec}</li>
 * <li>clean up the workers (remove the function program blocks they report)</li>
 * </ol>
 *
 * @param ec execution context passed to the workers and used for result consolidation
 * @param itervar iteration variable (not referenced inside this method)
 * @param from lower bound handed to the task partitioner
 * @param to upper bound handed to the task partitioner
 * @param incr increment handed to the task partitioner
 * @throws InterruptedException if InterruptedException occurs
 */
private void executeLocalParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws InterruptedException {
    LOG.trace("Local Par For (multi-threaded) with degree of parallelism : " + _numThreads);
    Timing timer = new Timing(true);
    // counters are declared outside the try block because the finally block reports them
    int tasksDone = 0;
    int itersDone = 0;
    // restrict recompilation to thread local memory
    setMemoryBudget();
    // enable runtime piggybacking if required (default piggybacking worker)
    if (_enableRuntimePiggybacking) {
        RuntimePiggybacking.start(_numThreads);
    }
    try {
        // Step 1) create task queue and init workers in parallel
        // (including preparation of update-in-place variables)
        LocalTaskQueue<Task> taskQueue = new LocalTaskQueue<>();
        Thread[] workerThreads = new Thread[_numThreads];
        LocalParWorker[] parWorkers = new LocalParWorker[_numThreads];
        IntStream.range(0, _numThreads).parallel().forEach(idx -> {
            parWorkers[idx] = createParallelWorker(_pwIDs[idx], taskQueue, ec, idx);
            workerThreads[idx] = new Thread(parWorkers[idx]);
            workerThreads[idx].setPriority(Thread.MAX_PRIORITY);
        });
        // start threads (from now on waiting for tasks)
        for (int t = 0; t < workerThreads.length; t++) {
            workerThreads[t].start();
        }
        // maintain statistics
        long initTime = (long) timer.stop();
        if (DMLScript.STATISTICS) {
            Statistics.incrementParForInitTime(initTime);
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, initTime);
        }
        // Step 2) create tasks
        TaskPartitioner taskPartitioner = createTaskPartitioner(from, to, incr);
        long expectedIterations = taskPartitioner.getNumIterations();
        final long createdTasks;
        if (!USE_STREAMING_TASK_CREATION) {
            List<Task> allTasks = taskPartitioner.createTasks();
            createdTasks = allTasks.size();
            // put tasks into queue
            for (Task task : allTasks) {
                taskQueue.enqueueTask(task);
            }
            // mark end of task input stream
            taskQueue.closeInput();
        } else {
            // put tasks into queue (parworker start work on first tasks while creating tasks)
            createdTasks = taskPartitioner.createTasks(taskQueue);
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, timer.stop());
        }
        // Step 3) join all threads (wait for finished work)
        for (int t = 0; t < workerThreads.length; t++) {
            workerThreads[t].join();
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, timer.stop());
        }
        // Step 4) collecting results from each parallel worker
        // obtain results and cleanup other intermediates before result merge
        LocalVariableMap[] workerVars = new LocalVariableMap[_numThreads];
        for (int w = 0; w < _numThreads; w++) {
            LocalParWorker worker = parWorkers[w];
            LocalVariableMap vars = worker.getVariables();
            workerVars[w] = vars;
            // keep only the entries whose names belong to the result variables
            vars.removeAllNotIn(_resultVars.stream().map(v -> v._name).collect(Collectors.toSet()));
            tasksDone += worker.getExecutedTasks();
            itersDone += worker.getExecutedIterations();
        }
        // consolidate results into global symbol table
        consolidateAndCheckResults(ec, expectedIterations, createdTasks, itersDone, tasksDone, workerVars);
        // Step 5) cleanup local parworkers (e.g., remove created functions)
        for (int w = 0; w < _numThreads; w++) {
            Collection<String> functionKeys = parWorkers[w].getFunctionNames();
            if (functionKeys == null || functionKeys.isEmpty()) {
                continue;
            }
            for (String functionKey : functionKeys) {
                String[] keyParts = DMLProgram.splitFunctionKey(functionKey);
                _prog.removeFunctionProgramBlock(keyParts[0], keyParts[1]);
            }
        }
        // the main thread to use the GPUContext
        if (DMLScript.USE_ACCELERATOR) {
            ec.getGPUContext(0).initializeThread();
        }
    } finally {
        // remove thread-local memory budget (reset to original budget)
        // (in finally to prevent error side effects for multiple scripts in one jvm)
        resetMemoryBudget();
        // disable runtime piggybacking
        if (_enableRuntimePiggybacking) {
            RuntimePiggybacking.stop();
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, timer.stop());
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, tasksDone);
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, itersDone);
        }
    }
}
Also used : Task(org.apache.sysml.runtime.controlprogram.parfor.Task) LocalTaskQueue(org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue) LocalParWorker(org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) TaskPartitioner(org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner)

Example 13 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.

the class CompressedMatrixBlock method compress.

/**
 * Compress block.
 *
 * The compression runs in phases: (1) classify columns as compressible or
 * incompressible based on estimated sizes, (2) group the compressible columns,
 * (3) compress the column groups, (4) try to share a dictionary across DDC1
 * single-column groups, and (5) store remaining columns in one uncompressed group
 * and clean up. Phase timings are only recorded when debug logging is enabled.
 *
 * @param k  number of threads
 * @return compressed matrix block or original block if incompressible
 *         (this object on success; a shallow copy of this block if compression is aborted)
 * @throws DMLRuntimeException if the block is already compressed
 */
public MatrixBlock compress(int k) {
    // check for redundant compression
    if (isCompressed()) {
        throw new DMLRuntimeException("Redundant compression, block already compressed.");
    }
    Timing time = new Timing(true);
    _stats = new CompressionStatistics();
    // SAMPLE-BASED DECISIONS:
    // Decisions such as testing if a column is amenable to bitmap
    // compression or evaluating co-coding potentials are made based on a
    // subset of the rows. For large datasets, sampling might take a
    // significant amount of time. So, we generate only one sample and use
    // it for the entire compression process.
    // prepare basic meta data and deep copy / transpose input
    final int numRows = getNumRows();
    final int numCols = getNumColumns();
    final boolean sparse = isInSparseFormat();
    // work on a copy of this block, or on its transpose if TRANSPOSE_INPUT is set
    MatrixBlock rawblock = !TRANSPOSE_INPUT ? new MatrixBlock(this) : LibMatrixReorg.transpose(this, new MatrixBlock(numCols, numRows, sparse), k);
    // construct sample-based size estimator
    CompressedSizeEstimator bitmapSizeEstimator = SizeEstimatorFactory.getSizeEstimator(rawblock, numRows);
    // PHASE 1: Classify columns by compression type
    // We start by determining which columns are amenable to compression
    // colsC collects compressible column indexes, colsUC the incompressible ones
    List<Integer> colsC = new ArrayList<>();
    List<Integer> colsUC = new ArrayList<>();
    HashMap<Integer, Double> compRatios = new HashMap<>();
    // Classify columns according to ratio (size uncompressed / size compressed),
    // where a column is compressible if ratio > 1.
    CompressedSizeInfo[] sizeInfos = (k > 1) ? computeCompressedSizeInfos(bitmapSizeEstimator, numCols, k) : computeCompressedSizeInfos(bitmapSizeEstimator, numCols);
    // running sum of estimated non-zeros over all incompressible columns
    long nnzUC = 0;
    for (int col = 0; col < numCols; col++) {
        double uncompSize = getUncompressedSize(numRows, 1, OptimizerUtils.getSparsity(numRows, 1, sizeInfos[col].getEstNnz()));
        double compRatio = uncompSize / sizeInfos[col].getMinSize();
        if (compRatio > 1) {
            colsC.add(col);
            compRatios.put(col, compRatio);
        } else {
            colsUC.add(col);
            nnzUC += sizeInfos[col].getEstNnz();
        }
    }
    // correction of column classification (reevaluate dense estimates if necessary)
    boolean sparseUC = MatrixBlock.evalSparseFormatInMemory(numRows, colsUC.size(), nnzUC);
    if (!sparseUC && !colsUC.isEmpty()) {
        for (int i = 0; i < colsUC.size(); i++) {
            int col = colsUC.get(i);
            // re-estimate the uncompressed size with sparsity 1.0 (dense)
            double uncompSize = getUncompressedSize(numRows, 1, 1.0);
            double compRatio = uncompSize / sizeInfos[col].getMinSize();
            if (compRatio > 1) {
                colsC.add(col);
                // i is an int, so this removes by position; the index is stepped back
                // so the element that moved into position i is not skipped
                colsUC.remove(i);
                i--;
                compRatios.put(col, compRatio);
                nnzUC -= sizeInfos[col].getEstNnz();
            }
        }
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("C: " + Arrays.toString(colsC.toArray(new Integer[0])));
        LOG.trace("-- compression ratios: " + Arrays.toString(colsC.stream().map(c -> compRatios.get(c)).toArray()));
        LOG.trace("UC: " + Arrays.toString(colsUC.toArray(new Integer[0])));
        // compRatios is only filled for columns added to colsC, so lookups for colsUC yield null
        LOG.trace("-- compression ratios: " + Arrays.toString(colsUC.stream().map(c -> compRatios.get(c)).toArray()));
    }
    if (LOG.isDebugEnabled()) {
        _stats.timePhase1 = time.stop();
        LOG.debug("Compression statistics:");
        LOG.debug("--compression phase 1: " + _stats.timePhase1);
    }
    // abort early if no column is compressible
    if (colsC.isEmpty()) {
        if (LOG.isDebugEnabled())
            LOG.debug("Abort block compression because all columns are incompressible.");
        return new MatrixBlock().copyShallow(this);
    }
    // PHASE 2: Grouping columns
    // Divide the bitmap columns into column groups.
    List<int[]> bitmapColGrps = PlanningCoCoder.findCocodesByPartitioning(bitmapSizeEstimator, colsC, sizeInfos, numRows, k);
    if (LOG.isDebugEnabled()) {
        _stats.timePhase2 = time.stop();
        LOG.debug("--compression phase 2: " + _stats.timePhase2);
    }
    if (INVESTIGATE_ESTIMATES) {
        // total estimate: sum of estimated group sizes plus the estimated size of the incompressible columns
        double est = 0;
        for (int[] groupIndices : bitmapColGrps) est += bitmapSizeEstimator.estimateCompressedColGroupSize(groupIndices).getMinSize();
        est += MatrixBlock.estimateSizeInMemory(numRows, colsUC.size(), OptimizerUtils.getSparsity(numRows, colsUC.size(), nnzUC));
        _stats.estSize = est;
    }
    // PHASE 3: Compress and correct sample-based decisions
    ColGroup[] colGroups = (k > 1) ? compressColGroups(rawblock, bitmapSizeEstimator, compRatios, numRows, bitmapColGrps, colsUC.isEmpty(), k) : compressColGroups(rawblock, bitmapSizeEstimator, compRatios, numRows, bitmapColGrps, colsUC.isEmpty());
    allocateColGroupList();
    // start from all column indexes and drop those covered by a non-null column group
    HashSet<Integer> remainingCols = seq(0, numCols - 1, 1);
    for (int j = 0; j < colGroups.length; j++) {
        if (colGroups[j] != null) {
            for (int col : colGroups[j].getColIndices()) remainingCols.remove(col);
            _colGroups.add(colGroups[j]);
        }
    }
    if (LOG.isDebugEnabled()) {
        _stats.timePhase3 = time.stop();
        LOG.debug("--compression phase 3: " + _stats.timePhase3);
    }
    // PHASE 4: Best-effort dictionary sharing for DDC1 single-col groups
    double[] dict = createSharedDDC1Dictionary(_colGroups);
    if (dict != null) {
        applySharedDDC1Dictionary(_colGroups, dict);
        _sharedDDC1Dict = true;
    }
    if (LOG.isDebugEnabled()) {
        _stats.timePhase4 = time.stop();
        LOG.debug("--compression phase 4: " + _stats.timePhase4);
    }
    // The remaining columns are stored uncompressed as one big column group
    if (!remainingCols.isEmpty()) {
        ArrayList<Integer> list = new ArrayList<>(remainingCols);
        ColGroupUncompressed ucgroup = new ColGroupUncompressed(list, rawblock);
        _colGroups.add(ucgroup);
    }
    _stats.size = estimateCompressedSizeInMemory();
    _stats.ratio = estimateSizeInMemory() / _stats.size;
    if (_stats.ratio < 1) {
        // NOTE(review): _colGroups has already been populated at this point and is not
        // reset before returning the shallow copy — confirm this is intended.
        if (LOG.isDebugEnabled())
            LOG.debug("Abort block compression because compression ratio is less than 1.");
        return new MatrixBlock().copyShallow(this);
    }
    // final cleanup (discard uncompressed block)
    rawblock.cleanupBlock(true, true);
    this.cleanupBlock(true, true);
    if (LOG.isDebugEnabled()) {
        _stats.timePhase5 = time.stop();
        // counts is indexed as printed below: entries 0-4 are reported as group type counts
        // and entries 5-9 as group sizes
        int[] counts = getColGroupCounts(_colGroups);
        LOG.debug("--compression phase 5: " + _stats.timePhase5);
        LOG.debug("--num col groups: " + _colGroups.size());
        LOG.debug("--col groups types (OLE,RLE,DDC1,DDC2,UC): " + counts[2] + "," + counts[1] + "," + counts[3] + "," + counts[4] + "," + counts[0]);
        LOG.debug("--col groups sizes (OLE,RLE,DDC1,DDC2,UC): " + counts[7] + "," + counts[6] + "," + counts[8] + "," + counts[9] + "," + counts[5]);
        LOG.debug("--compressed size: " + _stats.size);
        LOG.debug("--compression ratio: " + _stats.ratio);
    }
    return this;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CompressedSizeInfo(org.apache.sysml.runtime.compress.estim.CompressedSizeInfo) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) CompressedSizeEstimator(org.apache.sysml.runtime.compress.estim.CompressedSizeEstimator) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)

Example 14 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.

the class CompressedMatrixBlock method aggregateUnaryOperations.

/**
 * Computes a unary aggregate over this block. If the block is not compressed, the call is
 * delegated to the superclass implementation; otherwise only sum, sum of squares, min and max
 * (KahanPlus, KahanPlusSq, Builtin MIN/MAX) are supported.
 *
 * @param op unary aggregate operator (aggregate function, index function, number of threads)
 * @param result output block to reuse, or null to allocate a new one
 * @param blockingFactorRow forwarded to the superclass implementation for uncompressed blocks
 * @param blockingFactorCol forwarded to the superclass implementation for uncompressed blocks
 * @param indexesIn forwarded to the superclass implementation for uncompressed blocks
 * @param inCP if true and the aggregate carries a correction, the correction rows/columns are dropped
 * @return the result block holding the aggregate
 * @throws DMLRuntimeException if the operation is unsupported, the correction location is
 *         unrecognized, or the multi-threaded aggregation fails
 */
@Override
public MatrixValue aggregateUnaryOperations(AggregateUnaryOperator op, MatrixValue result, int blockingFactorRow, int blockingFactorCol, MatrixIndexes indexesIn, boolean inCP) {
    // delegate to the uncompressed implementation if this block is not compressed
    if (!isCompressed()) {
        return super.aggregateUnaryOperations(op, result, blockingFactorRow, blockingFactorCol, indexesIn, inCP);
    }
    // check for supported operations
    if (!(op.aggOp.increOp.fn instanceof KahanPlus || op.aggOp.increOp.fn instanceof KahanPlusSq || (op.aggOp.increOp.fn instanceof Builtin && (((Builtin) op.aggOp.increOp.fn).getBuiltinCode() == BuiltinCode.MIN || ((Builtin) op.aggOp.increOp.fn).getBuiltinCode() == BuiltinCode.MAX)))) {
        throw new DMLRuntimeException("Unary aggregates other than sum/sumsq/min/max not supported yet.");
    }
    Timing time = LOG.isDebugEnabled() ? new Timing(true) : null;
    // prepare output dimensions
    CellIndex tempCellIndex = new CellIndex(-1, -1);
    op.indexFn.computeDimension(rlen, clen, tempCellIndex);
    if (op.aggOp.correctionExists) {
        // reserve additional rows/columns for the correction values
        switch(op.aggOp.correctionLocation) {
            case LASTROW:
                tempCellIndex.row++;
                break;
            case LASTCOLUMN:
                tempCellIndex.column++;
                break;
            case LASTTWOROWS:
                tempCellIndex.row += 2;
                break;
            case LASTTWOCOLUMNS:
                tempCellIndex.column += 2;
                break;
            default:
                throw new DMLRuntimeException("unrecognized correctionLocation: " + op.aggOp.correctionLocation);
        }
    }
    // initialize and allocate the result
    if (result == null)
        result = new MatrixBlock(tempCellIndex.row, tempCellIndex.column, false);
    else
        result.reset(tempCellIndex.row, tempCellIndex.column, false);
    MatrixBlock ret = (MatrixBlock) result;
    ret.allocateDenseBlock();
    // special handling init value for rowmins/rowmax
    if (op.indexFn instanceof ReduceCol && op.aggOp.increOp.fn instanceof Builtin) {
        double val = (((Builtin) op.aggOp.increOp.fn).getBuiltinCode() == BuiltinCode.MAX) ? Double.NEGATIVE_INFINITY : Double.POSITIVE_INFINITY;
        ret.getDenseBlock().set(val);
    }
    // core unary aggregate
    if (op.getNumThreads() > 1 && getExactSizeOnDisk() > MIN_PAR_AGG_THRESHOLD) {
        // multi-threaded execution of all groups
        ArrayList<ColGroup>[] grpParts = createStaticTaskPartitioning((op.indexFn instanceof ReduceCol) ? 1 : op.getNumThreads(), false);
        ColGroupUncompressed uc = getUncompressedColGroup();
        ExecutorService pool = null;
        try {
            // aggregate the uncompressed column group up front, outside the task pool
            // (the original comment states this is computed in parallel to avoid a
            // bottleneck — presumably inside the group implementation; TODO confirm)
            if (uc != null)
                uc.unaryAggregateOperations(op, ret);
            // compute all compressed column groups
            pool = CommonThreadPool.get(op.getNumThreads());
            ArrayList<UnaryAggregateTask> tasks = new ArrayList<>();
            if (op.indexFn instanceof ReduceCol && grpParts.length > 0) {
                // row-wise aggregates: one task per aligned row block over the single group partition
                int blklen = BitmapEncoder.getAlignedBlocksize((int) (Math.ceil((double) rlen / op.getNumThreads())));
                for (int i = 0; i < op.getNumThreads() && i * blklen < rlen; i++) tasks.add(new UnaryAggregateTask(grpParts[0], ret, i * blklen, Math.min((i + 1) * blklen, rlen), op));
            } else
                for (ArrayList<ColGroup> grp : grpParts) tasks.add(new UnaryAggregateTask(grp, ret, 0, rlen, op));
            List<Future<MatrixBlock>> rtasks = pool.invokeAll(tasks);
            // aggregate partial results
            if (op.indexFn instanceof ReduceAll) {
                if (op.aggOp.increOp.fn instanceof KahanFunction) {
                    KahanObject kbuff = new KahanObject(ret.quickGetValue(0, 0), 0);
                    for (Future<MatrixBlock> rtask : rtasks) {
                        double tmp = rtask.get().quickGetValue(0, 0);
                        ((KahanFunction) op.aggOp.increOp.fn).execute2(kbuff, tmp);
                    }
                    ret.quickSetValue(0, 0, kbuff._sum);
                } else {
                    double val = ret.quickGetValue(0, 0);
                    for (Future<MatrixBlock> rtask : rtasks) {
                        double tmp = rtask.get().quickGetValue(0, 0);
                        val = op.aggOp.increOp.fn.execute(val, tmp);
                    }
                    ret.quickSetValue(0, 0, val);
                }
            } else {
                // no partial results are combined here, but the futures are still inspected
                // so that a failed task surfaces as an error instead of being dropped silently
                for (Future<MatrixBlock> rtask : rtasks) rtask.get();
            }
        } catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        } finally {
            // release the pool on success and on failure alike
            if (pool != null)
                pool.shutdown();
        }
    } else {
        // process UC column group
        for (ColGroup grp : _colGroups) if (grp instanceof ColGroupUncompressed)
            grp.unaryAggregateOperations(op, ret);
        // process OLE/RLE column groups
        aggregateUnaryOperations(op, _colGroups, ret, 0, rlen);
    }
    // special handling zeros for rowmins/rowmax
    if (op.indexFn instanceof ReduceCol && op.aggOp.increOp.fn instanceof Builtin) {
        int[] rnnz = new int[rlen];
        for (ColGroup grp : _colGroups) grp.countNonZerosPerRow(rnnz, 0, rlen);
        Builtin builtin = (Builtin) op.aggOp.increOp.fn;
        // rows with fewer non-zeros than columns contain a zero, which must take part in min/max
        for (int i = 0; i < rlen; i++) if (rnnz[i] < clen)
            ret.quickSetValue(i, 0, builtin.execute2(ret.quickGetValue(i, 0), 0));
    }
    // drop correction if necessary
    if (op.aggOp.correctionExists && inCP)
        ret.dropLastRowsOrColumns(op.aggOp.correctionLocation);
    // post-processing
    ret.recomputeNonZeros();
    // time is null if debug logging was disabled when the timer would have been created
    if (time != null && LOG.isDebugEnabled())
        LOG.debug("Compressed uagg k=" + op.getNumThreads() + " in " + time.stop());
    return ret;
}
Also used : ReduceAll(org.apache.sysml.runtime.functionobjects.ReduceAll) MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) ArrayList(java.util.ArrayList) KahanFunction(org.apache.sysml.runtime.functionobjects.KahanFunction) KahanPlusSq(org.apache.sysml.runtime.functionobjects.KahanPlusSq) ReduceCol(org.apache.sysml.runtime.functionobjects.ReduceCol) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) ExecutorService(java.util.concurrent.ExecutorService) KahanObject(org.apache.sysml.runtime.instructions.cp.KahanObject) KahanPlus(org.apache.sysml.runtime.functionobjects.KahanPlus) Future(java.util.concurrent.Future) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) Builtin(org.apache.sysml.runtime.functionobjects.Builtin)

Example 15 with Timing

use of org.apache.sysml.runtime.controlprogram.parfor.stat.Timing in project incubator-systemml by apache.

the class CompressedMatrixBlock method transposeSelfMatrixMultOperations.

/**
 * Computes the transpose-self matrix multiplication of this block. Uncompressed blocks are
 * delegated to the superclass implementation, and a block consisting of a single uncompressed
 * column group is delegated to that group's data; otherwise the column groups are processed by
 * tasks on a thread pool and the upper triangle of the result is copied to the lower triangle.
 *
 * @param out output block to reuse, or null to allocate a new clen x clen block
 * @param tstype transpose type; only {@code MMTSJType.LEFT} is supported on the compressed path
 * @param k number of threads
 * @return the result block
 * @throws DMLRuntimeException if the transpose type is not LEFT or a task fails
 */
@Override
public MatrixBlock transposeSelfMatrixMultOperations(MatrixBlock out, MMTSJType tstype, int k) {
    // call uncompressed matrix mult if necessary
    if (!isCompressed()) {
        return super.transposeSelfMatrixMultOperations(out, tstype, k);
    }
    // multi-threaded tsmm of single uncompressed colgroup
    if (isSingleUncompressedGroup()) {
        return ((ColGroupUncompressed) _colGroups.get(0)).getData().transposeSelfMatrixMultOperations(out, tstype, k);
    }
    Timing time = LOG.isDebugEnabled() ? new Timing(true) : null;
    // check for transpose type (right not supported yet)
    if (tstype != MMTSJType.LEFT)
        throw new DMLRuntimeException("Invalid MMTSJ type '" + tstype.toString() + "'.");
    // create output matrix block
    if (out == null)
        out = new MatrixBlock(clen, clen, false);
    else
        out.reset(clen, clen, false);
    out.allocateDenseBlock();
    if (!isEmptyBlock(false)) {
        // compute matrix mult
        ExecutorService pool = null;
        try {
            pool = CommonThreadPool.get(k);
            ArrayList<MatrixMultTransposeTask> tasks = new ArrayList<>();
            int numgrp = _colGroups.size();
            // number of column groups per task, aiming at 2*k tasks
            int blklen = (int) (Math.ceil((double) numgrp / (2 * k)));
            // task ranges are index ranges over the column groups, so the loop is bounded by
            // numgrp; bounding by clen could create ranges whose start is at or past their
            // clamped end
            for (int i = 0; i < 2 * k && i * blklen < numgrp; i++) tasks.add(new MatrixMultTransposeTask(_colGroups, out, i * blklen, Math.min((i + 1) * blklen, numgrp)));
            List<Future<Object>> ret = pool.invokeAll(tasks);
            // check for errors
            for (Future<Object> tret : ret) tret.get();
        } catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        } finally {
            // release the pool even if a task failed
            if (pool != null)
                pool.shutdown();
        }
        // post-processing
        out.setNonZeros(LinearAlgebraUtils.copyUpperToLowerTriangle(out));
    }
    // time is null if debug logging was disabled when the timer would have been created
    if (time != null && LOG.isDebugEnabled())
        LOG.debug("Compressed TSMM k=" + k + " in " + time.stop());
    return out;
}
Also used : MatrixBlock(org.apache.sysml.runtime.matrix.data.MatrixBlock) ArrayList(java.util.ArrayList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) KahanObject(org.apache.sysml.runtime.instructions.cp.KahanObject) ScalarObject(org.apache.sysml.runtime.instructions.cp.ScalarObject) CM_COV_Object(org.apache.sysml.runtime.instructions.cp.CM_COV_Object) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)

Aggregations

Timing (org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)78 IOException (java.io.IOException)31 ArrayList (java.util.ArrayList)29 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)25 HashMap (java.util.HashMap)24 Connection (org.apache.sysml.api.jmlc.Connection)17 PreparedScript (org.apache.sysml.api.jmlc.PreparedScript)17 ResultVariables (org.apache.sysml.api.jmlc.ResultVariables)17 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)17 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)14 ParForStatementBlock (org.apache.sysml.parser.ParForStatementBlock)10 TaskPartitioner (org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner)10 ParForBody (org.apache.sysml.runtime.controlprogram.parfor.ParForBody)8 RemoteParForJobReturn (org.apache.sysml.runtime.controlprogram.parfor.RemoteParForJobReturn)8 LocalVariableMap (org.apache.sysml.runtime.controlprogram.LocalVariableMap)7 ProgramBlock (org.apache.sysml.runtime.controlprogram.ProgramBlock)7 ExecutorService (java.util.concurrent.ExecutorService)6 Future (java.util.concurrent.Future)6 LocalTaskQueue (org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue)6 Task (org.apache.sysml.runtime.controlprogram.parfor.Task)6