Search in sources :

Example 1 with LocalTaskQueue

use of org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue in project incubator-systemml by apache.

the class ParForProgramBlock method consolidateAndCheckResults.

/**
 * Merges the per-worker results of all result variables into the given execution
 * context and then verifies that the executed task/iteration counters match the
 * expected ones.
 *
 * Depending on checkParallelRemoteResultMerge(), the merge either runs through a
 * set of ResultMergeWorker threads fed from a shared task queue, or sequentially
 * in the calling thread. Only result variables that are MatrixObjects are merged.
 *
 * @param ec execution context holding the result variables
 * @param expIters expected number of iterations
 * @param expTasks expected number of tasks
 * @param numIters number of iterations actually executed
 * @param numTasks number of tasks actually executed
 * @param results variable maps of the individual workers
 * @throws DMLRuntimeException if the parallel merge fails or the counters do not match
 */
private void consolidateAndCheckResults(ExecutionContext ec, long expIters, long expTasks, long numIters, long numTasks, LocalVariableMap[] results) throws DMLRuntimeException {
    Timing mergeTimer = new Timing(true);

    if (checkParallelRemoteResultMerge()) {
        // parallel result merge: one queue entry per result variable
        int degree = Math.min(_resultVars.size(), InfrastructureAnalyzer.getLocalParallelism());
        if (InfrastructureAnalyzer.isLocalMode()) {
            // bound the degree of parallelism by the ratio of local memory budget
            // to remote sort buffer size, but never go below one worker
            int memBound = (int) Math.floor(OptimizerUtils.getLocalMemBudget() / InfrastructureAnalyzer.getRemoteMaxMemorySortBuffer());
            degree = Math.min(degree, Math.max(memBound, 1));
        }

        try {
            // fill the queue with all matrix result variables (scalars are skipped)
            LocalTaskQueue<String> mergeQueue = new LocalTaskQueue<String>();
            for (String var : _resultVars) {
                if (ec.getVariable(var) instanceof MatrixObject) {
                    mergeQueue.enqueueTask(var);
                }
            }
            mergeQueue.closeInput();

            // create all workers first, then start them, then wait for each one
            ResultMergeWorker[] mergers = new ResultMergeWorker[degree];
            for (int w = 0; w < degree; w++) {
                mergers[w] = new ResultMergeWorker(mergeQueue, results, ec);
            }
            for (ResultMergeWorker merger : mergers) {
                merger.start();
            }
            for (ResultMergeWorker merger : mergers) {
                merger.join();
                if (!merger.finishedNoError()) {
                    throw new DMLRuntimeException("Error occured in parallel result merge worker.");
                }
            }
        } catch (Exception ex) {
            throw new DMLRuntimeException(ex);
        }
    } else {
        // sequential result merge over all result variables
        for (String var : _resultVars) {
            Data current = ec.getVariable(var);
            if (!(current instanceof MatrixObject)) {
                // nothing to merge for non-matrix variables
                continue;
            }

            MatrixObject target = (MatrixObject) current;
            MatrixObject[] partials = new MatrixObject[results.length];
            for (int w = 0; w < partials.length; w++) {
                partials[w] = (MatrixObject) results[w].get(var);
            }

            String mergeFile = constructResultMergeFileName();
            ResultMerge merger = createResultMerge(_resultMerge, target, partials, mergeFile, ec);
            MatrixObject merged = USE_PARALLEL_RESULT_MERGE
                ? merger.executeParallelMerge(_numThreads)
                : merger.executeSerialMerge();

            // drop the previous binding; clean it up unless it is the merged output itself
            Data previous = ec.removeVariable(var);
            if (previous != null && previous != merged && previous instanceof MatrixObject) {
                ec.cleanupMatrixObject((MatrixObject) previous);
            }

            // remove the intermediate per-worker results
            cleanWorkerResultVariables(ec, target, partials);

            // bind the merged result under the original name
            ec.setVariable(var, merged);
        }
    }

    // variables created inside the parfor body may be referenced afterwards;
    // the statement block can be null for nested parallelism
    ParForStatementBlock stmtBlock = (ParForStatementBlock) getStatementBlock();
    if (CREATE_UNSCOPED_RESULTVARS && stmtBlock != null && ec.getVariables() != null) {
        createEmptyUnscopedVariables(ec.getVariables(), stmtBlock);
    }

    // consistency check of executed vs. expected counters
    boolean countersMatch = (numTasks == expTasks) && (numIters == expIters);
    if (!countersMatch) {
        throw new DMLRuntimeException("PARFOR: Number of executed tasks does not match the number of created tasks: tasks " + numTasks + "/" + expTasks + ", iters " + numIters + "/" + expIters + ".");
    }

    if (DMLScript.STATISTICS) {
        Statistics.incrementParForMergeTime((long) mergeTimer.stop());
    }
}
Also used : MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) Data(org.apache.sysml.runtime.instructions.cp.Data) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) CacheException(org.apache.sysml.runtime.controlprogram.caching.CacheException) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) LocalTaskQueue(org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue) ResultMerge(org.apache.sysml.runtime.controlprogram.parfor.ResultMerge) ParForStatementBlock(org.apache.sysml.parser.ParForStatementBlock) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)

Example 2 with LocalTaskQueue

use of org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue in project incubator-systemml by apache.

the class ParForProgramBlock method executeLocalParFor.

/**
 * Executes the parfor locally, i.e., the parfor is realized with numThreads local threads that drive execution.
 * This execution mode allows for arbitrary nested local parallelism and nested invocations of MR jobs.
 *
 * The method proceeds in the following steps:
 * 1) create the parallel workers, the shared task queue and the threads, and start the threads,
 * 2) create the tasks, enqueue them and close the queue input,
 * 3) join all threads,
 * 4) collect the per-worker results and consolidate them into the given execution context,
 * 5) remove functions created by the workers and, if the accelerator is enabled,
 *    return the worker GPU contexts to the pool.
 *
 * @param ec execution context that receives the consolidated results
 * @param itervar iteration variable (not referenced in this method body)
 * @param from lower bound handed to the task partitioner
 * @param to upper bound handed to the task partitioner
 * @param incr increment handed to the task partitioner
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 * @throws InterruptedException if InterruptedException occurs
 */
private void executeLocalParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws DMLRuntimeException, InterruptedException {
    LOG.trace("Local Par For (multi-threaded) with degree of parallelism : " + _numThreads);

    Timing time = new Timing(true);
    int numExecutedTasks = 0;
    int numExecutedIterations = 0;

    // restrict recompilation to thread local memory
    setMemoryBudget();

    // enable runtime piggybacking if required (default piggybacking worker)
    if (_enableRuntimePiggybacking) {
        RuntimePiggybacking.start(_numThreads);
    }

    try {
        // Step 1) init parallel workers, task queue and threads
        LocalTaskQueue<Task> taskQueue = new LocalTaskQueue<Task>();
        LocalParWorker[] workers = new LocalParWorker[_numThreads];
        Thread[] workerThreads = new Thread[_numThreads];
        for (int k = 0; k < _numThreads; k++) {
            // parallel workers are created as (lazy) deep copies,
            // including preparation of update-in-place variables
            workers[k] = createParallelWorker(_pwIDs[k], taskQueue, ec);
            Thread t = new Thread(workers[k]);
            t.setPriority(Thread.MAX_PRIORITY);
            workerThreads[k] = t;
        }

        // start threads (from now on waiting for tasks)
        for (int k = 0; k < workerThreads.length; k++) {
            workerThreads[k].start();
        }

        // maintain statistics
        long tinit = (long) time.stop();
        if (DMLScript.STATISTICS) {
            Statistics.incrementParForInitTime(tinit);
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, tinit);
        }

        // Step 2) create tasks
        TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
        long numIterations = partitioner.getNumIterations();
        long numCreatedTasks;
        if (USE_STREAMING_TASK_CREATION) {
            // tasks go directly into the queue (workers start on first tasks while tasks are created)
            numCreatedTasks = partitioner.createTasks(taskQueue);
        } else {
            // materialize all tasks, enqueue them, then mark end of task input stream
            List<Task> allTasks = partitioner.createTasks();
            numCreatedTasks = allTasks.size();
            for (Task task : allTasks) {
                taskQueue.enqueueTask(task);
            }
            taskQueue.closeInput();
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
        }

        // Step 3) join all threads (wait for finished work)
        for (int k = 0; k < workerThreads.length; k++) {
            workerThreads[k].join();
        }
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
        }

        // Step 4) collect results and counters from each parallel worker
        LocalVariableMap[] workerVars = new LocalVariableMap[_numThreads];
        for (int k = 0; k < _numThreads; k++) {
            LocalParWorker worker = workers[k];
            workerVars[k] = worker.getVariables();
            numExecutedTasks += worker.getExecutedTasks();
            numExecutedIterations += worker.getExecutedIterations();
        }

        // consolidate results into global symbol table
        consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, workerVars);

        // Step 5) cleanup local parworkers (e.g., remove created functions)
        for (int k = 0; k < _numThreads; k++) {
            Collection<String> createdFns = workers[k].getFunctionNames();
            if (createdFns == null || createdFns.isEmpty()) {
                continue;
            }
            for (String fnKey : createdFns) {
                String[] keyParts = DMLProgram.splitFunctionKey(fnKey);
                _prog.removeFunctionProgramBlock(keyParts[0], keyParts[1]);
            }
        }

        // hand the worker GPU contexts back to the pool and let
        // the main thread use a GPUContext again
        if (DMLScript.USE_ACCELERATOR) {
            for (int k = 0; k < _numThreads; k++) {
                GPUContextPool.returnToPool(workers[k].getExecutionContext().getGPUContext());
            }
            ec.setGPUContext(GPUContextPool.getFromPool());
            ec.getGPUContext().initializeThread();
        }
    } finally {
        // remove thread-local memory budget (reset to original budget);
        // done in finally to prevent error side effects for multiple scripts in one jvm
        resetMemoryBudget();

        // disable runtime piggybacking
        if (_enableRuntimePiggybacking) {
            RuntimePiggybacking.stop();
        }

        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
        }
    }
}
Also used : Task(org.apache.sysml.runtime.controlprogram.parfor.Task) LocalTaskQueue(org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue) LocalParWorker(org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker) GPUContext(org.apache.sysml.runtime.instructions.gpu.context.GPUContext) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) TaskPartitioner(org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner)

Example 3 with LocalTaskQueue

use of org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue in project incubator-systemml by apache.

the class ParForProgramBlock method executeRemoteMRParFor.

/**
 * Executes the parfor as a remote MR job: the parfor body is serialized, the tasks
 * are created and written to a task file, the matrices are exported to HDFS, and a
 * single MR job is submitted. After the job returns, its results are consolidated
 * into the given execution context.
 *
 * @param ec execution context used for serialization, export and result consolidation
 * @param itervar iteration variable (not referenced in this method body)
 * @param from lower bound handed to the task partitioner
 * @param to upper bound handed to the task partitioner; also determines the digit count passed to writeTasksToFile
 * @param incr increment handed to the task partitioner
 * @throws DMLRuntimeException if DMLRuntimeException occurs
 * @throws IOException if IOException occurs
 */
private void executeRemoteMRParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws DMLRuntimeException, IOException {
    /* Step 0) check and recompile MR inst
		 * Step 1) serialize child PB and inst
		 * Step 2) create tasks
		 *         serialize tasks
		 * Step 3) submit MR Jobs and wait for results                        
		 * Step 4) collect results from each parallel worker
		 */
    // the timer only exists when monitoring is on; every time.stop() below is guarded by _monitor
    Timing time = (_monitor ? new Timing(true) : null);
    // Step 0) check and compile to CP (if forced remote parfor)
    boolean flagForced = false;
    if (FORCE_CP_ON_REMOTE_MR && (_optMode == POptMode.NONE || (_optMode == POptMode.CONSTRAINED && _execMode == PExecMode.REMOTE_MR))) {
        //tid = 0  because replaced in remote parworker
        flagForced = checkMRAndRecompileToCP(0);
    }
    // Step 1) init parallel workers (serialize PBs)
    // NOTES: each mapper changes filenames with regard to his ID as we submit a single job,
    //        cannot reuse serialized string, since variables are serialized as well.
    ParForBody body = new ParForBody(_childBlocks, _resultVars, ec);
    String program = ProgramConverter.serializeParForBody(body);
    if (_monitor)
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, time.stop());
    // Step 2) create tasks 
    TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
    String taskFile = constructTaskFileName();
    String resultFile = constructResultFileName();
    long numIterations = partitioner.getNumIterations();
    // number of decimal digits of the upper bound, handed to writeTasksToFile
    // (presumably the padding width of serialized task values - confirm against writeTasksToFile).
    // log10 is only meaningful for positive values: log10(0) is -Infinity, which the int cast
    // saturates to Integer.MIN_VALUE, so non-positive bounds fall back to a single digit
    // (the value the plain cast already produced for negative bounds via NaN -> 0).
    long upperBound = to.getLongValue();
    int maxDigits = (upperBound > 0) ? (int) Math.log10(upperBound) + 1 : 1;
    long numCreatedTasks = -1;
    if (USE_STREAMING_TASK_CREATION) {
        LocalTaskQueue<Task> queue = new LocalTaskQueue<Task>();
        //put tasks into queue and start writing to taskFile
        numCreatedTasks = partitioner.createTasks(queue);
        taskFile = writeTasksToFile(taskFile, queue, maxDigits);
    } else {
        //sequentially create tasks and write to disk
        List<Task> tasks = partitioner.createTasks();
        numCreatedTasks = tasks.size();
        taskFile = writeTasksToFile(taskFile, tasks, maxDigits);
    }
    if (_monitor)
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
    //write matrices to HDFS 
    exportMatricesToHDFS(ec);
    // Step 3) submit MR job (wait for finished work)
    MatrixObject colocatedDPMatrixObj = (_colocatedDPMatrix != null) ? ec.getMatrixObject(_colocatedDPMatrix) : null;
    RemoteParForJobReturn ret = RemoteParForMR.runJob(_ID, program, taskFile, resultFile, colocatedDPMatrixObj, _enableCPCaching, _numThreads, WRITE_REPLICATION_FACTOR, MAX_RETRYS_ON_ERROR, getMinMemory(ec), (ALLOW_REUSE_MR_JVMS & _jvmReuse));
    if (_monitor)
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
    // Step 4) collecting results from each parallel worker
    int numExecutedTasks = ret.getNumExecutedTasks();
    int numExecutedIterations = ret.getNumExecutedIterations();
    //consolidate results into global symbol table
    consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, ret.getVariables());
    // undo the forced recompile from step 0, if it happened
    if (flagForced)
        releaseForcedRecompile(0);
    if (_monitor) {
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
        StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
    }
}
Also used : ParForBody(org.apache.sysml.runtime.controlprogram.parfor.ParForBody) Task(org.apache.sysml.runtime.controlprogram.parfor.Task) MatrixObject(org.apache.sysml.runtime.controlprogram.caching.MatrixObject) RemoteParForJobReturn(org.apache.sysml.runtime.controlprogram.parfor.RemoteParForJobReturn) LocalTaskQueue(org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) TaskPartitioner(org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner)

Aggregations

LocalTaskQueue (org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue)3 Timing (org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)3 MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject)2 Task (org.apache.sysml.runtime.controlprogram.parfor.Task)2 TaskPartitioner (org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner)2 IOException (java.io.IOException)1 ParForStatementBlock (org.apache.sysml.parser.ParForStatementBlock)1 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)1 CacheException (org.apache.sysml.runtime.controlprogram.caching.CacheException)1 LocalParWorker (org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker)1 ParForBody (org.apache.sysml.runtime.controlprogram.parfor.ParForBody)1 RemoteParForJobReturn (org.apache.sysml.runtime.controlprogram.parfor.RemoteParForJobReturn)1 ResultMerge (org.apache.sysml.runtime.controlprogram.parfor.ResultMerge)1 Data (org.apache.sysml.runtime.instructions.cp.Data)1 GPUContext (org.apache.sysml.runtime.instructions.gpu.context.GPUContext)1