Search in sources :

Example 1 with LocalParWorker

use of org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker in project incubator-systemml by apache.

the class ParForProgramBlock method createParallelWorker.

/**
 * Creates a new or partially recycled instance of a parallel worker. To this end, the symbol table and child
 * program blocks are deep copied. Note that the entries of the symbol table are not deep copied because they are
 * replaced anyway on the next write. In the recycling case, the deep copies of the program blocks are reused from
 * previous executions of this parfor.
 *
 * @param pwID parworker id
 * @param queue task queue
 * @param ec execution context
 * @param index the index of the worker
 * @return local parworker
 */
private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec, int index) {
    LocalParWorker pw = null;
    try {
        // create deep copies of the required elements (child program blocks)
        ArrayList<ProgramBlock> cpChildBlocks = null;
        HashSet<String> fnNames = new HashSet<>();
        if (USE_PB_CACHE) {
            if (_pbcache.containsKey(pwID)) {
                cpChildBlocks = _pbcache.get(pwID);
            } else {
                cpChildBlocks = ProgramConverter.rcreateDeepCopyProgramBlocks(_childBlocks, pwID, _IDPrefix, new HashSet<String>(), fnNames, false, false);
                _pbcache.put(pwID, cpChildBlocks);
            }
        } else {
            cpChildBlocks = ProgramConverter.rcreateDeepCopyProgramBlocks(_childBlocks, pwID, _IDPrefix, new HashSet<String>(), fnNames, false, false);
        }
        // deep copy execution context (including prepare parfor update-in-place)
        ExecutionContext cpEc = ProgramConverter.createDeepCopyExecutionContext(ec);
        // set this worker's GPU context (taken from the parent execution context) in the deep copy
        if (DMLScript.USE_ACCELERATOR) {
            cpEc.setGPUContexts(Arrays.asList(ec.getGPUContext(index)));
        }
        // prepare basic update-in-place variables (vars dropped on result merge)
        prepareUpdateInPlaceVariables(cpEc, pwID);
        // copy compiler configuration (for jmlc w/o global config)
        CompilerConfig cconf = ConfigurationManager.getCompilerConfig();
        // create the actual parallel worker
        ParForBody body = new ParForBody(cpChildBlocks, _resultVars, cpEc);
        pw = new LocalParWorker(pwID, queue, body, cconf, MAX_RETRYS_ON_ERROR, _monitor);
        pw.setFunctionNames(fnNames);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
    return pw;
}
Also used : ParForBody(org.apache.sysml.runtime.controlprogram.parfor.ParForBody) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) ExecutionContext(org.apache.sysml.runtime.controlprogram.context.ExecutionContext) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) LocalParWorker(org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker) CompilerConfig(org.apache.sysml.conf.CompilerConfig) IOException(java.io.IOException) HashSet(java.util.HashSet)
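The USE_PB_CACHE branch above is essentially a per-worker memoization of the expensive deep copy. A minimal, self-contained sketch of the same idea follows; DeepCopyCache, getOrCreate, and deepCopy are hypothetical names used for illustration, not SystemML API:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.function.Supplier;

// Hypothetical sketch: recycle expensive deep copies per worker id,
// mirroring the USE_PB_CACHE branch of createParallelWorker above.
class DeepCopyCache<T> {

    private final Map<Long, ArrayList<T>> cache = new HashMap<>();

    ArrayList<T> getOrCreate(long workerId, Supplier<ArrayList<T>> deepCopy) {
        // create the deep copy only on the first request per worker id,
        // otherwise hand back the previously created (recycled) copy
        return cache.computeIfAbsent(workerId, id -> deepCopy.get());
    }
}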

Example 2 with LocalParWorker

use of org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker in project incubator-systemml by apache.

the class ParForProgramBlock method executeLocalParFor.

/**
 * Executes the parfor locally, i.e., the parfor is realized with numThreads local threads that drive execution.
 * This execution mode allows for arbitrary nested local parallelism and nested invocations of MR jobs. See
 * below for details of the realization.
 *
 * @param ec execution context
 * @param itervar iteration variable
 * @param from loop lower bound (from value)
 * @param to loop upper bound (to value)
 * @param incr loop increment
 * @throws InterruptedException if the main thread is interrupted while waiting for the worker threads
 */
private void executeLocalParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws InterruptedException {
    LOG.trace("Local Par For (multi-threaded) with degree of parallelism : " + _numThreads);
    /* Step 1) init parallel workers, task queue and threads
     *         start threads (from now on waiting for tasks)
     * Step 2) create tasks
     *         put tasks into queue
     *         mark end of task input stream
     * Step 3) join all threads (wait for finished work)
     * Step 4) collect results from each parallel worker
     */
    Timing time = new Timing(true);
    int numExecutedTasks = 0;
    int numExecutedIterations = 0;
    // restrict recompilation to thread local memory
    setMemoryBudget();
    // enable runtime piggybacking if required
    if (_enableRuntimePiggybacking)
        RuntimePiggybacking.start(_numThreads); // default piggybacking worker
    try {
        // Step 1) create task queue and init workers in parallel
        // (including preparation of update-in-place variables)
        LocalTaskQueue<Task> queue = new LocalTaskQueue<>();
        Thread[] threads = new Thread[_numThreads];
        LocalParWorker[] workers = new LocalParWorker[_numThreads];
        IntStream.range(0, _numThreads).parallel().forEach(i -> {
            workers[i] = createParallelWorker(_pwIDs[i], queue, ec, i);
            threads[i] = new Thread(workers[i]);
            threads[i].setPriority(Thread.MAX_PRIORITY);
        });
        // start threads (from now on waiting for tasks)
        for (Thread thread : threads) thread.start();
        // maintain statistics
        long tinit = (long) time.stop();
        if (DMLScript.STATISTICS)
            Statistics.incrementParForInitTime(tinit);
        if (_monitor)
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, tinit);
        // Step 2) create tasks
        TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
        long numIterations = partitioner.getNumIterations();
        long numCreatedTasks = -1;
        if (USE_STREAMING_TASK_CREATION) {
            // put tasks into queue (parworker start work on first tasks while creating tasks)
            numCreatedTasks = partitioner.createTasks(queue);
        } else {
            List<Task> tasks = partitioner.createTasks();
            numCreatedTasks = tasks.size();
            // put tasks into queue
            for (Task t : tasks) queue.enqueueTask(t);
            // mark end of task input stream
            queue.closeInput();
        }
        if (_monitor)
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
        // Step 3) join all threads (wait for finished work)
        for (Thread thread : threads) thread.join();
        if (_monitor)
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
        // Step 4) collect results from each parallel worker
        // obtain results and cleanup other intermediates before result merge
        LocalVariableMap[] localVariables = new LocalVariableMap[_numThreads];
        for (int i = 0; i < _numThreads; i++) {
            localVariables[i] = workers[i].getVariables();
            localVariables[i].removeAllNotIn(_resultVars.stream().map(v -> v._name).collect(Collectors.toSet()));
            numExecutedTasks += workers[i].getExecutedTasks();
            numExecutedIterations += workers[i].getExecutedIterations();
        }
        // consolidate results into global symbol table
        consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, localVariables);
        // Step 5) cleanup local parworkers (e.g., remove created functions)
        for (int i = 0; i < _numThreads; i++) {
            Collection<String> fnNames = workers[i].getFunctionNames();
            if (fnNames != null && !fnNames.isEmpty())
                for (String fn : fnNames) {
                    String[] parts = DMLProgram.splitFunctionKey(fn);
                    _prog.removeFunctionProgramBlock(parts[0], parts[1]);
                }
        }
        // re-initialize the first GPU context so that the main thread can use it again
        if (DMLScript.USE_ACCELERATOR) {
            ec.getGPUContext(0).initializeThread();
        }
    } finally {
        // remove thread-local memory budget (reset to original budget)
        // (in finally to prevent error side effects for multiple scripts in one jvm)
        resetMemoryBudget();
        // disable runtime piggybacking
        if (_enableRuntimePiggybacking)
            RuntimePiggybacking.stop();
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
        }
    }
}
Also used : Task(org.apache.sysml.runtime.controlprogram.parfor.Task) LocalTaskQueue(org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue) LocalParWorker(org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) TaskPartitioner(org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner)
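The Step 1-4 lifecycle above (start workers, stream tasks into a queue, mark the end of the input, join, collect) is a classic producer/consumer pattern. Below is a minimal, runnable sketch of the same idea using a standard BlockingQueue and a poison-pill end marker; it illustrates the pattern only and is not the SystemML LocalTaskQueue API (ParWorkerSketch and END_OF_TASKS are made-up names):

import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

// Hypothetical sketch of the parfor worker lifecycle: a shared task queue,
// worker threads that block until tasks arrive, and a poison pill that marks
// the end of the task input stream (analogous to queue.closeInput() above).
public class ParWorkerSketch {

    private static final String END_OF_TASKS = "<END>"; // poison pill

    public static void main(String[] args) throws InterruptedException {
        BlockingQueue<String> queue = new ArrayBlockingQueue<>(16);
        int numThreads = 4;
        Thread[] workers = new Thread[numThreads];

        // Step 1) start workers (from now on waiting for tasks)
        for (int i = 0; i < numThreads; i++) {
            workers[i] = new Thread(() -> {
                try {
                    String task;
                    while (!(task = queue.take()).equals(END_OF_TASKS))
                        System.out.println(Thread.currentThread().getName() + " executes " + task);
                    queue.put(END_OF_TASKS); // re-insert pill so the remaining workers terminate too
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });
            workers[i].start();
        }

        // Step 2) create tasks and put them into the queue
        for (int t = 0; t < 20; t++)
            queue.put("task-" + t);
        queue.put(END_OF_TASKS); // mark end of task input stream

        // Step 3) join all threads (wait for finished work)
        for (Thread w : workers)
            w.join();

        // Step 4) results from each worker would be collected and merged here
    }
}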

Example 3 with LocalParWorker

use of org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker in project incubator-systemml by apache.

the class ParForProgramBlock method createParallelWorker.

/**
 * Creates a new or partially recycled instance of a parallel worker. To this end, the symbol table and child
 * program blocks are deep copied. Note that the entries of the symbol table are not deep copied because they are
 * replaced anyway on the next write. In the recycling case, the deep copies of the program blocks are reused from
 * previous executions of this parfor.
 *
 * @param pwID parworker id
 * @param queue task queue
 * @param ec execution context
 * @return local parworker
 * @throws DMLRuntimeException if the creation of the parallel worker fails
 */
private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec) throws DMLRuntimeException {
    LocalParWorker pw = null;
    try {
        //create deep copies of the required elements (child program blocks)
        ArrayList<ProgramBlock> cpChildBlocks = null;
        HashSet<String> fnNames = new HashSet<String>();
        if (USE_PB_CACHE) {
            if (_pbcache.containsKey(pwID)) {
                cpChildBlocks = _pbcache.get(pwID);
            } else {
                cpChildBlocks = ProgramConverter.rcreateDeepCopyProgramBlocks(_childBlocks, pwID, _IDPrefix, new HashSet<String>(), fnNames, false, false);
                _pbcache.put(pwID, cpChildBlocks);
            }
        } else {
            cpChildBlocks = ProgramConverter.rcreateDeepCopyProgramBlocks(_childBlocks, pwID, _IDPrefix, new HashSet<String>(), fnNames, false, false);
        }
        //deep copy execution context (including prepare parfor update-in-place)
        ExecutionContext cpEc = ProgramConverter.createDeepCopyExecutionContext(ec);
        // acquire a GPU context from the pool and set it in the copied execution context
        if (DMLScript.USE_ACCELERATOR) {
            cpEc.setGPUContext(GPUContextPool.getFromPool());
        }
        //prepare basic update-in-place variables (vars dropped on result merge)
        prepareUpdateInPlaceVariables(cpEc, pwID);
        //copy compiler configuration (for jmlc w/o global config)
        CompilerConfig cconf = ConfigurationManager.getCompilerConfig();
        //create the actual parallel worker
        ParForBody body = new ParForBody(cpChildBlocks, _resultVars, cpEc);
        pw = new LocalParWorker(pwID, queue, body, cconf, MAX_RETRYS_ON_ERROR, _monitor);
        pw.setFunctionNames(fnNames);
    } catch (Exception ex) {
        throw new DMLRuntimeException(ex);
    }
    return pw;
}
Also used : ParForBody(org.apache.sysml.runtime.controlprogram.parfor.ParForBody) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) ExecutionContext(org.apache.sysml.runtime.controlprogram.context.ExecutionContext) LocalParWorker(org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker) CompilerConfig(org.apache.sysml.conf.CompilerConfig) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) CacheException(org.apache.sysml.runtime.controlprogram.caching.CacheException) HashSet(java.util.HashSet)
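Note the difference to Examples 1 and 5: this older revision acquires a fresh GPU context from a shared pool for each worker (GPUContextPool.getFromPool()) and declares DMLRuntimeException in its signature, whereas the newer code assigns worker i the i-th GPU context of the parent execution context (ec.getGPUContext(index)) and wraps any failure in a RuntimeException.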

Example 4 with LocalParWorker

use of org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker in project systemml by apache.

the class ParForProgramBlock method executeLocalParFor.

/**
 * Executes the parfor locally, i.e., the parfor is realized with numThreads local threads that drive execution.
 * This execution mode allows for arbitrary nested local parallelism and nested invocations of MR jobs. See
 * below for details of the realization.
 *
 * @param ec execution context
 * @param itervar iteration variable
 * @param from loop lower bound (from value)
 * @param to loop upper bound (to value)
 * @param incr loop increment
 * @throws InterruptedException if the main thread is interrupted while waiting for the worker threads
 */
private void executeLocalParFor(ExecutionContext ec, IntObject itervar, IntObject from, IntObject to, IntObject incr) throws InterruptedException {
    LOG.trace("Local Par For (multi-threaded) with degree of parallelism : " + _numThreads);
    /* Step 1) init parallel workers, task queue and threads
     *         start threads (from now on waiting for tasks)
     * Step 2) create tasks
     *         put tasks into queue
     *         mark end of task input stream
     * Step 3) join all threads (wait for finished work)
     * Step 4) collect results from each parallel worker
     */
    Timing time = new Timing(true);
    int numExecutedTasks = 0;
    int numExecutedIterations = 0;
    // restrict recompilation to thread local memory
    setMemoryBudget();
    // enable runtime piggybacking if required
    if (_enableRuntimePiggybacking)
        RuntimePiggybacking.start(_numThreads); // default piggybacking worker
    try {
        // Step 1) create task queue and init workers in parallel
        // (including preparation of update-in-place variables)
        LocalTaskQueue<Task> queue = new LocalTaskQueue<>();
        Thread[] threads = new Thread[_numThreads];
        LocalParWorker[] workers = new LocalParWorker[_numThreads];
        IntStream.range(0, _numThreads).parallel().forEach(i -> {
            workers[i] = createParallelWorker(_pwIDs[i], queue, ec, i);
            threads[i] = new Thread(workers[i]);
            threads[i].setPriority(Thread.MAX_PRIORITY);
        });
        // start threads (from now on waiting for tasks)
        for (Thread thread : threads) thread.start();
        // maintain statistics
        long tinit = (long) time.stop();
        if (DMLScript.STATISTICS)
            Statistics.incrementParForInitTime(tinit);
        if (_monitor)
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_PARWRK_T, tinit);
        // Step 2) create tasks
        TaskPartitioner partitioner = createTaskPartitioner(from, to, incr);
        long numIterations = partitioner.getNumIterations();
        long numCreatedTasks = -1;
        if (USE_STREAMING_TASK_CREATION) {
            // put tasks into queue (parworker start work on first tasks while creating tasks)
            numCreatedTasks = partitioner.createTasks(queue);
        } else {
            List<Task> tasks = partitioner.createTasks();
            numCreatedTasks = tasks.size();
            // put tasks into queue
            for (Task t : tasks) queue.enqueueTask(t);
            // mark end of task input stream
            queue.closeInput();
        }
        if (_monitor)
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_INIT_TASKS_T, time.stop());
        // Step 3) join all threads (wait for finished work)
        for (Thread thread : threads) thread.join();
        if (_monitor)
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_EXEC_T, time.stop());
        // Step 4) collect results from each parallel worker
        // obtain results and cleanup other intermediates before result merge
        LocalVariableMap[] localVariables = new LocalVariableMap[_numThreads];
        for (int i = 0; i < _numThreads; i++) {
            localVariables[i] = workers[i].getVariables();
            localVariables[i].removeAllNotIn(_resultVars.stream().map(v -> v._name).collect(Collectors.toSet()));
            numExecutedTasks += workers[i].getExecutedTasks();
            numExecutedIterations += workers[i].getExecutedIterations();
        }
        // consolidate results into global symbol table
        consolidateAndCheckResults(ec, numIterations, numCreatedTasks, numExecutedIterations, numExecutedTasks, localVariables);
        // Step 5) cleanup local parworkers (e.g., remove created functions)
        for (int i = 0; i < _numThreads; i++) {
            Collection<String> fnNames = workers[i].getFunctionNames();
            if (fnNames != null && !fnNames.isEmpty())
                for (String fn : fnNames) {
                    String[] parts = DMLProgram.splitFunctionKey(fn);
                    _prog.removeFunctionProgramBlock(parts[0], parts[1]);
                }
        }
        // re-initialize the first GPU context so that the main thread can use it again
        if (DMLScript.USE_ACCELERATOR) {
            ec.getGPUContext(0).initializeThread();
        }
    } finally {
        // remove thread-local memory budget (reset to original budget)
        // (in finally to prevent error side effects for multiple scripts in one jvm)
        resetMemoryBudget();
        // disable runtime piggybacking
        if (_enableRuntimePiggybacking)
            RuntimePiggybacking.stop();
        if (_monitor) {
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_WAIT_RESULTS_T, time.stop());
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMTASKS, numExecutedTasks);
            StatisticMonitor.putPFStat(_ID, Stat.PARFOR_NUMITERS, numExecutedIterations);
        }
    }
}
Also used : Task(org.apache.sysml.runtime.controlprogram.parfor.Task) LocalTaskQueue(org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue) LocalParWorker(org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker) Timing(org.apache.sysml.runtime.controlprogram.parfor.stat.Timing) TaskPartitioner(org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner)

Example 5 with LocalParWorker

use of org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker in project systemml by apache.

the class ParForProgramBlock method createParallelWorker.

/**
 * Creates a new or partially recycled instance of a parallel worker. To this end, the symbol table and child
 * program blocks are deep copied. Note that the entries of the symbol table are not deep copied because they are
 * replaced anyway on the next write. In the recycling case, the deep copies of the program blocks are reused from
 * previous executions of this parfor.
 *
 * @param pwID parworker id
 * @param queue task queue
 * @param ec execution context
 * @param index the index of the worker
 * @return local parworker
 */
private LocalParWorker createParallelWorker(long pwID, LocalTaskQueue<Task> queue, ExecutionContext ec, int index) {
    LocalParWorker pw = null;
    try {
        // create deep copies of the required elements (child program blocks)
        ArrayList<ProgramBlock> cpChildBlocks = null;
        HashSet<String> fnNames = new HashSet<>();
        if (USE_PB_CACHE) {
            if (_pbcache.containsKey(pwID)) {
                cpChildBlocks = _pbcache.get(pwID);
            } else {
                cpChildBlocks = ProgramConverter.rcreateDeepCopyProgramBlocks(_childBlocks, pwID, _IDPrefix, new HashSet<String>(), fnNames, false, false);
                _pbcache.put(pwID, cpChildBlocks);
            }
        } else {
            cpChildBlocks = ProgramConverter.rcreateDeepCopyProgramBlocks(_childBlocks, pwID, _IDPrefix, new HashSet<String>(), fnNames, false, false);
        }
        // deep copy execution context (including prepare parfor update-in-place)
        ExecutionContext cpEc = ProgramConverter.createDeepCopyExecutionContext(ec);
        // set this worker's GPU context (taken from the parent execution context) in the deep copy
        if (DMLScript.USE_ACCELERATOR) {
            cpEc.setGPUContexts(Arrays.asList(ec.getGPUContext(index)));
        }
        // prepare basic update-in-place variables (vars dropped on result merge)
        prepareUpdateInPlaceVariables(cpEc, pwID);
        // copy compiler configuration (for jmlc w/o global config)
        CompilerConfig cconf = ConfigurationManager.getCompilerConfig();
        // create the actual parallel worker
        ParForBody body = new ParForBody(cpChildBlocks, _resultVars, cpEc);
        pw = new LocalParWorker(pwID, queue, body, cconf, MAX_RETRYS_ON_ERROR, _monitor);
        pw.setFunctionNames(fnNames);
    } catch (Exception ex) {
        throw new RuntimeException(ex);
    }
    return pw;
}
Also used : ParForBody(org.apache.sysml.runtime.controlprogram.parfor.ParForBody) SparkExecutionContext(org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext) ExecutionContext(org.apache.sysml.runtime.controlprogram.context.ExecutionContext) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) LocalParWorker(org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker) CompilerConfig(org.apache.sysml.conf.CompilerConfig) IOException(java.io.IOException) HashSet(java.util.HashSet)

Aggregations

LocalParWorker (org.apache.sysml.runtime.controlprogram.parfor.LocalParWorker)5 IOException (java.io.IOException)3 HashSet (java.util.HashSet)3 CompilerConfig (org.apache.sysml.conf.CompilerConfig)3 DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException)3 ExecutionContext (org.apache.sysml.runtime.controlprogram.context.ExecutionContext)3 SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext)3 ParForBody (org.apache.sysml.runtime.controlprogram.parfor.ParForBody)3 LocalTaskQueue (org.apache.sysml.runtime.controlprogram.parfor.LocalTaskQueue)2 Task (org.apache.sysml.runtime.controlprogram.parfor.Task)2 TaskPartitioner (org.apache.sysml.runtime.controlprogram.parfor.TaskPartitioner)2 Timing (org.apache.sysml.runtime.controlprogram.parfor.stat.Timing)2 CacheException (org.apache.sysml.runtime.controlprogram.caching.CacheException)1