use of org.apache.sysml.runtime.controlprogram.ParForProgramBlock in project incubator-systemml by apache.
the class OptimizerRuleBased method rewriteSetResultMerge.
// /////
// REWRITE set result merge
// /
/**
 * Rewrite 'set result merge': selects the result merge strategy for the
 * parfor node {@code n}, stores it in the mapped {@link ParForProgramBlock}
 * as well as in the plan node (parameter {@code RESULT_MERGE}), and then
 * applies the rewrite to the child nodes via {@code rInvokeSetResultMerge}.
 *
 * @param n       plan node of the parfor whose result merge is decided
 * @param vars    local variable map handed to the result inspection helpers
 * @param inLocal flag forwarded to the recursive invocation; it is cleared
 *                for the children if this parfor is executed remotely
 */
protected void rewriteSetResultMerge(OptNode n, LocalVariableMap vars, boolean inLocal) {
	ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter.getAbstractPlanMapping().getMappedProg(n.getID())[1];

	// the remote merge variant follows the configured execution mode
	final PResultMerge remoteMerge;
	if (OptimizerUtils.isSparkExecutionMode())
		remoteMerge = PResultMerge.REMOTE_SPARK;
	else
		remoteMerge = PResultMerge.REMOTE_MR;

	// collect the properties of the current parfor node
	final boolean isRemoteParFor = (n.getExecType() == getRemoteExecType());
	final boolean hasLargeResults = hasLargeTotalResults(n, pfpb.getResultVariables(), vars, true);
	final boolean hasRemoteLix = hasResultMRLeftIndexing(n, pfpb.getResultVariables(), vars, true);
	final boolean cellFormatWoCompare = determineFlagCellFormatWoCompare(pfpb.getResultVariables(), vars);
	final boolean onlyInMemResults = hasOnlyInMemoryResults(n, pfpb.getResultVariables(), vars, true);

	final PResultMerge mergeType;
	if (isRemoteParFor && hasLargeResults) {
		// remote parfor with large results: merge remotely to avoid
		// huge transfer/merge costs
		mergeType = remoteMerge;
	} else if (onlyInMemResults) {
		// all results are in memory: merge locally in memory
		mergeType = PResultMerge.LOCAL_MEM;
	} else if ((isRemoteParFor || hasRemoteLix)
			&& (!cellFormatWoCompare || !ResultMergeLocalFile.ALLOW_COPY_CELLFILES)) {
		// remote merge: the benefit for large matrices outweighs potentially
		// unnecessary remote jobs for smaller matrices
		mergeType = remoteMerge;
	} else {
		// otherwise merge locally (in-memory vs. file-based is decided later)
		mergeType = PResultMerge.LOCAL_AUTOMATIC;
	}

	// update the runtime program and the plan
	pfpb.setResultMerge(mergeType);
	n.addParam(ParamType.RESULT_MERGE, mergeType.toString());

	// descend into nested parfor nodes
	if (n.getChilds() != null) {
		rInvokeSetResultMerge(n.getChilds(), vars, inLocal && !isRemoteParFor);
	}

	_numEvaluatedPlans++;
	LOG.debug(getOptMode() + " OPT: rewrite 'set result merge' - result=" + mergeType);
}
use of org.apache.sysml.runtime.controlprogram.ParForProgramBlock in project incubator-systemml by apache.
the class OptimizerRuleBased method rewriteSetExportReplicationFactor.
// /////
// REWRITE set export replication factor
// /
/**
 * Rewrite 'set export replication factor'.
 *
 * A higher export replication factor helps remote execution because every
 * task reads the full input data set. This only applies to matrices that
 * are created as in-memory objects before parfor execution.
 *
 * The factor is only set if the node's exec type equals the remote exec
 * type; it is the minimum of {@code _N}, {@code _rnk} and
 * {@code MAX_REPLICATION_FACTOR_EXPORT}.
 *
 * NOTE: this rewrite requires 'set execution strategy' to be executed.
 *
 * @param n internal representation of a plan alternative for program blocks and instructions
 * @param vars local variable map
 */
protected void rewriteSetExportReplicationFactor(OptNode n, LocalVariableMap vars) {
	ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter.getAbstractPlanMapping().getMappedProg(n.getID())[1];

	final boolean isRemote = (n.getExecType() == getRemoteExecType());
	int factor = -1;

	if (isRemote) {
		// bounded by problem and cluster constraints
		factor = (int) Math.min(_N, _rnk);
		// bounded by the internal maximum (note hadoop will warn if max > 10)
		factor = (int) Math.min(factor, MAX_REPLICATION_FACTOR_EXPORT);
		// update the runtime plan
		pfpb.setExportReplicationFactor(factor);
	}

	_numEvaluatedPlans++;
	LOG.debug(getOptMode() + " OPT: rewrite 'set export replication factor' - result=" + isRemote + (isRemote ? " (" + factor + ")" : ""));
}
use of org.apache.sysml.runtime.controlprogram.ParForProgramBlock in project incubator-systemml by apache.
the class OptimizerRuleBased method rewriteSetResultPartitioning.
// /////
// REWRITE set result partitioning
// /
/**
 * Rewrite 'set result partitioning': checks whether result partitioning is
 * applicable for the parfor node {@code n} and, if so, recompiles all
 * candidate nodes via {@code recompileLIX}.
 *
 * The rewrite applies if {@code M} is below {@code _rm}, there is at least
 * one node with the remote exec type, and {@code isResultPartitionableAll}
 * accepts all these candidates.
 *
 * @param n    plan node of the parfor
 * @param M    memory estimate compared against {@code _rm}
 * @param vars local variable map
 * @return true if the rewrite was applied
 * @throws DMLRuntimeException if recompiling a candidate fails
 */
protected boolean rewriteSetResultPartitioning(OptNode n, double M, LocalVariableMap vars) {
	ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter.getAbstractPlanMapping().getMappedProg(n.getID())[1];

	// candidates are all nodes with the remote exec type
	Collection<OptNode> remoteNodes = n.getNodeList(getRemoteExecType());

	// applicable only if ops fit in the remote memory budget, there is at
	// least one remote node, and all candidates are result partitionable
	boolean applicable = false;
	if (M < _rm && !remoteNodes.isEmpty()) {
		applicable = isResultPartitionableAll(remoteNodes, pfpb.getResultVariables(), vars, pfpb.getIterVar());
	}

	if (applicable) {
		try {
			for (OptNode candidate : remoteNodes) {
				recompileLIX(candidate, vars);
			}
		} catch (Exception ex) {
			throw new DMLRuntimeException("Unable to recompile LIX.", ex);
		}
	}

	_numEvaluatedPlans++;
	LOG.debug(getOptMode() + " OPT: rewrite 'set result partitioning' - result=" + applicable);
	return applicable;
}
use of org.apache.sysml.runtime.controlprogram.ParForProgramBlock in project incubator-systemml by apache.
the class OptimizerRuleBased method rewriteSetDegreeOfParallelism.
// /////
// REWRITE set degree of parallelism
// /
/**
 * Rewrite 'set degree of parallelism': sets the degree of parallelism on
 * the mapped {@link ParForProgramBlock} and on the plan node {@code n},
 * and hands the remaining parallelism to {@code rAssignRemainingParallelism}.
 *
 * For exec type CP the degree is bounded by the local parallelism limit,
 * the local memory budget divided by {@code M}, and {@code _N}; with the
 * accelerator enabled it may be replaced by the GPU device count. For all
 * other exec types the degree is {@code _rnk} (nested) or the minimum of
 * {@code _N} and {@code _rk} (not nested).
 *
 * @param n          plan node of the parfor
 * @param M          memory estimate used to bound the parallelism
 * @param flagNested true if the remote parfor is handled as nested
 */
protected void rewriteSetDegreeOfParallelism(OptNode n, double M, boolean flagNested) {
	final ExecType execType = n.getExecType();
	final long nodeId = n.getID();
	ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter.getAbstractPlanMapping().getMappedProg(nodeId)[1];

	// distinct handling per exec model (CP vs. MR/Spark, incl. nested)
	if (execType == ExecType.CP) {
		// local max parallelism constraint
		int localMaxK = 1;
		if (ConfigurationManager.isParallelParFor()) {
			localMaxK = n.isCPOnly() ? _lkmaxCP : _lkmaxMR;
		}

		// local memory constraint (more conservative for spark in order to
		// prevent unnecessary guarded collect)
		double memBudget = _lm;
		if (OptimizerUtils.isSparkExecutionMode() && !n.isCPOnly()) {
			memBudget = _lm / 2;
		}
		localMaxK = Math.min(localMaxK, (int) Math.floor(memBudget / M));
		if (localMaxK < 1) {
			localMaxK = 1;
		}

		// parfor parallelism is additionally bounded by the problem size
		int parforK = (int) ((_N < localMaxK) ? _N : localMaxK);

		// FIXME rework for nested parfor parallelism and body w/o gpu ops
		if (DMLScript.USE_ACCELERATOR) {
			long perGPUBudget = GPUContextPool.initialGPUMemBudget();
			double maxMemUsage = getMaxCPOnlyBudget(n);
			if (maxMemUsage < perGPUBudget) {
				parforK = Math.min(GPUContextPool.getDeviceCount(), (int) _N);
				LOG.debug("Setting degree of parallelism + [" + parforK + "] for GPU; per GPU budget :[" + perGPUBudget + "], parfor budget :[" + maxMemUsage + "], max parallelism per GPU : [" + parforK + "]");
			}
		}

		// apply the parfor degree of parallelism
		pfpb.setDegreeOfParallelism(parforK);
		n.setK(parforK);

		// hand the remaining parallelism to the children
		int leftoverParforK = getRemainingParallelismParFor(localMaxK, parforK);
		int leftoverOpsK = getRemainingParallelismOps(_lkmaxCP, parforK);
		rAssignRemainingParallelism(n, leftoverParforK, leftoverOpsK);
	} else {
		// ExecType.MR/ExecType.SPARK: remote max parallelism constraint
		final int remoteK;
		if (flagNested) {
			// guaranteed <= _N (see nested)
			remoteK = _rnk;
		} else {
			// not nested (default)
			remoteK = (int) ((_N < _rk) ? _N : _rk);
		}
		pfpb.setDegreeOfParallelism(remoteK);
		n.setK(remoteK);

		// parallelism per node (CP only inside)
		int nodeMaxK = _rkmax / remoteK;

		// remote memory constraint; guaranteed >= 1 (see exec strategy)
		nodeMaxK = Math.min(nodeMaxK, (int) Math.floor(_rm / M));
		nodeMaxK = Math.max(nodeMaxK, 1);

		// disable nested parallelism, if required
		if (!ALLOW_REMOTE_NESTED_PARALLELISM) {
			nodeMaxK = 1;
		}

		// distribute remaining parallelism and recompile parallel instructions
		rAssignRemainingParallelism(n, nodeMaxK, 1);
	}

	_numEvaluatedPlans++;
	LOG.debug(getOptMode() + " OPT: rewrite 'set degree of parallelism' - result=(see EXPLAIN)");
}
use of org.apache.sysml.runtime.controlprogram.ParForProgramBlock in project incubator-systemml by apache.
the class ProgramConverter method createDeepCopyParForProgramBlock.
/**
 * Creates a copy of the given parfor program block. The copy receives a
 * copied statement block, the thread id {@code pid}, and deep copies of the
 * from/to/increment/exit instruction sets; optimization and monitor
 * reporting are disabled on the copy.
 *
 * @param pfpb          parfor program block to copy
 * @param pid           thread id set on the copy and passed to the copy helpers
 * @param IDPrefix      -1 if still on the master node; otherwise the prefix
 *                      passed to the program block constructor
 * @param prog          program the copy belongs to
 * @param fnStack       passed through to the copy helpers
 * @param fnCreated     passed through to the copy helpers
 * @param plain         if true, the child blocks are deep copied
 * @param forceDeepCopy if true, the child blocks are deep copied
 * @return the copied parfor program block
 */
public static ParForProgramBlock createDeepCopyParForProgramBlock(ParForProgramBlock pfpb, long pid, int IDPrefix, Program prog, HashSet<String> fnStack, HashSet<String> fnCreated, boolean plain, boolean forceDeepCopy) {
	// IDPrefix == -1: still on master node; otherwise: child of a remote
	// ParWorker at any level
	ParForProgramBlock copy = (IDPrefix == -1)
		? new ParForProgramBlock(prog, pfpb.getIterVar(), pfpb.getParForParams(), pfpb.getResultVariables())
		: new ParForProgramBlock(IDPrefix, prog, pfpb.getIterVar(), pfpb.getParForParams(), pfpb.getResultVariables());

	copy.setStatementBlock(createForStatementBlockCopy((ForStatementBlock) pfpb.getStatementBlock(), pid, plain, forceDeepCopy));
	copy.setThreadID(pid);

	// optimization and monitor report are already done in the top-level parfor
	copy.disableOptimization();
	copy.disableMonitorReport();

	// predicate and exit instructions are always deep copied
	copy.setFromInstructions(createDeepCopyInstructionSet(pfpb.getFromInstructions(), pid, IDPrefix, prog, fnStack, fnCreated, plain, true));
	copy.setToInstructions(createDeepCopyInstructionSet(pfpb.getToInstructions(), pid, IDPrefix, prog, fnStack, fnCreated, plain, true));
	copy.setIncrementInstructions(createDeepCopyInstructionSet(pfpb.getIncrementInstructions(), pid, IDPrefix, prog, fnStack, fnCreated, plain, true));
	copy.setExitInstructions(createDeepCopyInstructionSet(pfpb.getExitInstructions(), pid, IDPrefix, prog, fnStack, fnCreated, plain, true));

	// by default the child blocks are shared with the original (placeholders
	// are left as they are); an explicit deep copy is made only if plain or
	// forceDeepCopy is requested
	if (!plain && !forceDeepCopy) {
		copy.setChildBlocks(pfpb.getChildBlocks());
	} else {
		copy.setChildBlocks(rcreateDeepCopyProgramBlocks(pfpb.getChildBlocks(), pid, IDPrefix, fnStack, fnCreated, plain, forceDeepCopy));
	}
	return copy;
}
Aggregations