Use of org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType in project incubator-systemml by apache.
The class OptimizerConstrained, method optimize:
/**
* Main optimization procedure.
*
* Transformation-based heuristic (rule-based) optimization
* (no use of sb, direct change of pb).
*/
@Override
public boolean optimize(ParForStatementBlock sb, ParForProgramBlock pb, OptTree plan, CostEstimator est, ExecutionContext ec) {
LOG.debug("--- " + getOptMode() + " OPTIMIZER -------");
OptNode pn = plan.getRoot();
// early abort for empty parfor body
if (pn.isLeaf())
return true;
// ANALYZE infrastructure properties
super.analyzeProblemAndInfrastructure(pn);
_cost = est;
// debug and warnings output
LOG.debug(getOptMode() + " OPT: Optimize with local_max_mem=" + toMB(_lm) + " and remote_max_mem=" + toMB(_rm) + ").");
if (_rnk <= 0 || _rk <= 0)
LOG.warn(getOptMode() + " OPT: Optimize for inactive cluster (num_nodes=" + _rnk + ", num_map_slots=" + _rk + ").");
// ESTIMATE memory consumption
ExecType oldET = pn.getExecType();
int oldK = pn.getK();
// for basic mem consumption
pn.setSerialParFor();
double M0a = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
pn.setExecType(oldET);
pn.setK(oldK);
LOG.debug(getOptMode() + " OPT: estimated mem (serial exec) M=" + toMB(M0a));
// OPTIMIZE PARFOR PLAN
// rewrite 1: data partitioning (incl. log. recompile RIX)
HashMap<String, PartitionFormat> partitionedMatrices = new HashMap<>();
rewriteSetDataPartitioner(pn, ec.getVariables(), partitionedMatrices, OptimizerUtils.getLocalMemBudget());
// reestimate
double M0b = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
// rewrite 2: remove unnecessary compare matrix
rewriteRemoveUnnecessaryCompareMatrix(pn, ec);
// rewrite 3: rewrite result partitioning (incl. log/phy recompile LIX)
boolean flagLIX = super.rewriteSetResultPartitioning(pn, M0b, ec.getVariables());
// reestimate
double M1 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
LOG.debug(getOptMode() + " OPT: estimated new mem (serial exec) M=" + toMB(M1));
// determine memory consumption for what-if: all-cp or partitioned
double M2 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn, LopProperties.ExecType.CP);
LOG.debug(getOptMode() + " OPT: estimated new mem (serial exec, all CP) M=" + toMB(M2));
double M3 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn, true);
LOG.debug(getOptMode() + " OPT: estimated new mem (cond partitioning) M=" + toMB(M3));
// rewrite 4: execution strategy
// keep old
PExecMode tmpmode = getPExecMode(pn);
boolean flagRecompMR = rewriteSetExecutionStategy(pn, M0a, M1, M2, M3, flagLIX);
// exec-type-specific rewrites
if (pn.getExecType() == getRemoteExecType()) {
if (M1 > _rm && M3 <= _rm) {
// rewrite 1: data partitioning (apply conditional partitioning)
rewriteSetDataPartitioner(pn, ec.getVariables(), partitionedMatrices, M3);
// reestimate
M1 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
}
if (flagRecompMR) {
// rewrite 5: set operations exec type
rewriteSetOperationsExecType(pn, flagRecompMR);
// reestimate
M1 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
}
// rewrite 6: data colocation
super.rewriteDataColocation(pn, ec.getVariables());
// rewrite 7: rewrite set partition replication factor
super.rewriteSetPartitionReplicationFactor(pn, partitionedMatrices, ec.getVariables());
// rewrite 8: rewrite set export replication factor
super.rewriteSetExportReplicationFactor(pn, ec.getVariables());
// rewrite 10: determine parallelism
rewriteSetDegreeOfParallelism(pn, M1, false);
// rewrite 11: task partitioning
rewriteSetTaskPartitioner(pn, false, flagLIX);
// rewrite 12: fused data partitioning and execution
rewriteSetFusedDataPartitioningExecution(pn, M1, flagLIX, partitionedMatrices, ec.getVariables(), tmpmode);
// rewrite 13: transpose sparse vector operations
super.rewriteSetTranposeSparseVectorOperations(pn, partitionedMatrices, ec.getVariables());
// rewrite 14: set in-place result indexing
HashSet<ResultVar> inplaceResultVars = new HashSet<>();
super.rewriteSetInPlaceResultIndexing(pn, M1, ec.getVariables(), inplaceResultVars, ec);
// rewrite 15: disable CP caching
super.rewriteDisableCPCaching(pn, inplaceResultVars, ec.getVariables());
} else // if( pn.getExecType() == ExecType.CP )
{
// rewrite 10: determine parallelism
rewriteSetDegreeOfParallelism(pn, M1, false);
// rewrite 11: task partitioning
// flagLIX always false
rewriteSetTaskPartitioner(pn, false, false);
// rewrite 14: set in-place result indexing
HashSet<ResultVar> inplaceResultVars = new HashSet<>();
super.rewriteSetInPlaceResultIndexing(pn, M1, ec.getVariables(), inplaceResultVars, ec);
if (!OptimizerUtils.isSparkExecutionMode()) {
// rewrite 16: runtime piggybacking
super.rewriteEnableRuntimePiggybacking(pn, ec.getVariables(), partitionedMatrices);
} else {
// rewrite 17: checkpoint injection for parfor loop body
super.rewriteInjectSparkLoopCheckpointing(pn);
// rewrite 18: repartition read-only inputs for zipmm
super.rewriteInjectSparkRepartition(pn, ec.getVariables());
// rewrite 19: eager caching for checkpoint rdds
super.rewriteSetSparkEagerRDDCaching(pn, ec.getVariables());
}
}
// rewrite 20: set result merge
rewriteSetResultMerge(pn, ec.getVariables(), true);
// rewrite 21: set local recompile memory budget
super.rewriteSetRecompileMemoryBudget(pn);
// /////
// Final rewrites for cleanup / minor improvements
// rewrite 22: parfor (in recursive functions) to for
super.rewriteRemoveRecursiveParFor(pn, ec.getVariables());
// rewrite 23: parfor (par=1) to for
super.rewriteRemoveUnnecessaryParFor(pn);
// info optimization result
_numEvaluatedPlans = 1;
return true;
}
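The re-partitioning branch inside the remote case above is driven purely by comparing the memory estimates against the remote budget. Below is a minimal, self-contained sketch of that check; the class and parameter names (remoteBudget, m1, m3) are illustrative stand-ins and not part of the SystemML API.

public class PartitioningBudgetSketch {
    // Re-apply data partitioning only when the current estimate exceeds the remote
    // budget but the conditional-partitioning estimate would still fit.
    static boolean shouldReapplyPartitioning(double m1, double m3, double remoteBudget) {
        return m1 > remoteBudget && m3 <= remoteBudget;
    }

    public static void main(String[] args) {
        double remoteBudget = 2048d * 1024 * 1024; // hypothetical 2 GB remote budget
        double m1 = 3000d * 1024 * 1024;           // estimate without conditional partitioning
        double m3 = 1500d * 1024 * 1024;           // estimate with conditional partitioning
        System.out.println(shouldReapplyPartitioning(m1, m3, remoteBudget)); // prints: true
    }
}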
Use of org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType in project incubator-systemml by apache.
The class OptimizerRuleBased, method isCPOnlyPossible:
protected boolean isCPOnlyPossible(OptNode n, double memBudget) {
ExecType et = n.getExecType();
boolean ret = (et == ExecType.CP);
if (n.isLeaf() && et == getRemoteExecType()) {
Hop h = OptTreeConverter.getAbstractPlanMapping().getMappedHop(n.getID());
if (h.getForcedExecType() != LopProperties.ExecType.MR && // e.g., -exec=hadoop
h.getForcedExecType() != LopProperties.ExecType.SPARK &&
h.hasValidCPDimsAndSize()) { // integer dims
double mem = _cost.getLeafNodeEstimate(TestMeasure.MEMORY_USAGE, n, LopProperties.ExecType.CP);
if (mem <= memBudget)
ret = true;
}
}
if (!n.isLeaf())
for (OptNode c : n.getChilds()) {
// early abort if already false
if (!ret)
break;
ret &= isCPOnlyPossible(c, memBudget);
}
return ret;
}
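The method above follows a common pattern: seed the result from the node's own property, fold every child in with a logical AND, and abort the loop as soon as the result turns false. A minimal sketch of that traversal on a generic tree, with a purely illustrative Node interface (not SystemML code):

import java.util.List;

class TreeTraversalSketch {
    interface Node {
        boolean isLeaf();
        boolean leafFitsBudget(double budget); // stand-in for the CP leaf memory check
        List<Node> getChildren();
    }

    // True only if every leaf in the subtree satisfies the budget predicate.
    static boolean allFitBudget(Node n, double budget) {
        boolean ret = !n.isLeaf() || n.leafFitsBudget(budget);
        if (!n.isLeaf())
            for (Node c : n.getChildren()) {
                if (!ret)
                    break; // early abort once any subtree has failed
                ret &= allFitBudget(c, budget);
            }
        return ret;
    }
}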
Use of org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType in project incubator-systemml by apache.
The class OptimizerRuleBased, method rewriteSetDegreeOfParallelism:
// /////
// REWRITE set degree of parallelism
// /
protected void rewriteSetDegreeOfParallelism(OptNode n, double M, boolean flagNested) {
ExecType type = n.getExecType();
long id = n.getID();
// special handling for different exec models (CP, MR, MR nested)
ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter.getAbstractPlanMapping().getMappedProg(id)[1];
if (type == ExecType.CP) {
// determine local max parallelism constraint
int kMax = ConfigurationManager.isParallelParFor() ? (n.isCPOnly() ? _lkmaxCP : _lkmaxMR) : 1;
// ensure local memory constraint (for spark more conservative in order to
// prevent unnecessary guarded collect)
double mem = (OptimizerUtils.isSparkExecutionMode() && !n.isCPOnly()) ? _lm / 2 : _lm;
kMax = Math.min(kMax, (int) Math.floor(mem / M));
kMax = Math.max(kMax, 1);
// constrain max parfor parallelism by problem size
int parforK = (int) ((_N < kMax) ? _N : kMax);
// FIXME rework for nested parfor parallelism and body w/o gpu ops
if (DMLScript.USE_ACCELERATOR) {
long perGPUBudget = GPUContextPool.initialGPUMemBudget();
double maxMemUsage = getMaxCPOnlyBudget(n);
if (maxMemUsage < perGPUBudget) {
parforK = GPUContextPool.getDeviceCount();
parforK = Math.min(parforK, (int) _N);
LOG.debug("Setting degree of parallelism + [" + parforK + "] for GPU; per GPU budget :[" + perGPUBudget + "], parfor budget :[" + maxMemUsage + "], max parallelism per GPU : [" + parforK + "]");
}
}
// set parfor degree of parallelism
pfpb.setDegreeOfParallelism(parforK);
n.setK(parforK);
// distribute remaining parallelism
int remainParforK = getRemainingParallelismParFor(kMax, parforK);
int remainOpsK = getRemainingParallelismOps(_lkmaxCP, parforK);
rAssignRemainingParallelism(n, remainParforK, remainOpsK);
} else // ExecType.MR/ExecType.SPARK
{
int kMax = -1;
if (flagNested) {
// determine remote max parallelism constraint
// guaranteed <= _N (see nested)
pfpb.setDegreeOfParallelism(_rnk);
n.setK(_rnk);
// per node (CP only inside)
kMax = _rkmax / _rnk;
} else // not nested (default)
{
// determine remote max parallelism constraint
int tmpK = (int) ((_N < _rk) ? _N : _rk);
pfpb.setDegreeOfParallelism(tmpK);
n.setK(tmpK);
// per node (CP only inside)
kMax = _rkmax / tmpK;
}
// ensure remote memory constraint
// guaranteed >= 1 (see exec strategy)
kMax = Math.min(kMax, (int) Math.floor(_rm / M));
if (kMax < 1)
kMax = 1;
// disable nested parallelism, if required
if (!ALLOW_REMOTE_NESTED_PARALLELISM)
kMax = 1;
// distribute remaining parallelism and recompile parallel instructions
rAssignRemainingParallelism(n, kMax, 1);
}
_numEvaluatedPlans++;
LOG.debug(getOptMode() + " OPT: rewrite 'set degree of parallelism' - result=(see EXPLAIN)");
}
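For the CP case, the degree of parallelism reduces to simple arithmetic: bound the parallelism constraint by the memory budget divided by the per-worker estimate M, clamp to at least 1, and cap by the number of iterations. A simplified sketch with illustrative names (localBudget, memPerWorker), not the SystemML API:

public class DegreeOfParallelismSketch {
    static int computeParforK(int kMaxConstraint, double localBudget, double memPerWorker, long numIterations) {
        // bound by memory: at most floor(budget / per-worker estimate) workers
        int kMax = Math.min(kMaxConstraint, (int) Math.floor(localBudget / memPerWorker));
        // always allow at least one worker
        kMax = Math.max(kMax, 1);
        // no more workers than loop iterations
        return (int) Math.min(numIterations, kMax);
    }

    public static void main(String[] args) {
        // e.g., 16 threads, 4096 MB budget, 512 MB per worker, 100 iterations -> 8
        System.out.println(computeParforK(16, 4096d, 512d, 100));
    }
}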
Use of org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType in project systemml by apache.
The class OptimizerRuleBased, method getMaxCPOnlyBudget:
/**
* Calculates the maximum memory needed in a CP only Parfor
* based on the {@link Hop#computeMemEstimate(MemoTable)} function
* called recursively for the "children" of the parfor {@link OptNode}.
*
* @param n the parfor {@link OptNode}
* @return the maximum memory needed for any operation inside a parfor in CP execution mode
*/
protected double getMaxCPOnlyBudget(OptNode n) {
ExecType et = n.getExecType();
double ret = 0;
if (n.isLeaf() && et != getRemoteExecType()) {
Hop h = OptTreeConverter.getAbstractPlanMapping().getMappedHop(n.getID());
if (h.getForcedExecType() != LopProperties.ExecType.MR && // e.g., -exec=hadoop
h.getForcedExecType() != LopProperties.ExecType.SPARK) {
double mem = _cost.getLeafNodeEstimate(TestMeasure.MEMORY_USAGE, n, LopProperties.ExecType.CP);
if (mem >= OptimizerUtils.DEFAULT_SIZE) {
// memory estimate for worst case scenario.
// optimistically ignoring this
} else {
ret = Math.max(ret, mem);
}
}
}
if (!n.isLeaf()) {
for (OptNode c : n.getChilds()) {
ret = Math.max(ret, getMaxCPOnlyBudget(c));
}
}
return ret;
}
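The recursion above simply takes the maximum leaf estimate over the subtree while skipping leaves whose estimate is the worst-case default size (the "optimistically ignoring this" branch). A self-contained sketch of the same idea, with an illustrative Node interface and a stand-in constant instead of OptimizerUtils.DEFAULT_SIZE:

import java.util.List;

class MaxCPBudgetSketch {
    static final double WORST_CASE_DEFAULT = Double.MAX_VALUE; // stand-in for OptimizerUtils.DEFAULT_SIZE

    interface Node {
        boolean isLeaf();
        double leafEstimate(); // stand-in for the CP leaf-node memory estimate
        List<Node> getChildren();
    }

    // Maximum memory estimate over all leaves, ignoring worst-case default estimates.
    static double maxCPOnlyBudget(Node n) {
        double ret = 0;
        if (n.isLeaf()) {
            double mem = n.leafEstimate();
            if (mem < WORST_CASE_DEFAULT)
                ret = Math.max(ret, mem);
        } else {
            for (Node c : n.getChildren())
                ret = Math.max(ret, maxCPOnlyBudget(c));
        }
        return ret;
    }
}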