Search in sources :

Example 1 with ExecType

use of org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType in project incubator-systemml by apache.

the class OptimizerConstrained method optimize.

/**
 * Main optimization procedure.
 *
 * Transformation-based heuristic (rule-based) optimization
 * (no use of sb, direct change of pb).
 *
 * <p>Applies a fixed sequence of rewrites to the root node of the given plan,
 * re-estimating the memory consumption between rewrites. Which rewrites are
 * applied depends on the execution type chosen by the execution-strategy rewrite.
 *
 * @param sb   parfor statement block (not referenced in this method body)
 * @param pb   parfor program block (not referenced directly in this method body)
 * @param plan optimizer tree; its root node is the node being rewritten
 * @param est  cost estimator; stored in {@code _cost} and used for all memory estimates
 * @param ec   execution context; supplies the variables passed to the rewrites
 * @return always {@code true}
 */
@Override
public boolean optimize(ParForStatementBlock sb, ParForProgramBlock pb, OptTree plan, CostEstimator est, ExecutionContext ec) {
    LOG.debug("--- " + getOptMode() + " OPTIMIZER -------");
    OptNode pn = plan.getRoot();
    // early abort for empty parfor body
    if (pn.isLeaf())
        return true;
    // ANALYZE infrastructure properties
    super.analyzeProblemAndInfrastructure(pn);
    _cost = est;
    // debug and warnings output
    // NOTE(review): the message below ends with an unmatched ")".
    LOG.debug(getOptMode() + " OPT: Optimize with local_max_mem=" + toMB(_lm) + " and remote_max_mem=" + toMB(_rm) + ").");
    // only a warning is logged for an inactive cluster; optimization continues
    if (_rnk <= 0 || _rk <= 0)
        LOG.warn(getOptMode() + " OPT: Optimize for inactive cluster (num_nodes=" + _rnk + ", num_map_slots=" + _rk + ").");
    // ESTIMATE memory consumption
    // remember exec type and k so they can be restored after the serial estimate
    ExecType oldET = pn.getExecType();
    int oldK = pn.getK();
    // for basic mem consumption: temporarily switch the node to a serial parfor
    pn.setSerialParFor();
    double M0a = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
    // restore the original exec type and degree of parallelism
    pn.setExecType(oldET);
    pn.setK(oldK);
    LOG.debug(getOptMode() + " OPT: estimated mem (serial exec) M=" + toMB(M0a));
    // OPTIMIZE PARFOR PLAN
    // rewrite 1: data partitioning (incl. log. recompile RIX)
    HashMap<String, PartitionFormat> partitionedMatrices = new HashMap<>();
    rewriteSetDataPartitioner(pn, ec.getVariables(), partitionedMatrices, OptimizerUtils.getLocalMemBudget());
    // reestimate
    double M0b = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
    // rewrite 2: remove unnecessary compare matrix
    rewriteRemoveUnnecessaryCompareMatrix(pn, ec);
    // rewrite 3: rewrite result partitioning (incl. log/phy recompile LIX)
    boolean flagLIX = super.rewriteSetResultPartitioning(pn, M0b, ec.getVariables());
    // reestimate
    double M1 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
    LOG.debug(getOptMode() + " OPT: estimated new mem (serial exec) M=" + toMB(M1));
    // determine memory consumption for what-if: all-cp or partitioned
    double M2 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn, LopProperties.ExecType.CP);
    LOG.debug(getOptMode() + " OPT: estimated new mem (serial exec, all CP) M=" + toMB(M2));
    double M3 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn, true);
    LOG.debug(getOptMode() + " OPT: estimated new mem (cond partitioning) M=" + toMB(M3));
    // rewrite 4: execution strategy
    // keep the execution mode from before the rewrite; it is passed to rewrite 12 below
    PExecMode tmpmode = getPExecMode(pn);
    boolean flagRecompMR = rewriteSetExecutionStategy(pn, M0a, M1, M2, M3, flagLIX);
    // exec-type-specific rewrites
    if (pn.getExecType() == getRemoteExecType()) {
        // current estimate exceeds the remote budget, but the conditional-partitioning estimate fits
        if (M1 > _rm && M3 <= _rm) {
            // rewrite 1: data partitioning (apply conditional partitioning)
            rewriteSetDataPartitioner(pn, ec.getVariables(), partitionedMatrices, M3);
            // reestimate
            M1 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
        }
        if (flagRecompMR) {
            // rewrite 5: set operations exec type
            rewriteSetOperationsExecType(pn, flagRecompMR);
            // reestimate
            M1 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
        }
        // rewrite 6: data colocation
        super.rewriteDataColocation(pn, ec.getVariables());
        // rewrite 7: rewrite set partition replication factor
        super.rewriteSetPartitionReplicationFactor(pn, partitionedMatrices, ec.getVariables());
        // rewrite 8: rewrite set export replication factor
        super.rewriteSetExportReplicationFactor(pn, ec.getVariables());
        // rewrite 10: determine parallelism
        rewriteSetDegreeOfParallelism(pn, M1, false);
        // rewrite 11: task partitioning
        rewriteSetTaskPartitioner(pn, false, flagLIX);
        // rewrite 12: fused data partitioning and execution
        rewriteSetFusedDataPartitioningExecution(pn, M1, flagLIX, partitionedMatrices, ec.getVariables(), tmpmode);
        // rewrite 13: transpose sparse vector operations
        super.rewriteSetTranposeSparseVectorOperations(pn, partitionedMatrices, ec.getVariables());
        // rewrite 14: set in-place result indexing
        HashSet<ResultVar> inplaceResultVars = new HashSet<>();
        super.rewriteSetInPlaceResultIndexing(pn, M1, ec.getVariables(), inplaceResultVars, ec);
        // rewrite 15: disable CP caching (receives the in-place result variables from rewrite 14)
        super.rewriteDisableCPCaching(pn, inplaceResultVars, ec.getVariables());
    } else // if( pn.getExecType() == ExecType.CP )
    {
        // rewrite 10: determine parallelism
        rewriteSetDegreeOfParallelism(pn, M1, false);
        // rewrite 11: task partitioning
        // flagLIX always false
        rewriteSetTaskPartitioner(pn, false, false);
        // rewrite 14: set in-place result indexing
        HashSet<ResultVar> inplaceResultVars = new HashSet<>();
        super.rewriteSetInPlaceResultIndexing(pn, M1, ec.getVariables(), inplaceResultVars, ec);
        if (!OptimizerUtils.isSparkExecutionMode()) {
            // rewrite 16: runtime piggybacking
            super.rewriteEnableRuntimePiggybacking(pn, ec.getVariables(), partitionedMatrices);
        } else {
            // rewrite 17: checkpoint injection for parfor loop body
            super.rewriteInjectSparkLoopCheckpointing(pn);
            // rewrite 18: repartition read-only inputs for zipmm
            super.rewriteInjectSparkRepartition(pn, ec.getVariables());
            // rewrite 19: eager caching for checkpoint rdds
            super.rewriteSetSparkEagerRDDCaching(pn, ec.getVariables());
        }
    }
    // rewrite 20: set result merge
    rewriteSetResultMerge(pn, ec.getVariables(), true);
    // rewrite 21: set local recompile memory budget
    super.rewriteSetRecompileMemoryBudget(pn);
    // /////
    // Final rewrites for cleanup / minor improvements
    // rewrite 22: parfor (in recursive functions) to for
    super.rewriteRemoveRecursiveParFor(pn, ec.getVariables());
    // rewrite 23: parfor (par=1) to for
    super.rewriteRemoveUnnecessaryParFor(pn);
    // info optimization result: the evaluated-plan counter is set to exactly 1
    _numEvaluatedPlans = 1;
    return true;
}
Also used : HashMap(java.util.HashMap) PDataPartitionFormat(org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat) PartitionFormat(org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PartitionFormat) PExecMode(org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PExecMode) ResultVar(org.apache.sysml.parser.ParForStatementBlock.ResultVar) ExecType(org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType) HashSet(java.util.HashSet)

Example 2 with ExecType

use of org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType in project incubator-systemml by apache.

the class OptimizerRuleBased method isCPOnlyPossible.

/**
 * Checks whether the plan subtree rooted at the given node could run entirely in CP.
 *
 * <p>A leaf qualifies if its exec type is CP, or if it has the remote exec type,
 * its mapped hop is not forced to MR or SPARK, the hop reports valid CP dimensions
 * and size, and its CP memory estimate does not exceed the budget. An inner node
 * qualifies only if its own exec type is CP and all of its children qualify.
 *
 * @param n         root of the subtree to check
 * @param memBudget upper bound for the CP memory estimate of a single leaf
 * @return {@code true} if the whole subtree qualifies, {@code false} otherwise
 */
protected boolean isCPOnlyPossible(OptNode n, double memBudget) {
    final ExecType nodeType = n.getExecType();
    boolean cpOnly = (nodeType == ExecType.CP);
    if (n.isLeaf()) {
        if (nodeType == getRemoteExecType()) {
            Hop hop = OptTreeConverter.getAbstractPlanMapping().getMappedHop(n.getID());
            // forced remote execution (e.g., -exec=hadoop) rules out CP
            boolean forcedRemote = hop.getForcedExecType() == LopProperties.ExecType.MR
                || hop.getForcedExecType() == LopProperties.ExecType.SPARK;
            // additionally requires integer dims
            if (!forcedRemote && hop.hasValidCPDimsAndSize()) {
                double cpMem = _cost.getLeafNodeEstimate(TestMeasure.MEMORY_USAGE, n, LopProperties.ExecType.CP);
                cpOnly = cpOnly || (cpMem <= memBudget);
            }
        }
        return cpOnly;
    }
    // inner node: every child has to qualify as well
    for (OptNode child : n.getChilds()) {
        // early abort if already false
        if (!cpOnly)
            return false;
        cpOnly = isCPOnlyPossible(child, memBudget);
    }
    return cpOnly;
}
Also used : Hop(org.apache.sysml.hops.Hop) MultiThreadedHop(org.apache.sysml.hops.Hop.MultiThreadedHop) ExecType(org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType)

Example 3 with ExecType

use of org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType in project incubator-systemml by apache.

the class OptimizerRuleBased method rewriteSetDegreeOfParallelism.

// /////
// REWRITE set degree of parallelism
// /
/**
 * Sets the degree of parallelism of the given parfor node and of its mapped
 * parfor program block, then distributes the remaining parallelism via
 * {@code rAssignRemainingParallelism}.
 *
 * <p>For exec type CP the degree is bounded by the local parallelism limit, by
 * the local memory budget divided by {@code M}, and by the problem size
 * {@code _N}; with the accelerator enabled it may be replaced by the GPU device
 * count. For any other exec type the degree is taken from the remote
 * parallelism settings and the per-node parallelism is bounded by the remote
 * memory budget divided by {@code M}.
 *
 * @param n          parfor node whose degree of parallelism is set
 * @param M          memory estimate used as divisor of the memory budgets
 * @param flagNested if {@code true}, the remote branch uses {@code _rnk} as
 *                   degree of parallelism instead of {@code min(_N, _rk)}
 */
protected void rewriteSetDegreeOfParallelism(OptNode n, double M, boolean flagNested) {
    ExecType type = n.getExecType();
    long id = n.getID();
    // special handling for different exec models (CP, MR, MR nested)
    ParForProgramBlock pfpb = (ParForProgramBlock) OptTreeConverter.getAbstractPlanMapping().getMappedProg(id)[1];
    if (type == ExecType.CP) {
        // determine local max parallelism constraint
        int kMax = ConfigurationManager.isParallelParFor() ? (n.isCPOnly() ? _lkmaxCP : _lkmaxMR) : 1;
        // ensure local memory constraint (for spark more conservative in order to
        // prevent unnecessary guarded collect)
        double mem = (OptimizerUtils.isSparkExecutionMode() && !n.isCPOnly()) ? _lm / 2 : _lm;
        kMax = Math.min(kMax, (int) Math.floor(mem / M));
        kMax = Math.max(kMax, 1);
        // constrain max parfor parallelism by problem size
        int parforK = (int) ((_N < kMax) ? _N : kMax);
        // FIXME rework for nested parfor parallelism and body w/o gpu ops
        if (DMLScript.USE_ACCELERATOR) {
            long perGPUBudget = GPUContextPool.initialGPUMemBudget();
            double maxMemUsage = getMaxCPOnlyBudget(n);
            if (maxMemUsage < perGPUBudget) {
                parforK = GPUContextPool.getDeviceCount();
                // compare in the wider type of _N and narrow afterwards, so that a
                // very large _N cannot wrap around before the minimum is taken
                parforK = (int) Math.min(parforK, _N);
                LOG.debug("Setting degree of parallelism + [" + parforK + "] for GPU; per GPU budget :[" + perGPUBudget + "], parfor budget :[" + maxMemUsage + "],  max parallelism per GPU : [" + parforK + "]");
            }
        }
        // set parfor degree of parallelism
        pfpb.setDegreeOfParallelism(parforK);
        n.setK(parforK);
        // distribute remaining parallelism
        int remainParforK = getRemainingParallelismParFor(kMax, parforK);
        int remainOpsK = getRemainingParallelismOps(_lkmaxCP, parforK);
        rAssignRemainingParallelism(n, remainParforK, remainOpsK);
    } else // ExecType.MR/ExecType.SPARK
    {
        int kMax = -1;
        if (flagNested) {
            // determine remote max parallelism constraint
            // guaranteed <= _N (see nested)
            pfpb.setDegreeOfParallelism(_rnk);
            n.setK(_rnk);
            // per node (CP only inside); divisor clamped to 1 so that an inactive
            // cluster (_rnk == 0) does not raise an ArithmeticException
            kMax = _rkmax / Math.max(_rnk, 1);
        } else // not nested (default)
        {
            // determine remote max parallelism constraint
            int tmpK = (int) ((_N < _rk) ? _N : _rk);
            pfpb.setDegreeOfParallelism(tmpK);
            n.setK(tmpK);
            // per node (CP only inside); divisor clamped to 1 so that tmpK == 0
            // (no iterations or no remote slots) does not raise an ArithmeticException
            kMax = _rkmax / Math.max(tmpK, 1);
        }
        // ensure remote memory constraint
        // guaranteed >= 1 (see exec strategy)
        kMax = Math.min(kMax, (int) Math.floor(_rm / M));
        if (kMax < 1)
            kMax = 1;
        // disable nested parallelism, if required
        if (!ALLOW_REMOTE_NESTED_PARALLELISM)
            kMax = 1;
        // distribute remaining parallelism and recompile parallel instructions
        rAssignRemainingParallelism(n, kMax, 1);
    }
    _numEvaluatedPlans++;
    LOG.debug(getOptMode() + " OPT: rewrite 'set degree of parallelism' - result=(see EXPLAIN)");
}
Also used : ExecType(org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType) ParForProgramBlock(org.apache.sysml.runtime.controlprogram.ParForProgramBlock)

Example 4 with ExecType

use of org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType in project systemml by apache.

the class OptimizerRuleBased method getMaxCPOnlyBudget.

/**
 * Computes the largest CP memory estimate of any leaf below (or at) the given
 * parfor {@link OptNode}, recursing through its children.
 *
 * <p>A leaf contributes its CP leaf-node estimate from the cost estimator unless
 * it has the remote exec type, its mapped hop is forced to MR or SPARK, or the
 * estimate is at least {@code OptimizerUtils.DEFAULT_SIZE}; such leaves count as 0.
 *
 * @param n the parfor {@link OptNode}
 * @return the maximum memory needed for any operation inside a parfor in CP execution mode
 */
protected double getMaxCPOnlyBudget(OptNode n) {
    final ExecType nodeType = n.getExecType();
    if (!n.isLeaf()) {
        // inner node: maximum over all children
        double maxOverChildren = 0.0;
        for (OptNode child : n.getChilds()) {
            maxOverChildren = Math.max(maxOverChildren, getMaxCPOnlyBudget(child));
        }
        return maxOverChildren;
    }
    if (nodeType == getRemoteExecType()) {
        return 0.0;
    }
    Hop hop = OptTreeConverter.getAbstractPlanMapping().getMappedHop(n.getID());
    // forced remote execution, e.g., -exec=hadoop
    boolean forcedRemote = hop.getForcedExecType() == LopProperties.ExecType.MR
        || hop.getForcedExecType() == LopProperties.ExecType.SPARK;
    if (forcedRemote) {
        return 0.0;
    }
    double cpMem = _cost.getLeafNodeEstimate(TestMeasure.MEMORY_USAGE, n, LopProperties.ExecType.CP);
    // an estimate at or above DEFAULT_SIZE is the worst-case scenario estimate;
    // it is optimistically ignored
    return (cpMem >= OptimizerUtils.DEFAULT_SIZE) ? 0.0 : Math.max(0.0, cpMem);
}
Also used : Hop(org.apache.sysml.hops.Hop) MultiThreadedHop(org.apache.sysml.hops.Hop.MultiThreadedHop) ExecType(org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType)

Example 5 with ExecType

use of org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType in project systemml by apache.

the class OptimizerConstrained method optimize.

/**
 * Main optimization procedure.
 *
 * Transformation-based heuristic (rule-based) optimization
 * (no use of sb, direct change of pb).
 *
 * <p>Applies a fixed sequence of rewrites to the root node of the given plan,
 * re-estimating the memory consumption between rewrites. Which rewrites are
 * applied depends on the execution type chosen by the execution-strategy rewrite.
 *
 * @param sb   parfor statement block (not referenced in this method body)
 * @param pb   parfor program block (not referenced directly in this method body)
 * @param plan optimizer tree; its root node is the node being rewritten
 * @param est  cost estimator; stored in {@code _cost} and used for all memory estimates
 * @param ec   execution context; supplies the variables passed to the rewrites
 * @return always {@code true}
 */
@Override
public boolean optimize(ParForStatementBlock sb, ParForProgramBlock pb, OptTree plan, CostEstimator est, ExecutionContext ec) {
    LOG.debug("--- " + getOptMode() + " OPTIMIZER -------");
    OptNode pn = plan.getRoot();
    // early abort for empty parfor body
    if (pn.isLeaf())
        return true;
    // ANALYZE infrastructure properties
    super.analyzeProblemAndInfrastructure(pn);
    _cost = est;
    // debug and warnings output
    // NOTE(review): the message below ends with an unmatched ")".
    LOG.debug(getOptMode() + " OPT: Optimize with local_max_mem=" + toMB(_lm) + " and remote_max_mem=" + toMB(_rm) + ").");
    // only a warning is logged for an inactive cluster; optimization continues
    if (_rnk <= 0 || _rk <= 0)
        LOG.warn(getOptMode() + " OPT: Optimize for inactive cluster (num_nodes=" + _rnk + ", num_map_slots=" + _rk + ").");
    // ESTIMATE memory consumption
    // remember exec type and k so they can be restored after the serial estimate
    ExecType oldET = pn.getExecType();
    int oldK = pn.getK();
    // for basic mem consumption: temporarily switch the node to a serial parfor
    pn.setSerialParFor();
    double M0a = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
    // restore the original exec type and degree of parallelism
    pn.setExecType(oldET);
    pn.setK(oldK);
    LOG.debug(getOptMode() + " OPT: estimated mem (serial exec) M=" + toMB(M0a));
    // OPTIMIZE PARFOR PLAN
    // rewrite 1: data partitioning (incl. log. recompile RIX)
    HashMap<String, PartitionFormat> partitionedMatrices = new HashMap<>();
    rewriteSetDataPartitioner(pn, ec.getVariables(), partitionedMatrices, OptimizerUtils.getLocalMemBudget());
    // reestimate
    double M0b = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
    // rewrite 2: remove unnecessary compare matrix
    rewriteRemoveUnnecessaryCompareMatrix(pn, ec);
    // rewrite 3: rewrite result partitioning (incl. log/phy recompile LIX)
    boolean flagLIX = super.rewriteSetResultPartitioning(pn, M0b, ec.getVariables());
    // reestimate
    double M1 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
    LOG.debug(getOptMode() + " OPT: estimated new mem (serial exec) M=" + toMB(M1));
    // determine memory consumption for what-if: all-cp or partitioned
    double M2 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn, LopProperties.ExecType.CP);
    LOG.debug(getOptMode() + " OPT: estimated new mem (serial exec, all CP) M=" + toMB(M2));
    double M3 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn, true);
    LOG.debug(getOptMode() + " OPT: estimated new mem (cond partitioning) M=" + toMB(M3));
    // rewrite 4: execution strategy
    // keep the execution mode from before the rewrite; it is passed to rewrite 12 below
    PExecMode tmpmode = getPExecMode(pn);
    boolean flagRecompMR = rewriteSetExecutionStategy(pn, M0a, M1, M2, M3, flagLIX);
    // exec-type-specific rewrites
    if (pn.getExecType() == getRemoteExecType()) {
        // current estimate exceeds the remote budget, but the conditional-partitioning estimate fits
        if (M1 > _rm && M3 <= _rm) {
            // rewrite 1: data partitioning (apply conditional partitioning)
            rewriteSetDataPartitioner(pn, ec.getVariables(), partitionedMatrices, M3);
            // reestimate
            M1 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
        }
        if (flagRecompMR) {
            // rewrite 5: set operations exec type
            rewriteSetOperationsExecType(pn, flagRecompMR);
            // reestimate
            M1 = _cost.getEstimate(TestMeasure.MEMORY_USAGE, pn);
        }
        // rewrite 6: data colocation
        super.rewriteDataColocation(pn, ec.getVariables());
        // rewrite 7: rewrite set partition replication factor
        super.rewriteSetPartitionReplicationFactor(pn, partitionedMatrices, ec.getVariables());
        // rewrite 8: rewrite set export replication factor
        super.rewriteSetExportReplicationFactor(pn, ec.getVariables());
        // rewrite 10: determine parallelism
        rewriteSetDegreeOfParallelism(pn, M1, false);
        // rewrite 11: task partitioning
        rewriteSetTaskPartitioner(pn, false, flagLIX);
        // rewrite 12: fused data partitioning and execution
        rewriteSetFusedDataPartitioningExecution(pn, M1, flagLIX, partitionedMatrices, ec.getVariables(), tmpmode);
        // rewrite 13: transpose sparse vector operations
        super.rewriteSetTranposeSparseVectorOperations(pn, partitionedMatrices, ec.getVariables());
        // rewrite 14: set in-place result indexing
        HashSet<ResultVar> inplaceResultVars = new HashSet<>();
        super.rewriteSetInPlaceResultIndexing(pn, M1, ec.getVariables(), inplaceResultVars, ec);
        // rewrite 15: disable CP caching (receives the in-place result variables from rewrite 14)
        super.rewriteDisableCPCaching(pn, inplaceResultVars, ec.getVariables());
    } else // if( pn.getExecType() == ExecType.CP )
    {
        // rewrite 10: determine parallelism
        rewriteSetDegreeOfParallelism(pn, M1, false);
        // rewrite 11: task partitioning
        // flagLIX always false
        rewriteSetTaskPartitioner(pn, false, false);
        // rewrite 14: set in-place result indexing
        HashSet<ResultVar> inplaceResultVars = new HashSet<>();
        super.rewriteSetInPlaceResultIndexing(pn, M1, ec.getVariables(), inplaceResultVars, ec);
        if (!OptimizerUtils.isSparkExecutionMode()) {
            // rewrite 16: runtime piggybacking
            super.rewriteEnableRuntimePiggybacking(pn, ec.getVariables(), partitionedMatrices);
        } else {
            // rewrite 17: checkpoint injection for parfor loop body
            super.rewriteInjectSparkLoopCheckpointing(pn);
            // rewrite 18: repartition read-only inputs for zipmm
            super.rewriteInjectSparkRepartition(pn, ec.getVariables());
            // rewrite 19: eager caching for checkpoint rdds
            super.rewriteSetSparkEagerRDDCaching(pn, ec.getVariables());
        }
    }
    // rewrite 20: set result merge
    rewriteSetResultMerge(pn, ec.getVariables(), true);
    // rewrite 21: set local recompile memory budget
    super.rewriteSetRecompileMemoryBudget(pn);
    // /////
    // Final rewrites for cleanup / minor improvements
    // rewrite 22: parfor (in recursive functions) to for
    super.rewriteRemoveRecursiveParFor(pn, ec.getVariables());
    // rewrite 23: parfor (par=1) to for
    super.rewriteRemoveUnnecessaryParFor(pn);
    // info optimization result: the evaluated-plan counter is set to exactly 1
    _numEvaluatedPlans = 1;
    return true;
}
Also used : HashMap(java.util.HashMap) PDataPartitionFormat(org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat) PartitionFormat(org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PartitionFormat) PExecMode(org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PExecMode) ResultVar(org.apache.sysml.parser.ParForStatementBlock.ResultVar) ExecType(org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType) HashSet(java.util.HashSet)

Aggregations

ExecType (org.apache.sysml.runtime.controlprogram.parfor.opt.OptNode.ExecType): 10; Hop (org.apache.sysml.hops.Hop): 4; MultiThreadedHop (org.apache.sysml.hops.Hop.MultiThreadedHop): 4; ParForProgramBlock (org.apache.sysml.runtime.controlprogram.ParForProgramBlock): 4; PExecMode (org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PExecMode): 4; HashMap (java.util.HashMap): 2; HashSet (java.util.HashSet): 2; ResultVar (org.apache.sysml.parser.ParForStatementBlock.ResultVar): 2; PDataPartitionFormat (org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitionFormat): 2; PDataPartitioner (org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PDataPartitioner): 2; PartitionFormat (org.apache.sysml.runtime.controlprogram.ParForProgramBlock.PartitionFormat): 2