Example 31 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class GenMapRedUtils, method initPlan.

/**
 * Initialize the current plan by adding it to root tasks.
 *
 * @param op
 *          the reduce sink operator encountered
 * @param opProcCtx
 *          processing context
 */
public static void initPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx) throws SemanticException {
    Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
    GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0));
    Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
    MapredWork plan = (MapredWork) currTask.getWork();
    HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = opProcCtx.getOpTaskMap();
    TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
    opTaskMap.put(reducer, currTask);
    plan.setReduceWork(new ReduceWork());
    plan.getReduceWork().setReducer(reducer);
    ReduceSinkDesc desc = op.getConf();
    plan.getReduceWork().setNumReduceTasks(desc.getNumReducers());
    if (needsTagging(plan.getReduceWork())) {
        plan.getReduceWork().setNeedsTagging(true);
    }
    assert currTopOp != null;
    String currAliasId = opProcCtx.getCurrAliasId();
    if (!opProcCtx.isSeenOp(currTask, currTopOp)) {
        setTaskPlan(currAliasId, currTopOp, currTask, false, opProcCtx);
    }
    opProcCtx.setCurrTask(currTask);
    opProcCtx.setCurrTopOp(null);
    opProcCtx.setCurrAliasId(null);
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) DependencyCollectionTask(org.apache.hadoop.hive.ql.exec.DependencyCollectionTask) Serializable(java.io.Serializable) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
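
To make the wiring that initPlan assumes easier to see, here is a minimal, self-contained Java sketch. The Op class below is a hypothetical stand-in, not Hive's real Operator: the reducer is the reduce sink's single child, and the map-side context is looked up through the reduce sink's first parent.

import java.util.ArrayList;
import java.util.List;

// Toy stand-in for Hive's operator DAG; Op and its wiring are hypothetical
// and only mirror the parent/child links that initPlan walks.
final class Op {
    final String name;
    final List<Op> parents = new ArrayList<>();
    final List<Op> children = new ArrayList<>();

    Op(String name) { this.name = name; }

    static void link(Op parent, Op child) {
        parent.children.add(child);
        child.parents.add(parent);
    }
}

public class InitPlanShape {
    public static void main(String[] args) {
        // TS -> SEL -> RS -> GBY: the shape initPlan expects around a reduce sink.
        Op ts = new Op("TS"), sel = new Op("SEL"), rs = new Op("RS"), gby = new Op("GBY");
        Op.link(ts, sel);
        Op.link(sel, rs);
        Op.link(rs, gby);

        // initPlan's two lookups: the reducer is the RS's only child, and the
        // map-side context is keyed by the RS's first parent.
        Op reducer = rs.children.get(0);
        Op mapSide = rs.parents.get(0);
        System.out.println("reducer = " + reducer.name + ", map side = " + mapSide.name);
    }
}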

Example 32 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class GenMapRedUtils, method splitTasks.

/**
 * Split two tasks by creating a temporary file between them.
 *
 * @param op reduce sink operator being processed
 * @param parentTask the parent task
 * @param childTask the child task
 * @param opProcCtx context
 */
@SuppressWarnings("nls")
private static void splitTasks(ReduceSinkOperator op, Task<? extends Serializable> parentTask, Task<? extends Serializable> childTask, GenMRProcContext opProcCtx) throws SemanticException {
    if (op.getNumParent() != 1) {
        throw new IllegalStateException("Expecting operator " + op + " to have one parent. " + "But found multiple parents : " + op.getParentOperators());
    }
    ParseContext parseCtx = opProcCtx.getParseCtx();
    parentTask.addDependentTask(childTask);
    // Root Task cannot depend on any other task, therefore childTask cannot be
    // a root Task
    List<Task<? extends Serializable>> rootTasks = opProcCtx.getRootTasks();
    rootTasks.remove(childTask);
    // Generate the temporary file name
    Context baseCtx = parseCtx.getContext();
    Path taskTmpDir = baseCtx.getMRTmpPath();
    Operator<? extends OperatorDesc> parent = op.getParentOperators().get(0);
    TableDesc tt_desc = PlanUtils.getIntermediateFileTableDesc(PlanUtils.getFieldSchemasFromRowSchema(parent.getSchema(), "temporarycol"));
    // Create the temporary file, its corresponding FileSinkOperator, and
    // its corresponding TableScanOperator.
    TableScanOperator tableScanOp = createTemporaryFile(parent, op, taskTmpDir, tt_desc, parseCtx);
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
    mapCurrCtx.put(tableScanOp, new GenMapRedCtx(childTask, null));
    String streamDesc = taskTmpDir.toUri().toString();
    MapredWork cplan = (MapredWork) childTask.getWork();
    if (needsTagging(cplan.getReduceWork())) {
        Operator<? extends OperatorDesc> reducerOp = cplan.getReduceWork().getReducer();
        String id = null;
        if (reducerOp instanceof JoinOperator) {
            if (parseCtx.getJoinOps().contains(reducerOp)) {
                id = ((JoinOperator) reducerOp).getConf().getId();
            }
        } else if (reducerOp instanceof MapJoinOperator) {
            if (parseCtx.getMapJoinOps().contains(reducerOp)) {
                id = ((MapJoinOperator) reducerOp).getConf().getId();
            }
        } else if (reducerOp instanceof SMBMapJoinOperator) {
            if (parseCtx.getSmbMapJoinOps().contains(reducerOp)) {
                id = ((SMBMapJoinOperator) reducerOp).getConf().getId();
            }
        }
        if (id != null) {
            streamDesc = id + ":$INTNAME";
        } else {
            streamDesc = "$INTNAME";
        }
        String origStreamDesc = streamDesc;
        int pos = 0;
        while (cplan.getMapWork().getAliasToWork().get(streamDesc) != null) {
            streamDesc = origStreamDesc.concat(String.valueOf(++pos));
        }
        // TODO: Allocate work to remove the temporary files and make that
        // dependent on the redTask
        cplan.getReduceWork().setNeedsTagging(true);
    }
    // Add the path to alias mapping
    setTaskPlan(taskTmpDir, streamDesc, tableScanOp, cplan.getMapWork(), false, tt_desc);
    opProcCtx.setCurrTopOp(null);
    opProcCtx.setCurrAliasId(null);
    opProcCtx.setCurrTask(childTask);
    opProcCtx.addRootIfPossible(parentTask);
}
Also used : ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) Context(org.apache.hadoop.hive.ql.Context) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) Path(org.apache.hadoop.fs.Path) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) DependencyCollectionTask(org.apache.hadoop.hive.ql.exec.DependencyCollectionTask) Serializable(java.io.Serializable) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
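
The alias-collision loop near the end of splitTasks (streamDesc = origStreamDesc.concat(...)) is a small pattern worth isolating: append an increasing numeric suffix until the alias is unused. A minimal sketch under the assumption that aliasToWork behaves like an ordinary Map; uniqueAlias is a hypothetical helper, not a Hive API.

import java.util.HashMap;
import java.util.Map;

// Hypothetical helper mirroring the collision loop in splitTasks.
public class UniqueAlias {
    static String uniqueAlias(Map<String, ?> aliasToWork, String base) {
        String alias = base;
        int pos = 0;
        // Same idea as: while (aliasToWork.get(streamDesc) != null) { ... }
        while (aliasToWork.containsKey(alias)) {
            alias = base + (++pos);
        }
        return alias;
    }

    public static void main(String[] args) {
        Map<String, Object> used = new HashMap<>();
        used.put("$INTNAME", new Object());
        used.put("$INTNAME1", new Object());
        // Prints $INTNAME2: the first suffix not already taken.
        System.out.println(uniqueAlias(used, "$INTNAME"));
    }
}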

Example 33 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class GenMapRedUtils, method setKeyAndValueDesc.

/**
 * Set the key and value descriptors.
 *
 * @param plan
 *          current plan
 * @param topOp
 *          current top operator in the path
 */
public static void setKeyAndValueDesc(ReduceWork plan, Operator<? extends OperatorDesc> topOp) {
    if (topOp == null) {
        return;
    }
    if (topOp instanceof ReduceSinkOperator) {
        ReduceSinkOperator rs = (ReduceSinkOperator) topOp;
        setKeyAndValueDesc(plan, rs);
    } else {
        List<Operator<? extends OperatorDesc>> children = topOp.getChildOperators();
        if (children != null) {
            for (Operator<? extends OperatorDesc> op : children) {
                setKeyAndValueDesc(plan, op);
            }
        }
    }
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
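
setKeyAndValueDesc is a straightforward recursive descent: act on reduce sinks, otherwise recurse into children. A self-contained sketch of the same pattern, with hypothetical Node/Sink classes standing in for Hive operators.

import java.util.ArrayList;
import java.util.List;

// Sketch of the recursion pattern in setKeyAndValueDesc: handle nodes of a
// target type, otherwise descend into children. Node and Sink are toy types.
public class DescendToType {
    static class Node {
        final List<Node> children = new ArrayList<>();
        Node add(Node c) { children.add(c); return this; }
    }
    static class Sink extends Node { }

    static void visitSinks(Node node, List<Sink> found) {
        if (node instanceof Sink) {
            found.add((Sink) node);   // terminal case: handle and stop descending
            return;
        }
        for (Node child : node.children) {
            visitSinks(child, found); // otherwise keep walking down
        }
    }

    public static void main(String[] args) {
        Node root = new Node().add(new Node().add(new Sink())).add(new Sink());
        List<Sink> sinks = new ArrayList<>();
        visitSinks(root, sinks);
        System.out.println(sinks.size() + " sinks found"); // 2 sinks found
    }
}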

Example 34 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class SharedWorkOptimizer, method extractSharedOptimizationInfo.

private static SharedResult extractSharedOptimizationInfo(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, Operator<?> retainableOpEqualParent, Operator<?> discardableOpEqualParent, Operator<?> retainableOp, Operator<?> discardableOp, LinkedHashSet<Operator<?>> retainableOps, LinkedHashSet<Operator<?>> discardableOps, Set<Operator<?>> discardableInputOps, boolean removeInputBranch) throws SemanticException {
    Operator<?> equalOp1 = retainableOpEqualParent;
    Operator<?> equalOp2 = discardableOpEqualParent;
    Operator<?> currentOp1 = retainableOp;
    Operator<?> currentOp2 = discardableOp;
    long dataSize = 0L;
    long maxDataSize = 0L;
    // Try to merge rest of operators
    while (!(currentOp1 instanceof ReduceSinkOperator)) {
        // Check whether current operators are equal
        if (!compareOperator(pctx, currentOp1, currentOp2)) {
            // If they are not equal, we could zip up till here
            break;
        }
        if (currentOp1.getParentOperators().size() != currentOp2.getParentOperators().size()) {
            // If they are not equal, we could zip up till here
            break;
        }
        if (currentOp1.getParentOperators().size() > 1) {
            List<Operator<?>> discardableOpsForCurrentOp = new ArrayList<>();
            int idx = 0;
            for (; idx < currentOp1.getParentOperators().size(); idx++) {
                Operator<?> parentOp1 = currentOp1.getParentOperators().get(idx);
                Operator<?> parentOp2 = currentOp2.getParentOperators().get(idx);
                if (parentOp1 == equalOp1 && parentOp2 == equalOp2 && !removeInputBranch) {
                    continue;
                }
                if ((parentOp1 == equalOp1 && parentOp2 != equalOp2) || (parentOp1 != equalOp1 && parentOp2 == equalOp2)) {
                    // Input operator is not in the same position
                    break;
                }
                // Compare input
                List<Operator<?>> removeOpsForCurrentInput = compareAndGatherOps(pctx, parentOp1, parentOp2);
                if (removeOpsForCurrentInput == null) {
                    // Inputs are not the same, bail out
                    break;
                }
                // Add inputs to ops to remove
                discardableOpsForCurrentOp.addAll(removeOpsForCurrentInput);
            }
            if (idx != currentOp1.getParentOperators().size()) {
                // If inputs are not equal, we could zip up till here
                break;
            }
            discardableInputOps.addAll(discardableOpsForCurrentOp);
        }
        equalOp1 = currentOp1;
        equalOp2 = currentOp2;
        retainableOps.add(equalOp1);
        discardableOps.add(equalOp2);
        if (equalOp1 instanceof MapJoinOperator) {
            MapJoinOperator mop = (MapJoinOperator) equalOp1;
            dataSize = StatsUtils.safeAdd(dataSize, mop.getConf().getInMemoryDataSize());
            maxDataSize = mop.getConf().getMemoryMonitorInfo().getAdjustedNoConditionalTaskSize();
        }
        if (currentOp1.getChildOperators().size() > 1 || currentOp2.getChildOperators().size() > 1) {
            // TODO: Support checking multiple child operators to merge further.
            break;
        }
        // Update for next iteration
        currentOp1 = currentOp1.getChildOperators().get(0);
        currentOp2 = currentOp2.getChildOperators().get(0);
    }
    // Add the rest to the memory consumption
    Set<Operator<?>> opsWork1 = findWorkOperators(optimizerCache, currentOp1);
    for (Operator<?> op : opsWork1) {
        if (op instanceof MapJoinOperator && !retainableOps.contains(op)) {
            MapJoinOperator mop = (MapJoinOperator) op;
            dataSize = StatsUtils.safeAdd(dataSize, mop.getConf().getInMemoryDataSize());
            maxDataSize = mop.getConf().getMemoryMonitorInfo().getAdjustedNoConditionalTaskSize();
        }
    }
    Set<Operator<?>> opsWork2 = findWorkOperators(optimizerCache, currentOp2);
    for (Operator<?> op : opsWork2) {
        if (op instanceof MapJoinOperator && !discardableOps.contains(op)) {
            MapJoinOperator mop = (MapJoinOperator) op;
            dataSize = StatsUtils.safeAdd(dataSize, mop.getConf().getInMemoryDataSize());
            maxDataSize = mop.getConf().getMemoryMonitorInfo().getAdjustedNoConditionalTaskSize();
        }
    }
    discardableInputOps.addAll(gatherDPPBranchOps(pctx, optimizerCache, discardableInputOps));
    discardableInputOps.addAll(gatherDPPBranchOps(pctx, optimizerCache, discardableOps));
    discardableInputOps.addAll(gatherDPPBranchOps(pctx, optimizerCache, retainableOps, discardableInputOps));
    return new SharedResult(retainableOps, discardableOps, discardableInputOps, dataSize, maxDataSize);
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) ArrayList(java.util.ArrayList)
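
The core of extractSharedOptimizationInfo is the loop that zips two parallel operator chains together while corresponding nodes compare equal, bailing out on a mismatch or when a node has more than one child. A minimal sketch of that loop shape, with a hypothetical Chain type and an equality predicate standing in for compareOperator.

import java.util.ArrayList;
import java.util.List;
import java.util.function.BiPredicate;

// Walk two chains in lockstep, collecting matched pairs, and stop at the
// first mismatch or at a node with more than one child, mirroring the
// "zip up till here" bail-outs in the real code.
public class ZipChains {
    static class Chain {
        final String label;
        final List<Chain> children = new ArrayList<>();
        Chain(String label) { this.label = label; }
        Chain then(Chain next) { children.add(next); return next; }
    }

    static List<String[]> zipWhileEqual(Chain a, Chain b, BiPredicate<Chain, Chain> same) {
        List<String[]> merged = new ArrayList<>();
        while (same.test(a, b)) {
            merged.add(new String[] { a.label, b.label });
            if (a.children.size() != 1 || b.children.size() != 1) {
                break; // mirror the multi-child bail-out in the real code
            }
            a = a.children.get(0);
            b = b.children.get(0);
        }
        return merged;
    }

    public static void main(String[] args) {
        Chain a = new Chain("TS"); a.then(new Chain("FIL")).then(new Chain("SEL"));
        Chain b = new Chain("TS"); b.then(new Chain("FIL")).then(new Chain("GBY"));
        // Matches TS and FIL, then stops at SEL vs GBY; prints 2.
        System.out.println(zipWhileEqual(a, b, (x, y) -> x.label.equals(y.label)).size());
    }
}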

Example 35 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class SharedWorkOptimizer, method areMergeable.

// FIXME: probably this should also be integrated with isSame() logic
private static boolean areMergeable(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, TableScanOperator tsOp1, TableScanOperator tsOp2) throws SemanticException {
    // First we check if the two table scan operators can actually be merged
    // If schemas do not match, we currently do not merge
    List<String> prevTsOpNeededColumns = tsOp1.getNeededColumns();
    List<String> tsOpNeededColumns = tsOp2.getNeededColumns();
    if (prevTsOpNeededColumns.size() != tsOpNeededColumns.size()) {
        return false;
    }
    boolean notEqual = false;
    for (int i = 0; i < prevTsOpNeededColumns.size(); i++) {
        if (!prevTsOpNeededColumns.get(i).equals(tsOpNeededColumns.get(i))) {
            notEqual = true;
            break;
        }
    }
    if (notEqual) {
        return false;
    }
    // If row limit does not match, we currently do not merge
    if (tsOp1.getConf().getRowLimit() != tsOp2.getConf().getRowLimit()) {
        return false;
    }
    // If partitions do not match, we currently do not merge
    PrunedPartitionList prevTsOpPPList = pctx.getPrunedPartitions(tsOp1);
    PrunedPartitionList tsOpPPList = pctx.getPrunedPartitions(tsOp2);
    if (!prevTsOpPPList.getPartitions().equals(tsOpPPList.getPartitions())) {
        return false;
    }
    // If is a DPP, check if actually it refers to same target, column, etc.
    // Further, the DPP value needs to be generated from same subtree
    List<Operator<?>> dppsOp1 = new ArrayList<>(optimizerCache.tableScanToDPPSource.get(tsOp1));
    List<Operator<?>> dppsOp2 = new ArrayList<>(optimizerCache.tableScanToDPPSource.get(tsOp2));
    if (dppsOp1.isEmpty() && dppsOp2.isEmpty()) {
        return true;
    }
    for (int i = 0; i < dppsOp1.size(); i++) {
        Operator<?> op = dppsOp1.get(i);
        if (op instanceof ReduceSinkOperator) {
            Set<Operator<?>> ascendants = findAscendantWorkOperators(pctx, optimizerCache, op);
            if (ascendants.contains(tsOp2)) {
                // This should not happen, we cannot merge
                return false;
            }
        }
    }
    for (int i = 0; i < dppsOp2.size(); i++) {
        Operator<?> op = dppsOp2.get(i);
        if (op instanceof ReduceSinkOperator) {
            Set<Operator<?>> ascendants = findAscendantWorkOperators(pctx, optimizerCache, op);
            if (ascendants.contains(tsOp1)) {
                // This should not happen, we cannot merge
                return false;
            }
        }
    }
    if (dppsOp1.size() != dppsOp2.size()) {
        // Only first or second operator contains DPP pruning
        return false;
    }
    // Check if DPP branches are equal
    BitSet bs = new BitSet();
    for (int i = 0; i < dppsOp1.size(); i++) {
        Operator<?> dppOp1 = dppsOp1.get(i);
        for (int j = 0; j < dppsOp2.size(); j++) {
            if (!bs.get(j)) {
                // If not visited yet
                Operator<?> dppOp2 = dppsOp2.get(j);
                if (compareAndGatherOps(pctx, dppOp1, dppOp2) != null) {
                    // The DPP operator/branch are equal
                    bs.set(j);
                    break;
                }
            }
        }
        if (bs.cardinality() < i + 1) {
            return false;
        }
    }
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) ArrayList(java.util.ArrayList) BitSet(java.util.BitSet)
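
The BitSet loop at the end of areMergeable is a greedy one-to-one matching: each DPP branch of the first table scan must claim a distinct, not-yet-claimed equal branch of the second. A self-contained sketch, with a predicate standing in for compareAndGatherOps.

import java.util.BitSet;
import java.util.List;
import java.util.function.BiPredicate;

// Every element of the first list must pair with a distinct, unclaimed
// equal element of the second, mirroring the bs.cardinality() check.
public class BranchMatcher {
    static <T> boolean allMatched(List<T> first, List<T> second, BiPredicate<T, T> equal) {
        BitSet claimed = new BitSet();
        for (int i = 0; i < first.size(); i++) {
            for (int j = 0; j < second.size(); j++) {
                if (!claimed.get(j) && equal.test(first.get(i), second.get(j))) {
                    claimed.set(j); // claim this partner so it cannot match twice
                    break;
                }
            }
            if (claimed.cardinality() < i + 1) {
                return false; // element i found no unclaimed partner
            }
        }
        return true;
    }

    public static void main(String[] args) {
        System.out.println(allMatched(List.of("a", "b"), List.of("b", "a"), String::equals)); // true
        System.out.println(allMatched(List.of("a", "a"), List.of("a", "b"), String::equals)); // false
    }
}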

Aggregations

ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 86
Operator (org.apache.hadoop.hive.ql.exec.Operator): 50
ArrayList (java.util.ArrayList): 48
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 45
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 35
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 31
HashMap (java.util.HashMap): 29
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 28
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 27
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 26
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 26
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 25
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 24
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 23
List (java.util.List): 19
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 19
LinkedHashMap (java.util.LinkedHashMap): 18
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 18
ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc): 18
AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator): 15