Search in sources :

Example 1 with UnionOperator

use of org.apache.hadoop.hive.ql.exec.UnionOperator in project hive by apache.

the class GenMRFileSink1 method processFS.

/**
 * Handles a FileSink operator: registers it in the context's list of seen
 * FileSinks, asks {@code GenMapRedUtils.createMoveTask} for the destination,
 * and then attaches the current top operator and/or pending union to a task.
 *
 * @param fsOp
 *          current FileSink operator
 * @param stack
 *          parent operators (not read by this method)
 * @param opProcCtx
 *          processor context; cast to {@code GenMRProcContext}
 * @param chDir
 *          whether the operator should be first output to a tmp dir and then merged
 *          to the final dir later
 * @return the final file name to which the FileSinkOperator should store, as
 *         returned by {@code GenMapRedUtils.createMoveTask}
 * @throws SemanticException
 */
private Path processFS(FileSinkOperator fsOp, Stack<Node> stack, NodeProcessorCtx opProcCtx, boolean chDir) throws SemanticException {
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    Task<? extends Serializable> task = ctx.getCurrTask();

    // Record this FileSink in the context, creating the list on first use.
    List<FileSinkOperator> fileSinks = ctx.getSeenFileSinkOps();
    if (fileSinks == null) {
        fileSinks = new ArrayList<FileSinkOperator>();
    }
    if (!fileSinks.contains(fsOp)) {
        fileSinks.add(fsOp);
    }
    ctx.setSeenFileSinkOps(fileSinks);

    // If the directory needs to be changed, createMoveTask hands back the new directory.
    final Path dest = GenMapRedUtils.createMoveTask(ctx.getCurrTask(), chDir, fsOp, ctx.getParseCtx(), ctx.getMvTask(), ctx.getConf(), ctx.getDependencyTaskForMultiInsert());

    TableScanOperator topOp = ctx.getCurrTopOp();
    String aliasId = ctx.getCurrAliasId();
    HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap();

    if (topOp == null) {
        // No top operator to plan; only a pending union (if any) needs initializing.
        UnionOperator pendingUnion = ctx.getCurrUnionOp();
        if (pendingUnion != null) {
            opTaskMap.put(null, task);
            GenMapRedUtils.initUnionPlan(ctx, pendingUnion, task, false);
        }
        return dest;
    }

    // If it is a map-only job, the task needs to be processed.
    Task<? extends Serializable> mapTask = opTaskMap.get(null);
    boolean alreadySeen = ctx.isSeenOp(task, topOp);

    if (!alreadySeen) {
        // Plan against the task registered under the null key when there is one,
        // otherwise against the current task.
        Task<? extends Serializable> planOwner = mapTask;
        if (planOwner == null) {
            planOwner = task;
        }
        GenMapRedUtils.setTaskPlan(aliasId, topOp, planOwner, false, ctx);
    }

    if (mapTask == null) {
        opTaskMap.put(null, task);
    } else if (alreadySeen) {
        UnionOperator pendingUnion = ctx.getCurrUnionOp();
        if (pendingUnion != null) {
            opTaskMap.put(null, task);
            ctx.setCurrTopOp(null);
            GenMapRedUtils.initUnionPlan(ctx, pendingUnion, task, false);
        }
    }
    // Note: mapTask and the current task should be merged by a join/union operator
    // (e.g., GenMRUnion1) which has multiple topOps; this method does not assert it.
    return dest;
}
Also used : Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Task(org.apache.hadoop.hive.ql.exec.Task) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask) Serializable(java.io.Serializable) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 2 with UnionOperator

use of org.apache.hadoop.hive.ql.exec.UnionOperator in project hive by apache.

the class GenMRRedSink3 method process.

/**
 * Reduce sink encountered below a union.
 *
 * Locates the enclosing UnionOperator on the walker stack, picks the task
 * associated with that union, and then initializes, splits or joins the plan
 * depending on whether the reducer already has a task.
 *
 * @param nd
 *          the reduce sink operator encountered
 * @param stack
 *          operators on the current walk path; searched for the UnionOperator
 * @param opProcCtx
 *          context
 * @param nodeOutputs
 *          not read by this method
 * @return always {@code true}
 */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    ReduceSinkOperator reduceSink = (ReduceSinkOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;

    // union consisted on a bunch of map-reduce jobs, and it has been split at
    // the union
    Operator<? extends OperatorDesc> reducer = reduceSink.getChildOperators().get(0);
    UnionOperator union = Utils.findNode(stack, UnionOperator.class);
    assert union != null;

    // Prefer the task recorded for the union; fall back to the context's current task.
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    GenMapRedCtx unionCtx = mapCurrCtx.get(union);
    Task<? extends Serializable> unionTask = (unionCtx != null) ? unionCtx.getCurrTask() : ctx.getCurrTask();

    MapredWork plan = (MapredWork) unionTask.getWork();
    HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap();
    Task<? extends Serializable> reducerTask = opTaskMap.get(reducer);
    ctx.setCurrTask(unionTask);

    if (reducerTask != null) {
        boolean sameReducer = plan.getReduceWork() != null && plan.getReduceWork().getReducer() == reducer;
        if (sameReducer) {
            // The union is already initialized. However, the union is walked from
            // another input
            // initUnionPlan is idempotent
            GenMapRedUtils.initUnionPlan(reduceSink, union, ctx, unionTask);
        } else {
            GenMapRedUtils.joinUnionPlan(ctx, union, unionTask, reducerTask, false);
            ctx.setCurrTask(reducerTask);
        }
    } else if (plan.getReduceWork() != null) {
        // No plan for this reducer yet, but the union task already has reduce work:
        // union is followed by a multi-table insert
        GenMapRedUtils.splitPlan(reduceSink, ctx);
    } else {
        // The reducer is encountered for the first time: initialize the plan
        GenMapRedUtils.initUnionPlan(reduceSink, union, ctx, unionTask);
    }

    mapCurrCtx.put(reduceSink, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
    // the union operator has been processed
    ctx.setCurrUnionOp(null);
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 3 with UnionOperator

use of org.apache.hadoop.hive.ql.exec.UnionOperator in project hive by apache.

the class ColumnPrunerProcCtx method handleFilterUnionChildren.

/**
 * If the input filter operator has direct child(ren) which are union operator,
 * and the filter's pruned column count is not the same as the union's,
 * create a select operator between them. The select operator has the same number
 * of columns as the pruned child operator.
 *
 * The children are walked over a snapshot of the child list, because the loop
 * body detaches the union from {@code curOp} and attaches a new select operator
 * to it; mutating the list being iterated would otherwise risk a
 * {@code ConcurrentModificationException} when the union is not the last child
 * (assuming {@code getChildOperators()} exposes the live list).
 *
 * @param curOp
 *          The filter operator which need to handle children. Operators that are
 *          not a {@code FilterOperator}, or that have no children, are ignored.
 * @throws SemanticException
 */
public void handleFilterUnionChildren(Operator<? extends OperatorDesc> curOp) throws SemanticException {
    if (curOp.getChildOperators() == null || !(curOp instanceof FilterOperator)) {
        return;
    }
    List<FieldNode> parentPrunList = prunedColLists.get(curOp);
    if (parentPrunList == null || parentPrunList.size() == 0) {
        return;
    }
    // Snapshot: the loop body rewires curOp's children.
    List<Operator<? extends OperatorDesc>> children = new ArrayList<Operator<? extends OperatorDesc>>(curOp.getChildOperators());
    for (Operator<? extends OperatorDesc> child : children) {
        if (!(child instanceof UnionOperator)) {
            continue;
        }
        List<FieldNode> prunList = genColLists(child);
        // Nothing to do when the union needs no columns or already matches the filter's column count.
        if (prunList == null || prunList.size() == 0 || parentPrunList.size() == prunList.size()) {
            continue;
        }
        ArrayList<ExprNodeDesc> exprs = new ArrayList<ExprNodeDesc>();
        ArrayList<String> outputColNames = new ArrayList<String>();
        Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
        ArrayList<ColumnInfo> outputRS = new ArrayList<ColumnInfo>();
        // Project only the union columns that survive pruning, keeping their internal names.
        for (ColumnInfo colInfo : child.getSchema().getSignature()) {
            if (lookupColumn(prunList, colInfo.getInternalName()) == null) {
                continue;
            }
            ExprNodeDesc colDesc = new ExprNodeColumnDesc(colInfo.getType(), colInfo.getInternalName(), colInfo.getTabAlias(), colInfo.getIsVirtualCol());
            exprs.add(colDesc);
            outputColNames.add(colInfo.getInternalName());
            ColumnInfo newCol = new ColumnInfo(colInfo.getInternalName(), colInfo.getType(), colInfo.getTabAlias(), colInfo.getIsVirtualCol(), colInfo.isHiddenVirtualCol());
            newCol.setAlias(colInfo.getAlias());
            outputRS.add(newCol);
            colExprMap.put(colInfo.getInternalName(), colDesc);
        }
        // Splice the select in: curOp -> sel -> child.
        SelectDesc select = new SelectDesc(exprs, outputColNames, false);
        curOp.removeChild(child);
        SelectOperator sel = (SelectOperator) OperatorFactory.getAndMakeChild(select, new RowSchema(outputRS), curOp);
        OperatorFactory.makeChild(sel, child);
        sel.setColumnExprMap(colExprMap);
    }
}
Also used : RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) SelectDesc(org.apache.hadoop.hive.ql.plan.SelectDesc)

Example 4 with UnionOperator

use of org.apache.hadoop.hive.ql.exec.UnionOperator in project hive by apache.

the class ConstantPropagateProcCtx method getPropagatedConstants.

/**
 * Get propagated constant map from parents.
 *
 * Traverse all parents of current operator, if there is propagated constant (determined by
 * assignment expression like column=constant value), map it by column position and carry it
 * over to the matching column of the current operator.
 *
 * The per-parent position maps are kept index-aligned with
 * {@code op.getParentOperators()}: a parent with no recorded constants contributes an empty
 * map. The join branch looks parents up by join tag, so a missing entry would otherwise shift
 * the indices and read another parent's constants (or run past the end of the list).
 *
 * @param op
 *        operator getting the propagated constants.
 * @return map of ColumnInfo to ExprNodeDesc. The values of that map must be either
 *         ExprNodeConstantDesc or ExprNodeNullDesc. Empty (never null) when the operator has
 *         no schema, no parents, or no parent carries constants.
 */
public Map<ColumnInfo, ExprNodeDesc> getPropagatedConstants(Operator<? extends Serializable> op) {
    // this map should map columnInfo to ExprConstantNodeDesc
    Map<ColumnInfo, ExprNodeDesc> constants = new HashMap<ColumnInfo, ExprNodeDesc>();
    if (op.getSchema() == null) {
        return constants;
    }
    RowSchema rs = op.getSchema();
    LOG.debug("Getting constants of op:" + op + " with rs:" + rs);
    if (op.getParentOperators() == null) {
        return constants;
    }
    // A previous solution is based on tableAlias and colAlias, which is
    // unsafe, esp. when CBO generates derived table names. see HIVE-13602.
    // For correctness purpose, we only trust colExpMap.
    // We assume that CBO can do the constantPropagation before this function is
    // called to help improve the performance.
    // UnionOperator, LimitOperator and FilterOperator are special, they should already be
    // column-position aligned.
    List<Map<Integer, ExprNodeDesc>> parentsToConstant = new ArrayList<>();
    boolean areAllParentsContainConstant = true;
    boolean noParentsContainConstant = true;
    for (Operator<?> parent : op.getParentOperators()) {
        Map<ColumnInfo, ExprNodeDesc> constMap = opToConstantExprs.get(parent);
        // Stays empty for a parent without constants, so list index == parent index.
        Map<Integer, ExprNodeDesc> map = new HashMap<>();
        if (constMap == null) {
            LOG.debug("Constant of Op " + parent.getOperatorId() + " is not found");
            areAllParentsContainConstant = false;
        } else {
            noParentsContainConstant = false;
            for (Entry<ColumnInfo, ExprNodeDesc> entry : constMap.entrySet()) {
                map.put(parent.getSchema().getPosition(entry.getKey().getInternalName()), entry.getValue());
            }
            LOG.debug("Constant of Op " + parent.getOperatorId() + " " + constMap);
        }
        parentsToConstant.add(map);
    }
    if (noParentsContainConstant) {
        return constants;
    }
    List<ColumnInfo> signature = rs.getSignature();
    if (op instanceof LimitOperator || op instanceof FilterOperator) {
        // there should be only one parent.
        if (op.getParentOperators().size() == 1) {
            Map<Integer, ExprNodeDesc> parentToConstant = parentsToConstant.get(0);
            for (int index = 0; index < signature.size(); index++) {
                if (parentToConstant.containsKey(index)) {
                    constants.put(signature.get(index), parentToConstant.get(index));
                }
            }
        }
    } else if (op instanceof UnionOperator && areAllParentsContainConstant) {
        for (int index = 0; index < signature.size(); index++) {
            ExprNodeDesc constant = null;
            for (Map<Integer, ExprNodeDesc> parentToConstant : parentsToConstant) {
                if (!parentToConstant.containsKey(index)) {
                    // if this parent does not contain a constant at this position, we
                    // continue to look at other positions.
                    constant = null;
                    break;
                }
                ExprNodeDesc nextConstant = parentToConstant.get(index);
                if (constant == null) {
                    constant = nextConstant;
                } else if (!nextConstant.isSame(constant)) {
                    // they are not the same constant. for example, union all of 1
                    // and 2.
                    constant = null;
                    break;
                }
            }
            // we have checked all the parents for the "index" position.
            if (constant != null) {
                constants.put(signature.get(index), constant);
            }
        }
    } else if (op instanceof JoinOperator) {
        JoinOperator joinOp = (JoinOperator) op;
        Map<String, ExprNodeDesc> colExprMap = op.getColumnExprMap();
        for (Entry<Byte, List<ExprNodeDesc>> e : joinOp.getConf().getExprs().entrySet()) {
            int tag = e.getKey();
            Operator<?> parent = op.getParentOperators().get(tag);
            List<ExprNodeDesc> exprs = e.getValue();
            if (exprs == null) {
                continue;
            }
            // Index-aligned with the parent list, so the tag selects the right parent's constants.
            Map<Integer, ExprNodeDesc> parentToConstant = parentsToConstant.get(tag);
            for (ExprNodeDesc expr : exprs) {
                // we are only interested in ExprNodeColumnDesc
                if (!(expr instanceof ExprNodeColumnDesc)) {
                    continue;
                }
                String parentColName = ((ExprNodeColumnDesc) expr).getColumn();
                // find this parentColName in its parent's rs
                int parentPos = parent.getSchema().getPosition(parentColName);
                if (!parentToConstant.containsKey(parentPos) || colExprMap == null) {
                    continue;
                }
                // reverse look up colExprMap to find the childColName
                for (Entry<String, ExprNodeDesc> entry : colExprMap.entrySet()) {
                    if (!entry.getValue().isSame(expr)) {
                        continue;
                    }
                    int childPos = rs.getPosition(entry.getKey());
                    if (childPos == -1) {
                        // Not present in this operator's schema (same guard as the
                        // single-parent branch below).
                        continue;
                    }
                    // now propagate the constant from the parent to the child
                    constants.put(signature.get(childPos), parentToConstant.get(parentPos));
                }
            }
        }
    } else {
        // there should be only one parent.
        if (op.getParentOperators().size() == 1) {
            Operator<?> parent = op.getParentOperators().get(0);
            Map<String, ExprNodeDesc> colExprMap = op.getColumnExprMap();
            if (colExprMap != null) {
                Map<Integer, ExprNodeDesc> parentToConstant = parentsToConstant.get(0);
                for (Entry<String, ExprNodeDesc> entry : colExprMap.entrySet()) {
                    int childPos = rs.getPosition(entry.getKey());
                    if (childPos == -1) {
                        // Not present
                        continue;
                    }
                    ExprNodeDesc expr = entry.getValue();
                    if (expr instanceof ExprNodeColumnDesc) {
                        String parentColName = ((ExprNodeColumnDesc) expr).getColumn();
                        // find this parentColName in its parent's rs
                        int parentPos = parent.getSchema().getPosition(parentColName);
                        if (parentToConstant.containsKey(parentPos)) {
                            // this position in parent is a constant
                            // now propagate the constant from the parent to the child
                            constants.put(signature.get(childPos), parentToConstant.get(parentPos));
                        }
                    }
                }
            }
        }
    }
    LOG.debug("Offering constants " + constants.keySet() + " to operator " + op.toString());
    return constants;
}
Also used : JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) Entry(java.util.Map.Entry) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ArrayList(java.util.ArrayList) List(java.util.List) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) HashMap(java.util.HashMap) Map(java.util.Map)

Example 5 with UnionOperator

use of org.apache.hadoop.hive.ql.exec.UnionOperator in project hive by apache.

the class GenMRUnion1 method process.

/**
 * Union Operator encountered . Currently, the algorithm is pretty simple: If
 * all the sub-queries are map-only, don't do anything. Otherwise, insert a
 * FileSink on top of all the sub-queries.
 *
 * This can be optimized later on.
 *
 * @param nd
 *          the union operator encountered
 * @param stack
 *          operators on the current walk path; used to find which parent of the
 *          union is being visited
 * @param opProcCtx
 *          context
 * @param nodeOutputs
 *          not read by this method
 * @return {@code null} when all inputs of the union are in the same reducer,
 *         the result of {@code processMapOnlyUnion} when every sub-query is
 *         map-only, and {@code true} otherwise
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    UnionOperator union = (UnionOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    ParseContext parseCtx = ctx.getParseCtx();
    UnionProcContext uCtx = parseCtx.getUCtx();
    // Map-only subqueries can be optimized in future to not write to a file in
    // future
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    if (union.getConf().isAllInputsInSameReducer()) {
        // All inputs of this UnionOperator are in the same Reducer.
        // We do not need to break the operator tree.
        mapCurrCtx.put((Operator<? extends OperatorDesc>) nd, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
        return null;
    }
    UnionParseContext uPrsCtx = uCtx.getUnionParseContext(union);
    ctx.setCurrUnionOp(union);
    // Check before the first dereference; asserting afterwards could never fire usefully.
    assert uPrsCtx != null;
    // Every sub-query is map-only: handled separately, no intermediate task needed here.
    if (uPrsCtx.allMapOnlySubQ()) {
        return processMapOnlyUnion(union, stack, ctx, uCtx);
    }
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    int pos = UnionProcFactory.getPositionParent(union, stack);
    Task<? extends Serializable> uTask;
    GenMRUnionCtx uCtxTask = ctx.getUnionTask(union);
    if (uCtxTask == null) {
        // union is encountered for the first time: create its task and remember it
        MapredWork uPlan = GenMapRedUtils.getMapRedWork(parseCtx);
        uTask = TaskFactory.get(uPlan, parseCtx.getConf());
        uCtxTask = new GenMRUnionCtx(uTask);
        ctx.setUnionTask(union, uCtxTask);
    } else {
        uTask = uCtxTask.getUTask();
    }
    if (uPrsCtx.getMapOnlySubq(pos) && uPrsCtx.getRootTask(pos)) {
        // Copy into the current union task plan if the sub-query at this position
        // is flagged both map-only and root task.
        processSubQueryUnionMerge(ctx, uCtxTask, union, stack);
        // The current task no longer stands as a root task; no-op if it was not one.
        ctx.getRootTasks().remove(currTask);
    } else {
        // If it a map-reduce job, create a temporary file
        // is the current task a root task
        if (shouldBeRootTask(currTask) && !ctx.getRootTasks().contains(currTask) && (currTask.getParentTasks() == null || currTask.getParentTasks().isEmpty())) {
            ctx.getRootTasks().add(currTask);
        }
        processSubQueryUnionCreateIntermediate(union.getParentOperators().get(pos), union, uTask, ctx, uCtxTask);
        // the currAliasId and CurrTopOp is not valid any more
        ctx.setCurrAliasId(null);
        ctx.setCurrTopOp(null);
        ctx.getOpTaskMap().put(null, uTask);
    }
    ctx.setCurrTask(uTask);
    mapCurrCtx.put((Operator<? extends OperatorDesc>) nd, new GenMapRedCtx(ctx.getCurrTask(), null));
    return true;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) UnionProcContext(org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext) UnionParseContext(org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext.UnionParseContext) GenMRUnionCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMRUnionCtx) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) UnionParseContext(org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext.UnionParseContext) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Aggregations

UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)11 Operator (org.apache.hadoop.hive.ql.exec.Operator)6 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)5 OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)5 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)4 FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)4 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)4 Serializable (java.io.Serializable)3 ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)3 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)3 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)3 RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema)3 SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator)3 Task (org.apache.hadoop.hive.ql.exec.Task)3 MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork)3 LinkedHashMap (java.util.LinkedHashMap)2 Map (java.util.Map)2 Entry (java.util.Map.Entry)2