
Example 16 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class SemanticAnalyzer, method genReduceSinkPlan:

private Operator genReduceSinkPlan(String dest, QB qb, Operator<?> input, int numReducers, boolean hasOrderBy) throws SemanticException {
    RowResolver inputRR = opParseCtx.get(input).getRowResolver();
    // First generate the expressions for the partition and sort keys.
    // The CLUSTER BY / DISTRIBUTE BY clause carries the aliases for the
    // partition columns.
    ASTNode partitionExprs = qb.getParseInfo().getClusterByForClause(dest);
    if (partitionExprs == null) {
        partitionExprs = qb.getParseInfo().getDistributeByForClause(dest);
    }
    ArrayList<ExprNodeDesc> partCols = new ArrayList<ExprNodeDesc>();
    if (partitionExprs != null) {
        int ccount = partitionExprs.getChildCount();
        for (int i = 0; i < ccount; ++i) {
            ASTNode cl = (ASTNode) partitionExprs.getChild(i);
            partCols.add(genExprNodeDesc(cl, inputRR));
        }
    }
    ASTNode sortExprs = qb.getParseInfo().getClusterByForClause(dest);
    if (sortExprs == null) {
        sortExprs = qb.getParseInfo().getSortByForClause(dest);
    }
    if (sortExprs == null) {
        sortExprs = qb.getParseInfo().getOrderByForClause(dest);
        if (sortExprs != null) {
            assert numReducers == 1;
            // in strict mode, in the presence of order by, limit must be specified
            if (qb.getParseInfo().getDestLimit(dest) == null) {
                String error = StrictChecks.checkNoLimit(conf);
                if (error != null) {
                    throw new SemanticException(generateErrorMessage(sortExprs, error));
                }
            }
        }
    }
    ArrayList<ExprNodeDesc> sortCols = new ArrayList<ExprNodeDesc>();
    StringBuilder order = new StringBuilder();
    StringBuilder nullOrder = new StringBuilder();
    if (sortExprs != null) {
        int ccount = sortExprs.getChildCount();
        for (int i = 0; i < ccount; ++i) {
            ASTNode cl = (ASTNode) sortExprs.getChild(i);
            if (cl.getType() == HiveParser.TOK_TABSORTCOLNAMEASC) {
                // SortBy ASC
                order.append("+");
                cl = (ASTNode) cl.getChild(0);
                if (cl.getType() == HiveParser.TOK_NULLS_FIRST) {
                    nullOrder.append("a");
                } else if (cl.getType() == HiveParser.TOK_NULLS_LAST) {
                    nullOrder.append("z");
                } else {
                    throw new SemanticException("Unexpected null ordering option: " + cl.getType());
                }
                cl = (ASTNode) cl.getChild(0);
            } else if (cl.getType() == HiveParser.TOK_TABSORTCOLNAMEDESC) {
                // SortBy DESC
                order.append("-");
                cl = (ASTNode) cl.getChild(0);
                if (cl.getType() == HiveParser.TOK_NULLS_FIRST) {
                    nullOrder.append("a");
                } else if (cl.getType() == HiveParser.TOK_NULLS_LAST) {
                    nullOrder.append("z");
                } else {
                    throw new SemanticException("Unexpected null ordering option: " + cl.getType());
                }
                cl = (ASTNode) cl.getChild(0);
            } else {
                // ClusterBy
                order.append("+");
                nullOrder.append("a");
            }
            ExprNodeDesc exprNode = genExprNodeDesc(cl, inputRR);
            sortCols.add(exprNode);
        }
    }
    Operator result = genReduceSinkPlan(input, partCols, sortCols, order.toString(), nullOrder.toString(), numReducers, Operation.NOT_ACID, true);
    if (result.getParentOperators().size() == 1 && result.getParentOperators().get(0) instanceof ReduceSinkOperator) {
        ((ReduceSinkOperator) result.getParentOperators().get(0)).getConf().setHasOrderBy(hasOrderBy);
    }
    return result;
}
Also used : AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) ArrayList(java.util.ArrayList) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException)
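
The order and nullOrder strings built above use one character per sort key: '+' or '-' for ascending or descending, and 'a' or 'z' for NULLS FIRST or NULLS LAST. A minimal standalone sketch of that encoding (the SortFlagDecoder class is invented for illustration and is not part of Hive):

public final class SortFlagDecoder {
    // Decodes the per-key flag strings produced in genReduceSinkPlan:
    // '+' = ascending, '-' = descending; 'a' = nulls first, 'z' = nulls last.
    public static String describe(String order, String nullOrder) {
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < order.length(); i++) {
            if (i > 0) {
                sb.append("; ");
            }
            sb.append("key ").append(i)
              .append(order.charAt(i) == '+' ? " ASC" : " DESC")
              .append(nullOrder.charAt(i) == 'a' ? " NULLS FIRST" : " NULLS LAST");
        }
        return sb.toString();
    }

    public static void main(String[] args) {
        // Prints: key 0 ASC NULLS FIRST; key 1 DESC NULLS LAST
        System.out.println(describe("+-", "az"));
    }
}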

Example 17 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class GenMRRedSink1, method process:

/**
   * Reduce Sink encountered.
   * a) If we are seeing this RS for the first time, we initialize the plan corresponding to this RS.
   * b) If we are seeing this RS for the second or later time, then either the query had a join, in
   *    which case we merge this plan with the earlier plan involving this RS, or the plan for this
   *    RS needs to be split into two branches.
   *
   * @param nd
   *          the reduce sink operator encountered
   * @param opProcCtx
   *          context
   */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    ReduceSinkOperator op = (ReduceSinkOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    GenMapRedCtx mapredCtx = mapCurrCtx.get(stack.get(stack.size() - 2));
    Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
    MapredWork currPlan = (MapredWork) currTask.getWork();
    String currAliasId = mapredCtx.getCurrAliasId();
    if (op.getNumChild() != 1) {
        throw new IllegalStateException("Expecting operator " + op + " to have one child. " + "But found multiple children : " + op.getChildOperators());
    }
    Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
    Task<? extends Serializable> oldTask = ctx.getOpTaskMap().get(reducer);
    ctx.setCurrAliasId(currAliasId);
    ctx.setCurrTask(currTask);
    // If the plan for this reducer does not exist, initialize the plan
    if (oldTask == null) {
        if (currPlan.getReduceWork() == null) {
            GenMapRedUtils.initPlan(op, ctx);
        } else {
            GenMapRedUtils.splitPlan(op, ctx);
        }
    } else {
        // This will happen in case of joins. The current plan can be thrown away
        // after being merged with the original plan
        GenMapRedUtils.joinPlan(currTask, oldTask, ctx);
        currTask = oldTask;
        ctx.setCurrTask(currTask);
    }
    mapCurrCtx.put(op, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
    if (GenMapRedUtils.hasBranchFinished(nodeOutputs)) {
        ctx.addRootIfPossible(currTask);
        return false;
    }
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
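
Stripped of the Hive plumbing, the branching above is a three-way decision: initialize the plan on first encounter, split it if the current plan already has reduce work, or merge into the earlier plan when the reducer was seen before (the join case). A hypothetical skeleton of just that decision; the Task type and the init/split/merge helpers below are invented stand-ins, not Hive APIs:

public final class RedSinkDecision {
    static final class Task {
        final String name;
        Task(String name) { this.name = name; }
    }

    static Task init(Task t)  { System.out.println("init plan on " + t.name); return t; }
    static Task split(Task t) { System.out.println("split plan at " + t.name); return t; }
    static Task merge(Task cur, Task old) {
        System.out.println("merge " + cur.name + " into " + old.name);
        return old;
    }

    // Mirrors the shape of GenMRRedSink1.process: oldTask is the task already
    // mapped to the reducer (null on first encounter).
    static Task planForReduceSink(Task oldTask, boolean planHasReduceWork, Task currTask) {
        if (oldTask == null) {
            return planHasReduceWork ? split(currTask) : init(currTask);
        }
        return merge(currTask, oldTask); // join case: continue from the old task
    }

    public static void main(String[] args) {
        Task current = new Task("map-task-1");
        planForReduceSink(null, false, current);                // first encounter: init
        planForReduceSink(null, true, current);                 // plan has reduce work: split
        planForReduceSink(new Task("old-task"), true, current); // seen before: merge
    }
}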

Example 18 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class GenMRRedSink2, method process:

/**
   * Reduce Sink encountered.
   *
   * @param nd
   *          the reduce sink operator encountered
   * @param opProcCtx
   *          context
   */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    ReduceSinkOperator op = (ReduceSinkOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0));
    Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
    String currAliasId = mapredCtx.getCurrAliasId();
    Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
    Map<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap();
    Task<? extends Serializable> oldTask = opTaskMap.get(reducer);
    ctx.setCurrAliasId(currAliasId);
    ctx.setCurrTask(currTask);
    if (oldTask == null) {
        GenMapRedUtils.splitPlan(op, ctx);
    } else {
        GenMapRedUtils.splitPlan(op, currTask, oldTask, ctx);
        currTask = oldTask;
        ctx.setCurrTask(currTask);
    }
    mapCurrCtx.put(op, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
    if (GenMapRedUtils.hasBranchFinished(nodeOutputs)) {
        ctx.addRootIfPossible(currTask);
        return false;
    }
    return true;
}
Also used : Operator(org.apache.hadoop.hive.ql.exec.Operator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 19 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class GenMRRedSink3, method process:

/**
   * Reduce Sink encountered.
   *
   * @param nd
   *          the reduce sink operator encountered
   * @param opProcCtx
   *          context
   */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    ReduceSinkOperator op = (ReduceSinkOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    // The union consisted of a bunch of map-reduce jobs, and it has been
    // split at the union.
    Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
    UnionOperator union = Utils.findNode(stack, UnionOperator.class);
    assert union != null;
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    GenMapRedCtx mapredCtx = mapCurrCtx.get(union);
    Task<? extends Serializable> unionTask = null;
    if (mapredCtx != null) {
        unionTask = mapredCtx.getCurrTask();
    } else {
        unionTask = ctx.getCurrTask();
    }
    MapredWork plan = (MapredWork) unionTask.getWork();
    HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap();
    Task<? extends Serializable> reducerTask = opTaskMap.get(reducer);
    ctx.setCurrTask(unionTask);
    // If the plan for this reducer does not exist, initialize the plan
    if (reducerTask == null) {
        // When the reducer is encountered for the first time
        if (plan.getReduceWork() == null) {
            GenMapRedUtils.initUnionPlan(op, union, ctx, unionTask);
        // When union is followed by a multi-table insert
        } else {
            GenMapRedUtils.splitPlan(op, ctx);
        }
    } else if (plan.getReduceWork() != null && plan.getReduceWork().getReducer() == reducer) {
        // The union is already initialized. However, the union is walked from
        // another input
        // initUnionPlan is idempotent
        GenMapRedUtils.initUnionPlan(op, union, ctx, unionTask);
    } else {
        GenMapRedUtils.joinUnionPlan(ctx, union, unionTask, reducerTask, false);
        ctx.setCurrTask(reducerTask);
    }
    mapCurrCtx.put(op, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
    // the union operator has been processed
    ctx.setCurrUnionOp(null);
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
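
The wrinkle in this example is that a union can be walked from several inputs, so plan initialization has to be idempotent (the comment above notes that initUnionPlan is idempotent). A minimal sketch of that pattern; the UnionPlan class and its members are invented for illustration:

import java.util.LinkedHashSet;
import java.util.Set;

public final class UnionPlan {
    private final Set<String> inputs = new LinkedHashSet<>();
    private boolean initialized;

    // Safe to call once per union input: the one-time setup runs only on the
    // first call, and the set quietly absorbs duplicate input names.
    void initFromInput(String input) {
        inputs.add(input);
        if (!initialized) {
            initialized = true;
            System.out.println("plan initialized");
        }
    }

    public static void main(String[] args) {
        UnionPlan plan = new UnionPlan();
        plan.initFromInput("union-branch-1"); // initializes the plan
        plan.initFromInput("union-branch-2"); // idempotent: only records the input
        System.out.println(plan.inputs);      // [union-branch-1, union-branch-2]
    }
}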

Example 20 with ReduceSinkOperator

Use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.

From the class ConvertJoinMapJoin, method checkConvertJoinSMBJoin:

/*
   * This method tries to convert a join to an SMB join. The decision is based
   * on traits: if the sort columns are the same as the join columns, we can
   * convert the join to an SMB join. Otherwise we retain the bucket map join,
   * as it is still more efficient than a regular join.
   */
private boolean checkConvertJoinSMBJoin(JoinOperator joinOp, OptimizeTezProcContext context, int bigTablePosition, TezBucketJoinProcCtx tezBucketJoinProcCtx) throws SemanticException {
    ReduceSinkOperator bigTableRS = (ReduceSinkOperator) joinOp.getParentOperators().get(bigTablePosition);
    int numBuckets = bigTableRS.getParentOperators().get(0).getOpTraits().getNumBuckets();
    int size = -1;
    for (Operator<?> parentOp : joinOp.getParentOperators()) {
        // Every join input must agree on whether it has upstream RS operators;
        // if the sides are unbalanced, we cannot convert. This is a workaround
        // for now. The right fix would be to refactor the code in
        // MapRecordProcessor and ReduceRecordProcessor with respect to the sources.
        @SuppressWarnings({ "rawtypes", "unchecked" }) Set<ReduceSinkOperator> set = OperatorUtils.findOperatorsUpstream(parentOp.getParentOperators(), ReduceSinkOperator.class);
        if (size < 0) {
            size = set.size();
            continue;
        }
        if (((size > 0) && (set.size() > 0)) || ((size == 0) && (set.size() == 0))) {
            continue;
        } else {
            return false;
        }
    }
    // The sort and bucket columns have to match on both sides for this
    // transformation of the join operation.
    for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
        if (!(parentOp instanceof ReduceSinkOperator)) {
            // could be mux/demux operators. Currently not supported
            LOG.info("Found correlation optimizer operators. Cannot convert to SMB at this time.");
            return false;
        }
        ReduceSinkOperator rsOp = (ReduceSinkOperator) parentOp;
        if (!checkColEquality(rsOp.getParentOperators().get(0).getOpTraits().getSortCols(), rsOp.getOpTraits().getSortCols(), rsOp.getColumnExprMap(), tezBucketJoinProcCtx, false)) {
            LOG.info("We cannot convert to SMB because the sort column names do not match.");
            return false;
        }
        if (!checkColEquality(rsOp.getParentOperators().get(0).getOpTraits().getBucketColNames(), rsOp.getOpTraits().getBucketColNames(), rsOp.getColumnExprMap(), tezBucketJoinProcCtx, true)) {
            LOG.info("We cannot convert to SMB because bucket column names do not match.");
            return false;
        }
    }
    if (numBuckets < 0) {
        numBuckets = bigTableRS.getConf().getNumReducers();
    }
    tezBucketJoinProcCtx.setNumBuckets(numBuckets);
    LOG.info("We can convert the join to an SMB join.");
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)
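
The first loop above is a balance check: every join input must agree on whether it has upstream reduce sinks (all counts positive, or all zero); a mixed state blocks the conversion. A self-contained distillation of that check, with invented names:

import java.util.List;

public final class BalanceCheck {
    // Returns true if all join inputs agree on having (or not having)
    // upstream ReduceSink operators; mirrors the size/set.size() comparison
    // in checkConvertJoinSMBJoin.
    static boolean isBalanced(List<Integer> upstreamRsCounts) {
        int first = -1;
        for (int count : upstreamRsCounts) {
            if (first < 0) {
                first = count;
                continue;
            }
            boolean bothHave = first > 0 && count > 0;
            boolean bothEmpty = first == 0 && count == 0;
            if (!bothHave && !bothEmpty) {
                return false; // unbalanced: cannot convert to SMB
            }
        }
        return true;
    }

    public static void main(String[] args) {
        System.out.println(isBalanced(List.of(2, 1, 3))); // true: every side has RS
        System.out.println(isBalanced(List.of(0, 0)));    // true: no side has RS
        System.out.println(isBalanced(List.of(1, 0)));    // false: mixed
    }
}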

Aggregations

ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 62 uses
Operator (org.apache.hadoop.hive.ql.exec.Operator): 37 uses
ArrayList (java.util.ArrayList): 34 uses
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 29 uses
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 27 uses
HashMap (java.util.HashMap): 23 uses
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 21 uses
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 20 uses
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 18 uses
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 18 uses
List (java.util.List): 17 uses
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 17 uses
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 17 uses
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 16 uses
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 14 uses
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 14 uses
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 14 uses
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 14 uses
LinkedHashMap (java.util.LinkedHashMap): 13 uses
ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc): 12 uses