Search in sources :

Example 26 with OperatorDesc

use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.

the class GenMRFileSink1 method processFS.

/**
   * Process the FileSink operator to generate a MoveTask if necessary.
   *
   * @param fsOp
   *          current FileSink operator
   * @param stack
   *          parent operators
   * @param opProcCtx
   * @param chDir
   *          whether the operator should be first output to a tmp dir and then merged
   *          to the final dir later
   * @return the final file name to which the FileSinkOperator should store.
   * @throws SemanticException
   */
private Path processFS(FileSinkOperator fsOp, Stack<Node> stack, NodeProcessorCtx opProcCtx, boolean chDir) throws SemanticException {
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    // If the directory needs to be changed, send the new directory
    Path dest = null;
    List<FileSinkOperator> seenFSOps = ctx.getSeenFileSinkOps();
    if (seenFSOps == null) {
        seenFSOps = new ArrayList<FileSinkOperator>();
    }
    if (!seenFSOps.contains(fsOp)) {
        seenFSOps.add(fsOp);
    }
    ctx.setSeenFileSinkOps(seenFSOps);
    dest = GenMapRedUtils.createMoveTask(ctx.getCurrTask(), chDir, fsOp, ctx.getParseCtx(), ctx.getMvTask(), ctx.getConf(), ctx.getDependencyTaskForMultiInsert());
    TableScanOperator currTopOp = ctx.getCurrTopOp();
    String currAliasId = ctx.getCurrAliasId();
    HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap();
    // If it is a map-only job, the task needs to be processed
    if (currTopOp != null) {
        Task<? extends Serializable> mapTask = opTaskMap.get(null);
        if (mapTask == null) {
            if (!ctx.isSeenOp(currTask, currTopOp)) {
                GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, currTask, false, ctx);
            }
            opTaskMap.put(null, currTask);
        } else {
            if (!ctx.isSeenOp(currTask, currTopOp)) {
                GenMapRedUtils.setTaskPlan(currAliasId, currTopOp, mapTask, false, ctx);
            } else {
                UnionOperator currUnionOp = ctx.getCurrUnionOp();
                if (currUnionOp != null) {
                    opTaskMap.put(null, currTask);
                    ctx.setCurrTopOp(null);
                    GenMapRedUtils.initUnionPlan(ctx, currUnionOp, currTask, false);
                    return dest;
                }
            }
        // mapTask and currTask should be merged by and join/union operator
        // (e.g., GenMRUnion1) which has multiple topOps.
        // assert mapTask == currTask : "mapTask.id = " + mapTask.getId()
        // + "; currTask.id = " + currTask.getId();
        }
        return dest;
    }
    UnionOperator currUnionOp = ctx.getCurrUnionOp();
    if (currUnionOp != null) {
        opTaskMap.put(null, currTask);
        GenMapRedUtils.initUnionPlan(ctx, currUnionOp, currTask, false);
        return dest;
    }
    return dest;
}
Also used : Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Task(org.apache.hadoop.hive.ql.exec.Task) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask) Serializable(java.io.Serializable) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 27 with OperatorDesc

use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.

the class GenMRRedSink1 method process.

/**
   * Reduce Sink encountered.
   * a) If we are seeing this RS for first time, we initialize plan corresponding to this RS.
   * b) If we are seeing this RS for second or later time then either query had a join in which
   *    case we will merge this plan with earlier plan involving this RS or plan for this RS
   *    needs to be split in two branches.
   *
   * @param nd
   *          the reduce sink operator encountered
   * @param opProcCtx
   *          context
   */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    ReduceSinkOperator op = (ReduceSinkOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    GenMapRedCtx mapredCtx = mapCurrCtx.get(stack.get(stack.size() - 2));
    Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
    MapredWork currPlan = (MapredWork) currTask.getWork();
    String currAliasId = mapredCtx.getCurrAliasId();
    if (op.getNumChild() != 1) {
        throw new IllegalStateException("Expecting operator " + op + " to have one child. " + "But found multiple children : " + op.getChildOperators());
    }
    Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
    Task<? extends Serializable> oldTask = ctx.getOpTaskMap().get(reducer);
    ctx.setCurrAliasId(currAliasId);
    ctx.setCurrTask(currTask);
    // If the plan for this reducer does not exist, initialize the plan
    if (oldTask == null) {
        if (currPlan.getReduceWork() == null) {
            GenMapRedUtils.initPlan(op, ctx);
        } else {
            GenMapRedUtils.splitPlan(op, ctx);
        }
    } else {
        // This will happen in case of joins. The current plan can be thrown away
        // after being merged with the original plan
        GenMapRedUtils.joinPlan(currTask, oldTask, ctx);
        currTask = oldTask;
        ctx.setCurrTask(currTask);
    }
    mapCurrCtx.put(op, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
    if (GenMapRedUtils.hasBranchFinished(nodeOutputs)) {
        ctx.addRootIfPossible(currTask);
        return false;
    }
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 28 with OperatorDesc

use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.

the class GenMRRedSink2 method process.

/**
   * Reduce Scan encountered.
   *
   * @param nd
   *          the reduce sink operator encountered
   * @param opProcCtx
   *          context
   */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    ReduceSinkOperator op = (ReduceSinkOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0));
    Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
    String currAliasId = mapredCtx.getCurrAliasId();
    Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
    Map<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap();
    Task<? extends Serializable> oldTask = opTaskMap.get(reducer);
    ctx.setCurrAliasId(currAliasId);
    ctx.setCurrTask(currTask);
    if (oldTask == null) {
        GenMapRedUtils.splitPlan(op, ctx);
    } else {
        GenMapRedUtils.splitPlan(op, currTask, oldTask, ctx);
        currTask = oldTask;
        ctx.setCurrTask(currTask);
    }
    mapCurrCtx.put(op, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
    if (GenMapRedUtils.hasBranchFinished(nodeOutputs)) {
        ctx.addRootIfPossible(currTask);
        return false;
    }
    return true;
}
Also used : Operator(org.apache.hadoop.hive.ql.exec.Operator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 29 with OperatorDesc

use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.

the class GenMRRedSink3 method process.

/**
   * Reduce Scan encountered.
   *
   * @param nd
   *          the reduce sink operator encountered
   * @param opProcCtx
   *          context
   */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    ReduceSinkOperator op = (ReduceSinkOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    // union consisted on a bunch of map-reduce jobs, and it has been split at
    // the union
    Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
    UnionOperator union = Utils.findNode(stack, UnionOperator.class);
    assert union != null;
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    GenMapRedCtx mapredCtx = mapCurrCtx.get(union);
    Task<? extends Serializable> unionTask = null;
    if (mapredCtx != null) {
        unionTask = mapredCtx.getCurrTask();
    } else {
        unionTask = ctx.getCurrTask();
    }
    MapredWork plan = (MapredWork) unionTask.getWork();
    HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap();
    Task<? extends Serializable> reducerTask = opTaskMap.get(reducer);
    ctx.setCurrTask(unionTask);
    // If the plan for this reducer does not exist, initialize the plan
    if (reducerTask == null) {
        // When the reducer is encountered for the first time
        if (plan.getReduceWork() == null) {
            GenMapRedUtils.initUnionPlan(op, union, ctx, unionTask);
        // When union is followed by a multi-table insert
        } else {
            GenMapRedUtils.splitPlan(op, ctx);
        }
    } else if (plan.getReduceWork() != null && plan.getReduceWork().getReducer() == reducer) {
        // The union is already initialized. However, the union is walked from
        // another input
        // initUnionPlan is idempotent
        GenMapRedUtils.initUnionPlan(op, union, ctx, unionTask);
    } else {
        GenMapRedUtils.joinUnionPlan(ctx, union, unionTask, reducerTask, false);
        ctx.setCurrTask(reducerTask);
    }
    mapCurrCtx.put(op, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
    // the union operator has been processed
    ctx.setCurrUnionOp(null);
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Task(org.apache.hadoop.hive.ql.exec.Task) Serializable(java.io.Serializable) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 30 with OperatorDesc

use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.

the class AbstractSMBJoinProc method convertBucketMapJoinToSMBJoin.

// Convert the bucket map-join operator to a sort-merge map join operator
protected SMBMapJoinOperator convertBucketMapJoinToSMBJoin(MapJoinOperator mapJoinOp, SortBucketJoinProcCtx smbJoinContext) {
    String[] srcs = smbJoinContext.getSrcs();
    SMBMapJoinOperator smbJop = new SMBMapJoinOperator(mapJoinOp);
    SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf());
    smbJop.setConf(smbJoinDesc);
    HashMap<Byte, String> tagToAlias = new HashMap<Byte, String>();
    for (int i = 0; i < srcs.length; i++) {
        tagToAlias.put((byte) i, srcs[i]);
    }
    smbJoinDesc.setTagToAlias(tagToAlias);
    int indexInListMapJoinNoReducer = this.pGraphContext.getListMapJoinOpsNoReducer().indexOf(mapJoinOp);
    if (indexInListMapJoinNoReducer >= 0) {
        this.pGraphContext.getListMapJoinOpsNoReducer().remove(indexInListMapJoinNoReducer);
        this.pGraphContext.getListMapJoinOpsNoReducer().add(indexInListMapJoinNoReducer, smbJop);
    }
    Map<String, DummyStoreOperator> aliasToSink = new HashMap<String, DummyStoreOperator>();
    // For all parents (other than the big table), insert a dummy store operator
    /* Consider a query like:
     *
     * select * from
     *   (subq1 --> has a filter)
     *   join
     *   (subq2 --> has a filter)
     * on some key
     *
     * Let us assume that subq1 is the small table (either specified by the user or inferred
     * automatically). The following operator tree will be created:
     *
     * TableScan (subq1) --> Select --> Filter --> DummyStore
     *                                                         \
     *                                                          \     SMBJoin
     *                                                          /
     *                                                         /
     * TableScan (subq2) --> Select --> Filter
     */
    List<Operator<? extends OperatorDesc>> parentOperators = mapJoinOp.getParentOperators();
    for (int i = 0; i < parentOperators.size(); i++) {
        Operator<? extends OperatorDesc> par = parentOperators.get(i);
        int index = par.getChildOperators().indexOf(mapJoinOp);
        par.getChildOperators().remove(index);
        if (i == smbJoinDesc.getPosBigTable()) {
            par.getChildOperators().add(index, smbJop);
        } else {
            DummyStoreOperator dummyStoreOp = new DummyStoreOperator(par.getCompilationOpContext());
            par.getChildOperators().add(index, dummyStoreOp);
            List<Operator<? extends OperatorDesc>> childrenOps = new ArrayList<Operator<? extends OperatorDesc>>();
            childrenOps.add(smbJop);
            dummyStoreOp.setChildOperators(childrenOps);
            List<Operator<? extends OperatorDesc>> parentOps = new ArrayList<Operator<? extends OperatorDesc>>();
            parentOps.add(par);
            dummyStoreOp.setParentOperators(parentOps);
            aliasToSink.put(srcs[i], dummyStoreOp);
            smbJop.getParentOperators().remove(i);
            smbJop.getParentOperators().add(i, dummyStoreOp);
        }
    }
    smbJoinDesc.setAliasToSink(aliasToSink);
    List<Operator<? extends OperatorDesc>> childOps = mapJoinOp.getChildOperators();
    for (int i = 0; i < childOps.size(); i++) {
        Operator<? extends OperatorDesc> child = childOps.get(i);
        int index = child.getParentOperators().indexOf(mapJoinOp);
        child.getParentOperators().remove(index);
        child.getParentOperators().add(index, smbJop);
    }
    // Data structures coming from QBJoinTree
    smbJop.getConf().setQBJoinTreeProps(mapJoinOp.getConf());
    //
    pGraphContext.getSmbMapJoinOps().add(smbJop);
    pGraphContext.getMapJoinOps().remove(mapJoinOp);
    return smbJop;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) SMBJoinDesc(org.apache.hadoop.hive.ql.plan.SMBJoinDesc) HashMap(java.util.HashMap) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) ArrayList(java.util.ArrayList) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Aggregations

OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)78 Operator (org.apache.hadoop.hive.ql.exec.Operator)65 ArrayList (java.util.ArrayList)47 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)41 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)36 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)32 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)30 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)29 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)22 SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator)21 Path (org.apache.hadoop.fs.Path)20 LinkedHashMap (java.util.LinkedHashMap)18 HashMap (java.util.HashMap)16 MapWork (org.apache.hadoop.hive.ql.plan.MapWork)16 TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)16 Serializable (java.io.Serializable)15 Task (org.apache.hadoop.hive.ql.exec.Task)15 List (java.util.List)14 Map (java.util.Map)13 GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator)13