Example 6 with MapredWork

Use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

From the class MapReduceCompiler, method setInputFormat.

// loop over all the tasks recursively
@Override
protected void setInputFormat(Task<? extends Serializable> task) {
    if (task instanceof ExecDriver) {
        MapWork work = ((MapredWork) task.getWork()).getMapWork();
        HashMap<String, Operator<? extends OperatorDesc>> opMap = work.getAliasToWork();
        if (!opMap.isEmpty()) {
            for (Operator<? extends OperatorDesc> op : opMap.values()) {
                setInputFormat(work, op);
            }
        }
    } else if (task instanceof ConditionalTask) {
        List<Task<? extends Serializable>> listTasks = ((ConditionalTask) task).getListTasks();
        for (Task<? extends Serializable> tsk : listTasks) {
            setInputFormat(tsk);
        }
    }
    if (task.getChildTasks() != null) {
        for (Task<? extends Serializable> childTask : task.getChildTasks()) {
            setInputFormat(childTask);
        }
    }
}
Also used: Serializable (java.io.Serializable), ArrayList (java.util.ArrayList), List (java.util.List), ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask), FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator), MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator), Operator (org.apache.hadoop.hive.ql.exec.Operator), ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator), TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), Task (org.apache.hadoop.hive.ql.exec.Task), UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator), ExecDriver (org.apache.hadoop.hive.ql.exec.mr.ExecDriver), MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask), GenMROperator (org.apache.hadoop.hive.ql.optimizer.GenMROperator), MapWork (org.apache.hadoop.hive.ql.plan.MapWork), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)
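
The traversal shape here is generic: handle a task, recurse into the alternative branches of any conditional task, then recurse into the ordinary children. A minimal, self-contained sketch of the same pattern follows; TaskNode is a hypothetical stand-in for Hive's Task hierarchy, not an actual Hive type.

import java.util.Arrays;
import java.util.Collections;
import java.util.List;

public class TaskWalker {

    // Hypothetical node: "branches" plays the role of ConditionalTask.getListTasks(),
    // "children" the role of Task.getChildTasks().
    static class TaskNode {
        final String name;
        final List<TaskNode> branches;
        final List<TaskNode> children;

        TaskNode(String name, List<TaskNode> branches, List<TaskNode> children) {
            this.name = name;
            this.branches = branches;
            this.children = children;
        }
    }

    // Mirrors setInputFormat above: handle the node, then recurse depth-first.
    static void visit(TaskNode task) {
        System.out.println("visiting " + task.name);
        for (TaskNode branch : task.branches) {
            visit(branch);
        }
        for (TaskNode child : task.children) {
            visit(child);
        }
    }

    public static void main(String[] args) {
        TaskNode leaf = new TaskNode("leaf", Collections.emptyList(), Collections.emptyList());
        TaskNode cond = new TaskNode("cond", Arrays.asList(leaf), Collections.emptyList());
        TaskNode root = new TaskNode("root", Collections.emptyList(), Arrays.asList(cond));
        visit(root); // prints root, cond, leaf
    }
}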

Example 7 with MapredWork

Use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

From the class SerializationUtilities, method clonePlan.

/**
   * Clones using the powers of XML. Do not use unless necessary.
   * @param plan The plan.
   * @return The clone.
   */
public static MapredWork clonePlan(MapredWork plan) {
    // TODO: need proper clone. Meanwhile, let's at least keep this horror in one place
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.CLONE_PLAN);
    Operator<?> op = plan.getAnyOperator();
    CompilationOpContext ctx = (op == null) ? null : op.getCompilationOpContext();
    ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
    serializePlan(plan, baos, true);
    MapredWork newPlan = deserializePlan(new ByteArrayInputStream(baos.toByteArray()), MapredWork.class, true);
    // Restore the context.
    for (Operator<?> newOp : newPlan.getAllOperators()) {
        newOp.setCompilationOpContext(ctx);
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.CLONE_PLAN);
    return newPlan;
}
Also used: ByteArrayInputStream (java.io.ByteArrayInputStream), ByteArrayOutputStream (java.io.ByteArrayOutputStream), CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext), PerfLogger (org.apache.hadoop.hive.ql.log.PerfLogger), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork)
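
Whatever codec serializePlan uses under the hood, the shape of clonePlan is a serialize/deserialize round trip plus re-attaching state that does not survive serialization (the CompilationOpContext). As a JDK-only sketch of the round-trip idea, here is a deep clone via java.beans.XMLEncoder and XMLDecoder; the Plan bean is hypothetical, not Hive's MapredWork.

import java.beans.XMLDecoder;
import java.beans.XMLEncoder;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;

public class XmlCloneSketch {

    // XMLEncoder requires a public class with a public no-arg constructor
    // and getter/setter pairs for the state to be cloned.
    public static class Plan {
        private String name = "";
        public String getName() { return name; }
        public void setName(String name) { this.name = name; }
    }

    // Deep-clone by round-tripping through XML: serialize to a byte buffer,
    // then deserialize a completely fresh object graph.
    public static Plan clonePlan(Plan plan) {
        ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
        try (XMLEncoder encoder = new XMLEncoder(baos)) {
            encoder.writeObject(plan);
        }
        try (XMLDecoder decoder = new XMLDecoder(new ByteArrayInputStream(baos.toByteArray()))) {
            return (Plan) decoder.readObject();
        }
    }

    public static void main(String[] args) {
        Plan plan = new Plan();
        plan.setName("query-plan");
        Plan copy = clonePlan(plan);
        System.out.println(copy.getName() + ", distinct object: " + (copy != plan));
    }
}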

Example 8 with MapredWork

Use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

From the class GenMRFileSink1, method process.

/**
   * File Sink Operator encountered.
   *
   * @param nd
   *          the file sink operator encountered
   * @param opProcCtx
   *          context
   */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    ParseContext parseCtx = ctx.getParseCtx();
    boolean chDir = false;
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    ctx.addRootIfPossible(currTask);
    FileSinkOperator fsOp = (FileSinkOperator) nd;
    // is INSERT OVERWRITE TABLE
    boolean isInsertTable = GenMapRedUtils.isInsertInto(parseCtx, fsOp);
    HiveConf hconf = parseCtx.getConf();
    // Mark this task as a final map reduce task (ignoring the optional merge task)
    ((MapredWork) currTask.getWork()).setFinalMapRed(true);
    // If this file sink desc has been processed due to a linked file sink desc,
    // use that task
    Map<FileSinkDesc, Task<? extends Serializable>> fileSinkDescs = ctx.getLinkedFileDescTasks();
    if (fileSinkDescs != null) {
        Task<? extends Serializable> childTask = fileSinkDescs.get(fsOp.getConf());
        processLinkedFileDesc(ctx, childTask);
        return true;
    }
    // If this file sink has already been seen (e.g. as part of a union or a map-join),
    // there is no need to attempt to merge the files again.
    if ((ctx.getSeenFileSinkOps() == null) || (!ctx.getSeenFileSinkOps().contains(nd))) {
        chDir = GenMapRedUtils.isMergeRequired(ctx.getMvTask(), hconf, fsOp, currTask, isInsertTable);
    }
    Path finalName = processFS(fsOp, stack, opProcCtx, chDir);
    if (chDir) {
        // Merge the files in the destination table/partitions by creating Map-only merge job
        // If underlying data is RCFile or OrcFile, RCFileBlockMerge task or
        // OrcFileStripeMerge task would be created.
        LOG.info("using CombineHiveInputformat for the merge job");
        GenMapRedUtils.createMRWorkForMergingFiles(fsOp, finalName, ctx.getDependencyTaskForMultiInsert(), ctx.getMvTask(), hconf, currTask);
    }
    FileSinkDesc fileSinkDesc = fsOp.getConf();
    if (fileSinkDesc.isLinkedFileSink()) {
        Map<FileSinkDesc, Task<? extends Serializable>> linkedFileDescTasks = ctx.getLinkedFileDescTasks();
        if (linkedFileDescTasks == null) {
            linkedFileDescTasks = new HashMap<FileSinkDesc, Task<? extends Serializable>>();
            ctx.setLinkedFileDescTasks(linkedFileDescTasks);
        }
        // The child tasks may be null in case of a select
        if ((currTask.getChildTasks() != null) && (currTask.getChildTasks().size() == 1)) {
            for (FileSinkDesc fileDesc : fileSinkDesc.getLinkedFileSinkDesc()) {
                linkedFileDescTasks.put(fileDesc, currTask.getChildTasks().get(0));
            }
        }
    }
    FetchTask fetchTask = parseCtx.getFetchTask();
    if (fetchTask != null && currTask.getNumChild() == 0) {
        if (fetchTask.isFetchFrom(fileSinkDesc)) {
            currTask.setFetchSource(true);
        }
    }
    return true;
}
Also used: Serializable (java.io.Serializable), Path (org.apache.hadoop.fs.Path), HiveConf (org.apache.hadoop.hive.conf.HiveConf), FetchTask (org.apache.hadoop.hive.ql.exec.FetchTask), FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator), Task (org.apache.hadoop.hive.ql.exec.Task), ParseContext (org.apache.hadoop.hive.ql.parse.ParseContext), FileSinkDesc (org.apache.hadoop.hive.ql.plan.FileSinkDesc), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork)
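
A detail worth noting is the lazily created linkedFileDescTasks map, which points every linked file sink descriptor at the one shared child task so that later visits can short-circuit through processLinkedFileDesc. A stripped-down sketch of that registry pattern, with hypothetical Desc and Task placeholders:

import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class LinkedDescRegistry {
    static class Desc { }
    static class Task { }

    // Created on first use, like ctx.getLinkedFileDescTasks() above.
    private Map<Desc, Task> linkedDescTasks;

    // Point every linked descriptor at the same child task, so a later visit
    // to any of them can take the "already processed" branch in process().
    void registerLinked(List<Desc> linkedDescs, Task childTask) {
        if (linkedDescTasks == null) {
            linkedDescTasks = new HashMap<>();
        }
        for (Desc desc : linkedDescs) {
            linkedDescTasks.put(desc, childTask);
        }
    }

    // Returns the shared task if this descriptor was already linked, else null.
    Task lookup(Desc desc) {
        return linkedDescTasks == null ? null : linkedDescTasks.get(desc);
    }
}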

Example 9 with MapredWork

Use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

From the class GenMRRedSink1, method process.

/**
   * Reduce Sink encountered.
   * a) If we are seeing this RS for the first time, we initialize the plan corresponding to this RS.
   * b) If we are seeing this RS for the second or a later time, then either the query had a join,
   *    in which case we merge this plan with the earlier plan involving this RS, or the plan for
   *    this RS needs to be split into two branches.
   *
   * @param nd
   *          the reduce sink operator encountered
   * @param opProcCtx
   *          context
   */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    ReduceSinkOperator op = (ReduceSinkOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    GenMapRedCtx mapredCtx = mapCurrCtx.get(stack.get(stack.size() - 2));
    Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
    MapredWork currPlan = (MapredWork) currTask.getWork();
    String currAliasId = mapredCtx.getCurrAliasId();
    if (op.getNumChild() != 1) {
        throw new IllegalStateException("Expecting operator " + op + " to have one child. " + "But found multiple children : " + op.getChildOperators());
    }
    Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
    Task<? extends Serializable> oldTask = ctx.getOpTaskMap().get(reducer);
    ctx.setCurrAliasId(currAliasId);
    ctx.setCurrTask(currTask);
    // If the plan for this reducer does not exist, initialize the plan
    if (oldTask == null) {
        if (currPlan.getReduceWork() == null) {
            GenMapRedUtils.initPlan(op, ctx);
        } else {
            GenMapRedUtils.splitPlan(op, ctx);
        }
    } else {
        // This will happen in case of joins. The current plan can be thrown away
        // after being merged with the original plan
        GenMapRedUtils.joinPlan(currTask, oldTask, ctx);
        currTask = oldTask;
        ctx.setCurrTask(currTask);
    }
    mapCurrCtx.put(op, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
    if (GenMapRedUtils.hasBranchFinished(nodeOutputs)) {
        ctx.addRootIfPossible(currTask);
        return false;
    }
    return true;
}
Also used: Operator (org.apache.hadoop.hive.ql.exec.Operator), ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator), GenMapRedCtx (org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)
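
The branching above hinges on memoizing reducer-to-task assignments in opTaskMap: the first visit to a reducer initializes (or splits off) a plan, and any later visit joins with and reuses the memoized task. A minimal sketch of that memoization, with a String key standing in for the reducer operator:

import java.util.HashMap;
import java.util.Map;

public class ReducerTaskMemo {
    static class Task {
        final String id;
        Task(String id) { this.id = id; }
    }

    private final Map<String, Task> opTaskMap = new HashMap<>();
    private int nextId = 0;

    // First encounter: create a task for the reducer (the initPlan/splitPlan branch).
    // Later encounters: return the memoized task (the joinPlan branch).
    Task taskFor(String reducer) {
        return opTaskMap.computeIfAbsent(reducer, r -> new Task("task-" + nextId++));
    }

    public static void main(String[] args) {
        ReducerTaskMemo memo = new ReducerTaskMemo();
        Task first = memo.taskFor("reducer-A");
        Task again = memo.taskFor("reducer-A");
        System.out.println(first == again); // true: the plan is merged, not rebuilt
    }
}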

Example 10 with MapredWork

Use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

From the class GenMRRedSink3, method process.

/**
   * Reduce Scan encountered.
   *
   * @param nd
   *          the reduce sink operator encountered
   * @param opProcCtx
   *          context
   */
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    ReduceSinkOperator op = (ReduceSinkOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    // The union consisted of a bunch of map-reduce jobs, and it has been split at the union.
    Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
    UnionOperator union = Utils.findNode(stack, UnionOperator.class);
    assert union != null;
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    GenMapRedCtx mapredCtx = mapCurrCtx.get(union);
    Task<? extends Serializable> unionTask = null;
    if (mapredCtx != null) {
        unionTask = mapredCtx.getCurrTask();
    } else {
        unionTask = ctx.getCurrTask();
    }
    MapredWork plan = (MapredWork) unionTask.getWork();
    HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = ctx.getOpTaskMap();
    Task<? extends Serializable> reducerTask = opTaskMap.get(reducer);
    ctx.setCurrTask(unionTask);
    // If the plan for this reducer does not exist, initialize the plan
    if (reducerTask == null) {
        // When the reducer is encountered for the first time
        if (plan.getReduceWork() == null) {
            GenMapRedUtils.initUnionPlan(op, union, ctx, unionTask);
        // When union is followed by a multi-table insert
        } else {
            GenMapRedUtils.splitPlan(op, ctx);
        }
    } else if (plan.getReduceWork() != null && plan.getReduceWork().getReducer() == reducer) {
        // The union is already initialized. However, the union is walked from
        // another input
        // initUnionPlan is idempotent
        GenMapRedUtils.initUnionPlan(op, union, ctx, unionTask);
    } else {
        GenMapRedUtils.joinUnionPlan(ctx, union, unionTask, reducerTask, false);
        ctx.setCurrTask(reducerTask);
    }
    mapCurrCtx.put(op, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
    // the union operator has been processed
    ctx.setCurrUnionOp(null);
    return true;
}
Also used: Serializable (java.io.Serializable), Operator (org.apache.hadoop.hive.ql.exec.Operator), ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator), Task (org.apache.hadoop.hive.ql.exec.Task), UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator), GenMapRedCtx (org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)
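
The comment "initUnionPlan is idempotent" carries the key invariant: re-initializing the union plan from another input branch must be a harmless no-op. A tiny sketch of such an initializer; the reducer field is a hypothetical stand-in for ReduceWork's reducer, compared by reference as in the code above:

public class IdempotentUnionInit {
    private Object reducer; // stands in for ReduceWork.getReducer()

    // Safe to call once per input branch of the union: the first call sets the
    // reducer, a repeat call with the same reducer does nothing, and a
    // conflicting reducer signals that the plans must be joined instead.
    void initUnionPlan(Object candidate) {
        if (reducer == null) {
            reducer = candidate;
        } else if (reducer != candidate) {
            throw new IllegalStateException("different reducer: join the plans instead");
        }
    }
}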

Aggregations

MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 43 usages
Path (org.apache.hadoop.fs.Path): 17 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 13 usages
Serializable (java.io.Serializable): 12 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 12 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 12 usages
Task (org.apache.hadoop.hive.ql.exec.Task): 12 usages
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask): 12 usages
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 12 usages
ArrayList (java.util.ArrayList): 10 usages
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 10 usages
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 10 usages
LinkedHashMap (java.util.LinkedHashMap): 9 usages
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask): 9 usages
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 9 usages
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 8 usages
ParseContext (org.apache.hadoop.hive.ql.parse.ParseContext): 8 usages
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 7 usages
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 7 usages
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 7 usages