Example 6 with ConditionalTask

use of org.apache.hadoop.hive.ql.exec.ConditionalTask in project hive by apache.

the class GenMapRedUtils method createMRWorkForMergingFiles.

/**
 * @param fsInput the FileSink operator
 * @param finalName the final destination path the merge job should output
 * @param dependencyTask a dependency task that may be linked to the conditional sub-tasks
 * @param mvTasks the move tasks that may consume the FileSink output
 * @param conf HiveConf
 * @param currTask the current leaf task
 * @param lineageState the LineageState used to track activity
 * @throws SemanticException
 *
 * Create a map-only merge job using CombineHiveInputFormat for all partitions, with the
 * following operators:
 *          MR job J0:
 *          ...
 *          |
 *          v
 *          FileSinkOperator_1 (fsInput)
 *          |
 *          v
 *          Merge job J1:
 *          |
 *          v
 *          TableScan (using CombineHiveInputFormat) (tsMerge)
 *          |
 *          v
 *          FileSinkOperator (fsMerge)
 *
 *          Here the pathToPartitionInfo & pathToAlias remain the same, which means the paths
 *          do not contain the dynamic partitions (only their parent). So after the dynamic
 *          partitions are created (after the first job has finished, before the moveTask or
 *          ConditionalTask starts), we need to change pathToPartitionInfo & pathToAlias to
 *          include the dynamic partition directories.
 */
public static void createMRWorkForMergingFiles(FileSinkOperator fsInput, Path finalName, DependencyCollectionTask dependencyTask, List<Task<MoveWork>> mvTasks, HiveConf conf, Task<? extends Serializable> currTask, LineageState lineageState) throws SemanticException {
    // 
    // 1. create the operator tree
    // 
    FileSinkDesc fsInputDesc = fsInput.getConf();
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
        Utilities.FILE_OP_LOGGER.trace("Creating merge work from " + System.identityHashCode(fsInput) + " with write ID " + (fsInputDesc.isMmTable() ? fsInputDesc.getTableWriteId() : null) + " into " + finalName);
    }
    boolean isBlockMerge = (conf.getBoolVar(ConfVars.HIVEMERGERCFILEBLOCKLEVEL) && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(RCFileInputFormat.class)) || (conf.getBoolVar(ConfVars.HIVEMERGEORCFILESTRIPELEVEL) && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(OrcInputFormat.class));
    RowSchema inputRS = fsInput.getSchema();
    Long srcMmWriteId = fsInputDesc.isMmTable() ? fsInputDesc.getTableWriteId() : null;
    FileSinkDesc fsOutputDesc = null;
    TableScanOperator tsMerge = null;
    if (!isBlockMerge) {
        // Create a TableScan operator
        tsMerge = GenMapRedUtils.createTemporaryTableScanOperator(fsInput.getCompilationOpContext(), inputRS);
        // Create a FileSink operator
        TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
        Path mergeDest = srcMmWriteId == null ? finalName : finalName.getParent();
        fsOutputDesc = new FileSinkDesc(mergeDest, ts, conf.getBoolVar(ConfVars.COMPRESSRESULT));
        fsOutputDesc.setMmWriteId(srcMmWriteId);
        fsOutputDesc.setIsMerge(true);
        // Create and attach the filesink for the merge.
        OperatorFactory.getAndMakeChild(fsOutputDesc, inputRS, tsMerge);
    }
    // If dynamic partitioning is enabled for the input FileSinkOperator, the tsMerge input
    // schema needs to include the partition columns, and the fsOutput should have a
    // DynamicPartitionCtx to indicate that it needs to be dynamically partitioned.
    DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
    if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
        // adding DP ColumnInfo to the RowSchema signature
        ArrayList<ColumnInfo> signature = inputRS.getSignature();
        String tblAlias = fsInputDesc.getTableInfo().getTableName();
        for (String dpCol : dpCtx.getDPColNames()) {
            ColumnInfo colInfo = new ColumnInfo(dpCol,
                TypeInfoFactory.stringTypeInfo, // all partition column types should be string
                tblAlias,
                true); // partition columns are virtual columns
            signature.add(colInfo);
        }
        inputRS.setSignature(signature);
        if (!isBlockMerge) {
            // create another DynamicPartitionCtx, which has a different input-to-DP column mapping
            DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
            fsOutputDesc.setDynPartCtx(dpCtx2);
        }
        // update the FileSinkOperator to include partition columns
        usePartitionColumns(fsInputDesc.getTableInfo().getProperties(), dpCtx.getDPColNames());
    } else {
        // non-partitioned table
        fsInputDesc.getTableInfo().getProperties().remove(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);
    }
    // 
    // 2. Constructing a conditional task consisting of a move task and a map reduce task
    // 
    Path inputDirName = fsInputDesc.getMergeInputDirName();
    MapWork cplan;
    Serializable work;
    if (isBlockMerge) {
        cplan = GenMapRedUtils.createMergeTask(fsInputDesc, finalName, dpCtx != null && dpCtx.getNumDPCols() > 0, fsInput.getCompilationOpContext());
        if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
            work = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
            cplan.setName("File Merge");
            ((TezWork) work).add(cplan);
        } else if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
            work = new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
            cplan.setName("Spark Merge File Work");
            ((SparkWork) work).add(cplan);
        } else {
            work = cplan;
        }
    } else {
        cplan = createMRWorkForMergingFiles(conf, tsMerge, fsInputDesc);
        if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
            work = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
            cplan.setName("File Merge");
            ((TezWork) work).add(cplan);
        } else if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
            work = new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
            cplan.setName("Spark Merge File Work");
            ((SparkWork) work).add(cplan);
        } else {
            work = new MapredWork();
            ((MapredWork) work).setMapWork(cplan);
        }
    }
    // use CombineHiveInputFormat for map-only merging
    cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
    // NOTE: we should gather stats in MR1 rather than MR2 of the merge job, since we don't
    // know whether merge MR2 will be triggered at execution time
    MoveWork dummyMv = null;
    if (srcMmWriteId == null) {
        // Only create the MoveWork for a non-MM table. No action is needed for an MM table.
        dummyMv = new MoveWork(null, null, null, new LoadFileDesc(inputDirName, finalName, true, null, null, false), false);
    }
    // Use the original fsOp path here in case of MM - while the new FSOP merges files inside the
    // MM directory, the original MoveTask still commits based on the parent. Note that this path
    // can only be triggered for a merge that's part of insert for now; MM tables do not support
    // concatenate. Keeping the old logic for non-MM tables with temp directories and stuff.
    Path fsopPath = srcMmWriteId != null ? fsInputDesc.getFinalDirName() : finalName;
    Task<MoveWork> mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fsopPath, fsInputDesc.isMmTable());
    ConditionalTask cndTsk = GenMapRedUtils.createCondTask(conf, currTask, dummyMv, work, fsInputDesc.getMergeInputDirName(), finalName, mvTask, dependencyTask, lineageState);
    // keep the dynamic partition context in conditional task resolver context
    ConditionalResolverMergeFilesCtx mrCtx = (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
    mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());
    mrCtx.setLbCtx(fsInputDesc.getLbCtx());
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Serializable(java.io.Serializable) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) FileSinkDesc(org.apache.hadoop.hive.ql.plan.FileSinkDesc) DynamicPartitionCtx(org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ConditionalResolverMergeFilesCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) RCFileInputFormat(org.apache.hadoop.hive.ql.io.RCFileInputFormat) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) TezWork(org.apache.hadoop.hive.ql.plan.TezWork)
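
The isBlockMerge decision above gates whether the merge becomes an ORC/RCFile block-level merge or a generic map-only merge. A minimal standalone sketch (not Hive code; the class name MergeLevelCheck is made up for illustration) reproducing just that check from the configuration flags and input-format classes used in the example:

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
import org.apache.hadoop.hive.ql.io.RCFileInputFormat;
import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;

public class MergeLevelCheck {
    // Mirrors the isBlockMerge condition from createMRWorkForMergingFiles.
    static boolean isBlockMerge(HiveConf conf, Class<?> inputFormatClass) {
        return (conf.getBoolVar(ConfVars.HIVEMERGERCFILEBLOCKLEVEL)
                    && inputFormatClass.equals(RCFileInputFormat.class))
                || (conf.getBoolVar(ConfVars.HIVEMERGEORCFILESTRIPELEVEL)
                    && inputFormatClass.equals(OrcInputFormat.class));
    }

    public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        // ORC tables get a stripe-level merge only if hive.merge.orcfile.stripe.level is on.
        System.out.println(isBlockMerge(conf, OrcInputFormat.class));
    }
}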

Example 7 with ConditionalTask

use of org.apache.hadoop.hive.ql.exec.ConditionalTask in project hive by apache.

the class GenMapRedUtils method createCondTask.

/**
 * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
 *
 * @param conf
 *          HiveConf
 * @param currTask
 *          current leaf task
 * @param mvWork
 *          MoveWork for the move task
 * @param mergeWork
 *          the merge work (e.g. MapredWork, TezWork, or SparkWork) for the merge task
 * @param condInputPath
 *          the input directory of the merge/move task
 * @param condOutputPath
 *          the output directory of the merge/move task
 * @param moveTaskToLink
 *          a MoveTask that may be linked to the conditional sub-tasks
 * @param dependencyTask
 *          a dependency task that may be linked to the conditional sub-tasks
 * @param lineageState
 *          to track activity
 * @return The conditional task
 */
@SuppressWarnings("unchecked")
private static ConditionalTask createCondTask(HiveConf conf, Task<? extends Serializable> currTask, MoveWork mvWork, Serializable mergeWork, Path condInputPath, Path condOutputPath, Task<MoveWork> moveTaskToLink, DependencyCollectionTask dependencyTask, LineageState lineageState) {
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
        Utilities.FILE_OP_LOGGER.trace("Creating conditional merge task for " + condInputPath);
    }
    // Create a dummy task if no move is needed.
    Serializable moveWork = mvWork != null ? mvWork : new DependencyCollectionWork();
    // Note: this should never happen for mm tables.
    boolean shouldMergeMovePaths = (moveTaskToLink != null && dependencyTask == null && shouldMergeMovePaths(conf, condInputPath, condOutputPath, moveTaskToLink.getWork()));
    Serializable workForMoveOnlyTask = moveWork;
    if (shouldMergeMovePaths) {
        workForMoveOnlyTask = mergeMovePaths(condInputPath, moveTaskToLink.getWork(), lineageState);
    }
    // There are 3 options for this ConditionalTask:
    // 1) Merge the partitions
    // 2) Move the partitions (i.e. don't merge the partitions)
    // 3) Merge some partitions and move other partitions (i.e. merge some partitions and don't
    // merge others) in this case the merge is done first followed by the move to prevent
    // conflicts.
    // TODO: if we are not dealing with concatenate DDL, we should not create a merge+move path
    // because it should be impossible to get incompatible outputs.
    Task<? extends Serializable> mergeOnlyMergeTask = TaskFactory.get(mergeWork);
    Task<? extends Serializable> moveOnlyMoveTask = TaskFactory.get(workForMoveOnlyTask);
    Task<? extends Serializable> mergeAndMoveMergeTask = TaskFactory.get(mergeWork);
    Task<? extends Serializable> mergeAndMoveMoveTask = TaskFactory.get(moveWork);
    // NOTE! It is necessary that the merge task is the parent of the move task, and not
    // the other way around, for the proper execution of the execute method of
    // ConditionalTask.
    mergeAndMoveMergeTask.addDependentTask(mergeAndMoveMoveTask);
    List<Serializable> listWorks = new ArrayList<Serializable>();
    listWorks.add(workForMoveOnlyTask);
    listWorks.add(mergeWork);
    ConditionalWork cndWork = new ConditionalWork(listWorks);
    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
    listTasks.add(moveOnlyMoveTask);
    listTasks.add(mergeOnlyMergeTask);
    listTasks.add(mergeAndMoveMergeTask);
    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
    cndTsk.setListTasks(listTasks);
    // create resolver
    cndTsk.setResolver(new ConditionalResolverMergeFiles());
    ConditionalResolverMergeFilesCtx mrCtx = new ConditionalResolverMergeFilesCtx(listTasks, condInputPath.toString());
    cndTsk.setResolverCtx(mrCtx);
    // make the conditional task the child of the current leaf task
    currTask.addDependentTask(cndTsk);
    if (shouldMergeMovePaths) {
        // If a new MoveWork was created, then link all dependent tasks of the MoveTask being linked (moveTaskToLink).
        if (moveTaskToLink.getDependentTasks() != null) {
            for (Task dependentTask : moveTaskToLink.getDependentTasks()) {
                moveOnlyMoveTask.addDependentTask(dependentTask);
            }
        }
    } else {
        addDependentMoveTasks(moveTaskToLink, conf, moveOnlyMoveTask, dependencyTask);
    }
    addDependentMoveTasks(moveTaskToLink, conf, mergeOnlyMergeTask, dependencyTask);
    addDependentMoveTasks(moveTaskToLink, conf, mergeAndMoveMoveTask, dependencyTask);
    return cndTsk;
}
Also used : ConditionalResolverMergeFiles(org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles) Serializable(java.io.Serializable) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) DependencyCollectionTask(org.apache.hadoop.hive.ql.exec.DependencyCollectionTask) DependencyCollectionWork(org.apache.hadoop.hive.ql.plan.DependencyCollectionWork) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) ArrayList(java.util.ArrayList) ConditionalWork(org.apache.hadoop.hive.ql.plan.ConditionalWork) ConditionalResolverMergeFilesCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx)
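
createCondTask wires its candidate branches into a single ConditionalTask through a shared ConditionalWork, and a resolver picks which branch actually runs at execution time. A minimal sketch (assumed standalone; CondTaskSketch is a made-up harness, and DependencyCollectionWork stands in as placeholder work for two branches) showing the same ConditionalWork/listTasks wiring pattern used in the example:

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.hive.ql.exec.ConditionalTask;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.plan.ConditionalWork;
import org.apache.hadoop.hive.ql.plan.DependencyCollectionWork;

public class CondTaskSketch {
    public static void main(String[] args) {
        // Placeholder work objects; in createCondTask these are the move work and merge work.
        Serializable branchA = new DependencyCollectionWork();
        Serializable branchB = new DependencyCollectionWork();

        List<Serializable> listWorks = new ArrayList<>();
        listWorks.add(branchA);
        listWorks.add(branchB);
        ConditionalWork cndWork = new ConditionalWork(listWorks);

        // One candidate task per branch; the conditional task holds all of them.
        List<Task<? extends Serializable>> listTasks = new ArrayList<>();
        listTasks.add(TaskFactory.get(branchA));
        listTasks.add(TaskFactory.get(branchB));

        ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
        cndTsk.setListTasks(listTasks);
        // A ConditionalResolver (e.g. ConditionalResolverMergeFiles) would normally be set
        // here to choose which of the candidate branches runs.
        System.out.println(cndTsk.getListTasks().size() + " candidate branches");
    }
}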

Example 8 with ConditionalTask

use of org.apache.hadoop.hive.ql.exec.ConditionalTask in project hive by apache.

the class Executor method launchTask.

private TaskRunner launchTask(Task<?> task, boolean noName, String jobName, int jobCount) throws HiveException {
    SessionState.get().getHiveHistory().startTask(driverContext.getQueryId(), task, task.getClass().getName());
    if (task.isMapRedTask() && !(task instanceof ConditionalTask)) {
        if (noName) {
            driverContext.getConf().set(MRJobConfig.JOB_NAME, jobName + " (" + task.getId() + ")");
        }
        taskQueue.incCurJobNo(1);
        CONSOLE.printInfo("Launching Job " + taskQueue.getCurJobNo() + " out of " + jobCount);
    }
    task.initialize(driverContext.getQueryState(), driverContext.getPlan(), taskQueue, context);
    TaskRunner taskRun = new TaskRunner(task, taskQueue);
    taskQueue.launching(taskRun);
    if (HiveConf.getBoolVar(task.getConf(), HiveConf.ConfVars.EXECPARALLEL) && task.canExecuteInParallel()) {
        LOG.info("Starting task [" + task + "] in parallel");
        taskRun.start();
    } else {
        LOG.info("Starting task [" + task + "] in serial mode");
        taskRun.runSequential();
    }
    return taskRun;
}
Also used : ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) TaskRunner(org.apache.hadoop.hive.ql.exec.TaskRunner)
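
Whether launchTask starts the TaskRunner on its own thread or runs it inline is controlled by hive.exec.parallel (HiveConf.ConfVars.EXECPARALLEL) together with the task's own canExecuteInParallel(). A minimal sketch (assumed standalone; ParallelLaunchCheck is a made-up harness) showing how that flag is read and toggled:

import org.apache.hadoop.hive.conf.HiveConf;

public class ParallelLaunchCheck {
    public static void main(String[] args) {
        HiveConf conf = new HiveConf();
        // hive.exec.parallel defaults to false; enable it to allow parallel task launch.
        conf.setBoolVar(HiveConf.ConfVars.EXECPARALLEL, true);

        boolean parallel = HiveConf.getBoolVar(conf, HiveConf.ConfVars.EXECPARALLEL);
        System.out.println(parallel
                ? "eligible tasks may be started in parallel threads"
                : "tasks are run sequentially");
    }
}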

Example 9 with ConditionalTask

use of org.apache.hadoop.hive.ql.exec.ConditionalTask in project hive by apache.

the class QueryPlan method populateQueryPlan.

/**
 * Populate api.QueryPlan from exec structures. This includes constructing the
 * dependency graphs of stages and operators.
 *
 * @throws IOException
 */
private void populateQueryPlan() throws IOException {
    query.setStageGraph(new org.apache.hadoop.hive.ql.plan.api.Graph());
    query.getStageGraph().setNodeType(NodeType.STAGE);
    Queue<Task<?>> tasksToVisit = new LinkedList<Task<?>>();
    Set<Task<?>> tasksVisited = new HashSet<Task<?>>();
    tasksToVisit.addAll(rootTasks);
    while (tasksToVisit.size() != 0) {
        Task<?> task = tasksToVisit.remove();
        tasksVisited.add(task);
        // populate stage
        org.apache.hadoop.hive.ql.plan.api.Stage stage = new org.apache.hadoop.hive.ql.plan.api.Stage();
        stage.setStageId(task.getId());
        stage.setStageType(task.getType());
        query.addToStageList(stage);
        if (task instanceof ExecDriver) {
            // populate map task
            ExecDriver mrTask = (ExecDriver) task;
            org.apache.hadoop.hive.ql.plan.api.Task mapTask = new org.apache.hadoop.hive.ql.plan.api.Task();
            mapTask.setTaskId(stage.getStageId() + "_MAP");
            mapTask.setTaskType(TaskType.MAP);
            stage.addToTaskList(mapTask);
            populateOperatorGraph(mapTask, mrTask.getWork().getMapWork().getAliasToWork().values());
            // populate reduce task
            if (mrTask.hasReduce()) {
                org.apache.hadoop.hive.ql.plan.api.Task reduceTask = new org.apache.hadoop.hive.ql.plan.api.Task();
                reduceTask.setTaskId(stage.getStageId() + "_REDUCE");
                reduceTask.setTaskType(TaskType.REDUCE);
                stage.addToTaskList(reduceTask);
                Collection<Operator<? extends OperatorDesc>> reducerTopOps = new ArrayList<Operator<? extends OperatorDesc>>();
                reducerTopOps.add(mrTask.getWork().getReduceWork().getReducer());
                populateOperatorGraph(reduceTask, reducerTopOps);
            }
        } else {
            org.apache.hadoop.hive.ql.plan.api.Task otherTask = new org.apache.hadoop.hive.ql.plan.api.Task();
            otherTask.setTaskId(stage.getStageId() + "_OTHER");
            otherTask.setTaskType(TaskType.OTHER);
            stage.addToTaskList(otherTask);
        }
        if (task instanceof ConditionalTask) {
            org.apache.hadoop.hive.ql.plan.api.Adjacency listEntry = new org.apache.hadoop.hive.ql.plan.api.Adjacency();
            listEntry.setAdjacencyType(AdjacencyType.DISJUNCTIVE);
            listEntry.setNode(task.getId());
            ConditionalTask t = (ConditionalTask) task;
            for (Task<?> listTask : t.getListTasks()) {
                if (t.getChildTasks() != null) {
                    org.apache.hadoop.hive.ql.plan.api.Adjacency childEntry = new org.apache.hadoop.hive.ql.plan.api.Adjacency();
                    childEntry.setAdjacencyType(AdjacencyType.DISJUNCTIVE);
                    childEntry.setNode(listTask.getId());
                    // done processing the task
                    for (Task<?> childTask : t.getChildTasks()) {
                        childEntry.addToChildren(childTask.getId());
                        if (!tasksVisited.contains(childTask)) {
                            tasksToVisit.add(childTask);
                        }
                    }
                    query.getStageGraph().addToAdjacencyList(childEntry);
                }
                listEntry.addToChildren(listTask.getId());
                if (!tasksVisited.contains(listTask)) {
                    tasksToVisit.add(listTask);
                }
            }
            query.getStageGraph().addToAdjacencyList(listEntry);
        } else if (task.getChildTasks() != null) {
            org.apache.hadoop.hive.ql.plan.api.Adjacency entry = new org.apache.hadoop.hive.ql.plan.api.Adjacency();
            entry.setAdjacencyType(AdjacencyType.CONJUNCTIVE);
            entry.setNode(task.getId());
            // done processing the task
            for (Task<?> childTask : task.getChildTasks()) {
                entry.addToChildren(childTask.getId());
                if (!tasksVisited.contains(childTask)) {
                    tasksToVisit.add(childTask);
                }
            }
            query.getStageGraph().addToAdjacencyList(entry);
        }
    }
}
Also used : Operator(org.apache.hadoop.hive.ql.exec.Operator) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask) ExplainTask(org.apache.hadoop.hive.ql.exec.ExplainTask) ArrayList(java.util.ArrayList) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) HashSet(java.util.HashSet) LinkedList(java.util.LinkedList) ExecDriver(org.apache.hadoop.hive.ql.exec.mr.ExecDriver) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
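
populateQueryPlan walks the task DAG breadth-first and records edges as Adjacency entries, marking the alternative branches of a ConditionalTask as DISJUNCTIVE and ordinary parent-child edges as CONJUNCTIVE. A minimal sketch (assumed standalone; TinyStageGraph and the stage ids are made up) that builds one such adjacency with the same api calls used above:

import org.apache.hadoop.hive.ql.plan.api.Adjacency;
import org.apache.hadoop.hive.ql.plan.api.AdjacencyType;
import org.apache.hadoop.hive.ql.plan.api.Graph;
import org.apache.hadoop.hive.ql.plan.api.NodeType;

public class TinyStageGraph {
    public static void main(String[] args) {
        Graph stageGraph = new Graph();
        stageGraph.setNodeType(NodeType.STAGE);

        // A conditional stage with two alternative children, as in the ConditionalTask branch.
        Adjacency listEntry = new Adjacency();
        listEntry.setAdjacencyType(AdjacencyType.DISJUNCTIVE);
        listEntry.setNode("Stage-1");
        listEntry.addToChildren("Stage-2");
        listEntry.addToChildren("Stage-3");

        stageGraph.addToAdjacencyList(listEntry);
        System.out.println(stageGraph);
    }
}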

Example 10 with ConditionalTask

use of org.apache.hadoop.hive.ql.exec.ConditionalTask in project hive by apache.

the class GenMapRedUtils method createMRWorkForMergingFiles.

/**
 * @param fsInput the FileSink operator
 * @param finalName the final destination path the merge job should output
 * @param dependencyTask a dependency task that may be linked to the conditional sub-tasks
 * @param mvTasks the move tasks that may consume the FileSink output
 * @param conf HiveConf
 * @param currTask the current leaf task
 * @param lineageState the LineageState used to track activity
 * @throws SemanticException
 *
 * Create a map-only merge job using CombineHiveInputFormat for all partitions, with the
 * following operators:
 *          MR job J0:
 *          ...
 *          |
 *          v
 *          FileSinkOperator_1 (fsInput)
 *          |
 *          v
 *          Merge job J1:
 *          |
 *          v
 *          TableScan (using CombineHiveInputFormat) (tsMerge)
 *          |
 *          v
 *          FileSinkOperator (fsMerge)
 *
 *          Here the pathToPartitionInfo & pathToAlias remain the same, which means the paths
 *          do not contain the dynamic partitions (only their parent). So after the dynamic
 *          partitions are created (after the first job has finished, before the moveTask or
 *          ConditionalTask starts), we need to change pathToPartitionInfo & pathToAlias to
 *          include the dynamic partition directories.
 */
public static void createMRWorkForMergingFiles(FileSinkOperator fsInput, Path finalName, DependencyCollectionTask dependencyTask, List<Task<MoveWork>> mvTasks, HiveConf conf, Task<?> currTask, LineageState lineageState) throws SemanticException {
    // 
    // 1. create the operator tree
    // 
    FileSinkDesc fsInputDesc = fsInput.getConf();
    if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
        Utilities.FILE_OP_LOGGER.trace("Creating merge work from " + System.identityHashCode(fsInput) + " with write ID " + (fsInputDesc.isMmTable() ? fsInputDesc.getTableWriteId() : null) + " into " + finalName);
    }
    boolean isBlockMerge = (conf.getBoolVar(ConfVars.HIVEMERGERCFILEBLOCKLEVEL) && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(RCFileInputFormat.class)) || (conf.getBoolVar(ConfVars.HIVEMERGEORCFILESTRIPELEVEL) && fsInputDesc.getTableInfo().getInputFileFormatClass().equals(OrcInputFormat.class));
    RowSchema inputRS = fsInput.getSchema();
    Long srcMmWriteId = fsInputDesc.isMmTable() ? fsInputDesc.getTableWriteId() : null;
    FileSinkDesc fsOutputDesc = null;
    TableScanOperator tsMerge = null;
    if (!isBlockMerge) {
        // Create a TableScan operator
        tsMerge = GenMapRedUtils.createTemporaryTableScanOperator(fsInput.getCompilationOpContext(), inputRS);
        // Create a FileSink operator
        TableDesc ts = (TableDesc) fsInputDesc.getTableInfo().clone();
        Path mergeDest = srcMmWriteId == null ? finalName : finalName.getParent();
        fsOutputDesc = new FileSinkDesc(mergeDest, ts, conf.getBoolVar(ConfVars.COMPRESSRESULT));
        fsOutputDesc.setMmWriteId(srcMmWriteId);
        fsOutputDesc.setIsMerge(true);
        // Create and attach the filesink for the merge.
        OperatorFactory.getAndMakeChild(fsOutputDesc, inputRS, tsMerge);
    }
    // If dynamic partitioning is enabled for the input FileSinkOperator, the tsMerge input
    // schema needs to include the partition columns, and the fsOutput should have a
    // DynamicPartitionCtx to indicate that it needs to be dynamically partitioned.
    DynamicPartitionCtx dpCtx = fsInputDesc.getDynPartCtx();
    if (dpCtx != null && dpCtx.getNumDPCols() > 0) {
        // adding DP ColumnInfo to the RowSchema signature
        List<ColumnInfo> signature = inputRS.getSignature();
        String tblAlias = fsInputDesc.getTableInfo().getTableName();
        for (String dpCol : dpCtx.getDPColNames()) {
            ColumnInfo colInfo = new ColumnInfo(dpCol,
                TypeInfoFactory.stringTypeInfo, // all partition column types should be string
                tblAlias,
                true); // partition columns are virtual columns
            signature.add(colInfo);
        }
        inputRS.setSignature(signature);
        if (!isBlockMerge) {
            // create another DynamicPartitionCtx, which has a different input-to-DP column mapping
            DynamicPartitionCtx dpCtx2 = new DynamicPartitionCtx(dpCtx);
            fsOutputDesc.setDynPartCtx(dpCtx2);
        }
        // update the FileSinkOperator to include partition columns
        usePartitionColumns(fsInputDesc.getTableInfo().getProperties(), fsInputDesc.getTable(), dpCtx.getDPColNames());
    } else {
        // non-partitioned table
        fsInputDesc.getTableInfo().getProperties().remove(org.apache.hadoop.hive.metastore.api.hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS);
    }
    // 
    // 2. Constructing a conditional task consisting of a move task and a map reduce task
    // 
    Path inputDirName = fsInputDesc.getMergeInputDirName();
    MapWork cplan;
    Serializable work;
    if (isBlockMerge) {
        cplan = GenMapRedUtils.createMergeTask(fsInputDesc, finalName, dpCtx != null && dpCtx.getNumDPCols() > 0, fsInput.getCompilationOpContext());
        if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
            work = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
            cplan.setName("File Merge");
            ((TezWork) work).add(cplan);
        } else if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
            work = new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
            cplan.setName("Spark Merge File Work");
            ((SparkWork) work).add(cplan);
        } else {
            work = cplan;
        }
    } else {
        cplan = createMRWorkForMergingFiles(conf, tsMerge, fsInputDesc);
        if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("tez")) {
            work = new TezWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID), conf);
            cplan.setName("File Merge");
            ((TezWork) work).add(cplan);
        } else if (conf.getVar(ConfVars.HIVE_EXECUTION_ENGINE).equals("spark")) {
            work = new SparkWork(conf.getVar(HiveConf.ConfVars.HIVEQUERYID));
            cplan.setName("Spark Merge File Work");
            ((SparkWork) work).add(cplan);
        } else {
            work = new MapredWork();
            ((MapredWork) work).setMapWork(cplan);
        }
    }
    // use CombineHiveInputFormat for map-only merging
    cplan.setInputformat("org.apache.hadoop.hive.ql.io.CombineHiveInputFormat");
    // NOTE: we should gather stats in MR1 rather than MR2 of the merge job, since we don't
    // know whether merge MR2 will be triggered at execution time
    MoveWork dummyMv = null;
    if (srcMmWriteId == null) {
        // Only create the MoveWork for a non-MM table. No action is needed for an MM table.
        dummyMv = new MoveWork(null, null, null, new LoadFileDesc(inputDirName, finalName, true, null, null, false), false);
    }
    // Use the original fsOp path here in case of MM - while the new FSOP merges files inside the
    // MM directory, the original MoveTask still commits based on the parent. Note that this path
    // can only be triggered for a merge that's part of insert for now; MM tables do not support
    // concatenate. Keeping the old logic for non-MM tables with temp directories and stuff.
    Path fsopPath = srcMmWriteId != null ? fsInputDesc.getFinalDirName() : finalName;
    Task<MoveWork> mvTask = GenMapRedUtils.findMoveTaskForFsopOutput(mvTasks, fsopPath, fsInputDesc.isMmTable(), fsInputDesc.isDirectInsert(), fsInputDesc.getMoveTaskId(), fsInputDesc.getAcidOperation());
    ConditionalTask cndTsk = GenMapRedUtils.createCondTask(conf, currTask, dummyMv, work, fsInputDesc.getMergeInputDirName(), finalName, mvTask, dependencyTask, lineageState);
    // keep the dynamic partition context in conditional task resolver context
    ConditionalResolverMergeFilesCtx mrCtx = (ConditionalResolverMergeFilesCtx) cndTsk.getResolverCtx();
    mrCtx.setDPCtx(fsInputDesc.getDynPartCtx());
    mrCtx.setLbCtx(fsInputDesc.getLbCtx());
}
Also used : Path(org.apache.hadoop.fs.Path) MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Serializable(java.io.Serializable) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) FileSinkDesc(org.apache.hadoop.hive.ql.plan.FileSinkDesc) DynamicPartitionCtx(org.apache.hadoop.hive.ql.plan.DynamicPartitionCtx) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ConditionalResolverMergeFilesCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) RCFileInputFormat(org.apache.hadoop.hive.ql.io.RCFileInputFormat) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) TezWork(org.apache.hadoop.hive.ql.plan.TezWork)

Aggregations

ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask): 29 usages
Task (org.apache.hadoop.hive.ql.exec.Task): 24 usages
ArrayList (java.util.ArrayList): 19 usages
Serializable (java.io.Serializable): 16 usages
List (java.util.List): 15 usages
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 14 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 13 usages
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask): 13 usages
Path (org.apache.hadoop.fs.Path): 11 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 11 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 11 usages
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 10 usages
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 9 usages
ConditionalWork (org.apache.hadoop.hive.ql.plan.ConditionalWork): 9 usages
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 8 usages
SparkTask (org.apache.hadoop.hive.ql.exec.spark.SparkTask): 8 usages
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 8 usages
SparkWork (org.apache.hadoop.hive.ql.plan.SparkWork): 8 usages
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 7 usages
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 7 usages