Search in sources :

Example 1 with MoveTask

use of org.apache.hadoop.hive.ql.exec.MoveTask in project hive by apache.

the class GenMapRedUtils method isMergeRequired.

/**
   * Decides whether the output written by the given file sink should be merged.
   *
   * <p>Side effect: when a move task is found for the sink, {@code isInsertTable} is set,
   * automatic stats gathering is enabled in the configuration and the sink is not a
   * materialization, the sink's descriptor is flagged for stats gathering and a stats
   * task is added after the move task unless one already follows it.
   *
   * @param mvTasks candidate move tasks; may be null or empty, in which case no merge is required
   * @param hconf configuration consulted for the stats and merge settings
   * @param fsOp file sink operator under consideration
   * @param currTask task whose work type selects which merge setting applies
   * @param isInsertTable gates the stats-gathering side effect described above
   * @return true iff the fsOp requires a merge
   */
public static boolean isMergeRequired(List<Task<MoveWork>> mvTasks, HiveConf hconf, FileSinkOperator fsOp, Task<? extends Serializable> currTask, boolean isInsertTable) {
    // Without any move task there is nothing to merge into.
    if (mvTasks == null || mvTasks.isEmpty()) {
        return false;
    }
    MoveTask linkedMove = (MoveTask) GenMapRedUtils.findMoveTask(mvTasks, fsOp);
    if (linkedMove == null) {
        return false;
    }

    boolean gatherStats = isInsertTable && hconf.getBoolVar(ConfVars.HIVESTATSAUTOGATHER) && !fsOp.getConf().isMaterialization();
    if (gatherStats) {
        // mark the FileSinkOperator for gathering stats and make sure a stats task follows the move
        fsOp.getConf().setGatherStats(true);
        fsOp.getConf().setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE));
        if (!linkedMove.hasFollowingStatsTask()) {
            GenMapRedUtils.addStatsTask(fsOp, linkedMove, currTask, hconf);
        }
    }

    // no need of merging if the move is to a local file system, or if the sink cannot be merged
    if (linkedMove.isLocal() || !fsOp.getConf().canBeMerged()) {
        return false;
    }

    Serializable work = currTask.getWork();
    if (work instanceof TezWork) {
        // tez has its own config for merging
        return hconf.getBoolVar(ConfVars.HIVEMERGETEZFILES);
    }
    if (work instanceof SparkWork) {
        // spark has its own config for merging
        return hconf.getBoolVar(ConfVars.HIVEMERGESPARKFILES);
    }
    if (fsOp.getConf().isLinkedFileSink()) {
        // For a linked file sink, merge aggressively: either merge setting is enough.
        return hconf.getBoolVar(ConfVars.HIVEMERGEMAPFILES) || hconf.getBoolVar(ConfVars.HIVEMERGEMAPREDFILES);
    }
    if (!(work instanceof MapredWork)) {
        return false;
    }
    // There are separate settings for a map-only job (no reduce work) and for a map-reduce job.
    boolean hasReducer = ((MapredWork) work).getReduceWork() != null;
    return hconf.getBoolVar(hasReducer ? ConfVars.HIVEMERGEMAPREDFILES : ConfVars.HIVEMERGEMAPFILES);
}
Also used : MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) TezWork(org.apache.hadoop.hive.ql.plan.TezWork)

Example 2 with MoveTask

use of org.apache.hadoop.hive.ql.exec.MoveTask in project hive by apache.

the class GenMapRedUtils method createCondTask.

/**
   * Builds the conditional task that chooses, at run time, between moving the output as is,
   * merging it, or merging followed by a move, and attaches it as a child of the current
   * leaf task.
   *
   * @param conf
   *          HiveConf
   * @param currTask
   *          current leaf task; the conditional task becomes its dependent
   * @param dummyMoveWork
   *          MoveWork for the move task
   * @param mergeWork
   *          work for the merge task
   * @param condInputPath
   *          the input directory of the merge/move task
   * @param condOutputPath
   *          the output directory of the merge/move task
   * @param moveTaskToLink
   *          a MoveTask that may be linked to the conditional sub-tasks
   * @param dependencyTask
   *          a dependency task that may be linked to the conditional sub-tasks
   * @return The conditional task
   */
private static ConditionalTask createCondTask(HiveConf conf, Task<? extends Serializable> currTask, MoveWork dummyMoveWork, Serializable mergeWork, Path condInputPath, Path condOutputPath, Task<MoveWork> moveTaskToLink, DependencyCollectionTask dependencyTask) {
    // The move-only branch can reuse the linked move's work (with merged paths) only when there is
    // a move task to link, no dependency task, and the helper check approves it.
    boolean mergedMovePaths = moveTaskToLink != null
        && dependencyTask == null
        && shouldMergeMovePaths(conf, condInputPath, condOutputPath, moveTaskToLink.getWork());
    MoveWork workForMoveOnlyTask = mergedMovePaths
        ? mergeMovePaths(condInputPath, moveTaskToLink.getWork())
        : dummyMoveWork;

    // The ConditionalTask has three alternatives:
    //   1) merge the partitions;
    //   2) move the partitions without merging;
    //   3) merge some partitions and move the others; here the merge runs first and the move
    //      second, to prevent conflicts.
    Task<? extends Serializable> mergeOnlyMergeTask = TaskFactory.get(mergeWork, conf);
    Task<? extends Serializable> moveOnlyMoveTask = TaskFactory.get(workForMoveOnlyTask, conf);
    Task<? extends Serializable> mergeAndMoveMergeTask = TaskFactory.get(mergeWork, conf);
    Task<? extends Serializable> mergeAndMoveMoveTask = TaskFactory.get(dummyMoveWork, conf);

    // NOTE: the merge task has to be the parent of the move task (never the reverse); the
    // execute method of ConditionalTask depends on that.
    mergeAndMoveMergeTask.addDependentTask(mergeAndMoveMoveTask);

    List<Serializable> branchWorks = new ArrayList<Serializable>();
    branchWorks.add(workForMoveOnlyTask);
    branchWorks.add(mergeWork);
    ConditionalWork conditionalWork = new ConditionalWork(branchWorks);

    List<Task<? extends Serializable>> branchTasks = new ArrayList<Task<? extends Serializable>>();
    branchTasks.add(moveOnlyMoveTask);
    branchTasks.add(mergeOnlyMergeTask);
    branchTasks.add(mergeAndMoveMergeTask);

    ConditionalTask conditionalTask = (ConditionalTask) TaskFactory.get(conditionalWork, conf);
    conditionalTask.setListTasks(branchTasks);

    // The resolver and its context decide which branch runs.
    conditionalTask.setResolver(new ConditionalResolverMergeFiles());
    conditionalTask.setResolverCtx(new ConditionalResolverMergeFilesCtx(branchTasks, condInputPath.toString()));

    // Hang the conditional task under the current leaf task.
    currTask.addDependentTask(conditionalTask);

    if (!mergedMovePaths) {
        addDependentMoveTasks(moveTaskToLink, conf, moveOnlyMoveTask, dependencyTask);
    } else if (moveTaskToLink.getDependentTasks() != null) {
        // A new MoveWork replaced the linked move, so its dependents must follow the move-only task.
        for (Task dependentTask : moveTaskToLink.getDependentTasks()) {
            moveOnlyMoveTask.addDependentTask(dependentTask);
        }
    }
    addDependentMoveTasks(moveTaskToLink, conf, mergeOnlyMergeTask, dependencyTask);
    addDependentMoveTasks(moveTaskToLink, conf, mergeAndMoveMoveTask, dependencyTask);

    return conditionalTask;
}
Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) ConditionalResolverMergeFiles(org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles) Serializable(java.io.Serializable) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) DependencyCollectionTask(org.apache.hadoop.hive.ql.exec.DependencyCollectionTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) ArrayList(java.util.ArrayList) ConditionalWork(org.apache.hadoop.hive.ql.plan.ConditionalWork) ConditionalResolverMergeFilesCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx)

Aggregations

MoveTask (org.apache.hadoop.hive.ql.exec.MoveTask)2 Serializable (java.io.Serializable)1 ArrayList (java.util.ArrayList)1 ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask)1 DependencyCollectionTask (org.apache.hadoop.hive.ql.exec.DependencyCollectionTask)1 Task (org.apache.hadoop.hive.ql.exec.Task)1 MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask)1 SparkTask (org.apache.hadoop.hive.ql.exec.spark.SparkTask)1 ConditionalResolverMergeFiles (org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles)1 ConditionalResolverMergeFilesCtx (org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx)1 ConditionalWork (org.apache.hadoop.hive.ql.plan.ConditionalWork)1 MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork)1 MoveWork (org.apache.hadoop.hive.ql.plan.MoveWork)1 ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork)1 SparkWork (org.apache.hadoop.hive.ql.plan.SparkWork)1 TezWork (org.apache.hadoop.hive.ql.plan.TezWork)1