Search in sources :

Example 1 with DependencyCollectionTask

Use of org.apache.hadoop.hive.ql.exec.DependencyCollectionTask in the Apache Hive project.

In class GenMapRedUtils, method createCondTask:

/**
 * Builds a ConditionalTask for the given leaf task out of the supplied MoveWork and merge work,
 * and attaches it as a dependent of that leaf task.
 *
 * <p>The conditional task is given three candidate sub-tasks, in this order: a move-only task,
 * a merge-only task, and a merge task that has a move task as its dependent. A
 * ConditionalResolverMergeFiles, configured with the conditional input path, is installed as
 * the resolver.
 *
 * @param conf
 *          HiveConf
 * @param currTask
 *          current leaf task; the new conditional task becomes its dependent
 * @param dummyMoveWork
 *          MoveWork for the move task
 * @param mergeWork
 *          MapredWork for the merge task
 * @param condInputPath
 *          the input directory of the merge/move task
 * @param condOutputPath
 *          the output directory of the merge/move task
 * @param moveTaskToLink
 *          a MoveTask that may be linked to the conditional sub-tasks
 * @param dependencyTask
 *          a dependency task that may be linked to the conditional sub-tasks
 * @return the conditional task
 */
private static ConditionalTask createCondTask(HiveConf conf, Task<? extends Serializable> currTask, MoveWork dummyMoveWork, Serializable mergeWork, Path condInputPath, Path condOutputPath, Task<MoveWork> moveTaskToLink, DependencyCollectionTask dependencyTask) {
    // Merging the move paths is only considered when there is a move task to link and no
    // dependency task; the final decision is left to shouldMergeMovePaths(...).
    final boolean useMergedMovePaths = moveTaskToLink != null
        && dependencyTask == null
        && shouldMergeMovePaths(conf, condInputPath, condOutputPath, moveTaskToLink.getWork());
    final MoveWork moveOnlyWork = useMergedMovePaths
        ? mergeMovePaths(condInputPath, moveTaskToLink.getWork())
        : dummyMoveWork;

    // The ConditionalTask can end up doing one of three things:
    //   1) merge the partitions;
    //   2) move the partitions (i.e. do not merge them);
    //   3) merge some partitions and move the others - here the merge runs first and the move
    //      follows, to prevent conflicts.
    Task<? extends Serializable> moveOnlyTask;
    Task<? extends Serializable> mergeOnlyTask;
    Task<? extends Serializable> mergeThenMoveMerge;
    Task<? extends Serializable> mergeThenMoveMove;
    mergeOnlyTask = TaskFactory.get(mergeWork, conf);
    moveOnlyTask = TaskFactory.get(moveOnlyWork, conf);
    mergeThenMoveMerge = TaskFactory.get(mergeWork, conf);
    mergeThenMoveMove = TaskFactory.get(dummyMoveWork, conf);

    // NOTE: the merge task has to be the parent of the move task (never the reverse) so that
    // ConditionalTask's execute method behaves properly.
    mergeThenMoveMerge.addDependentTask(mergeThenMoveMove);

    List<Task<? extends Serializable>> candidateTasks = new ArrayList<>();
    candidateTasks.add(moveOnlyTask);
    candidateTasks.add(mergeOnlyTask);
    candidateTasks.add(mergeThenMoveMerge);

    List<Serializable> candidateWorks = new ArrayList<>();
    candidateWorks.add(moveOnlyWork);
    candidateWorks.add(mergeWork);

    ConditionalTask conditionalTask =
        (ConditionalTask) TaskFactory.get(new ConditionalWork(candidateWorks), conf);
    conditionalTask.setListTasks(candidateTasks);

    // Install the resolver together with its context.
    conditionalTask.setResolver(new ConditionalResolverMergeFiles());
    conditionalTask.setResolverCtx(
        new ConditionalResolverMergeFilesCtx(candidateTasks, condInputPath.toString()));

    // Hang the conditional task below the current leaf task.
    currTask.addDependentTask(conditionalTask);

    if (!useMergedMovePaths) {
        addDependentMoveTasks(moveTaskToLink, conf, moveOnlyTask, dependencyTask);
    } else if (moveTaskToLink.getDependentTasks() != null) {
        // A new MoveWork was created, so every task that depends on the move task to link
        // must be linked to the move-only task instead.
        for (Task linkedChild : moveTaskToLink.getDependentTasks()) {
            moveOnlyTask.addDependentTask(linkedChild);
        }
    }
    addDependentMoveTasks(moveTaskToLink, conf, mergeOnlyTask, dependencyTask);
    addDependentMoveTasks(moveTaskToLink, conf, mergeThenMoveMove, dependencyTask);

    return conditionalTask;
}
Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) ConditionalResolverMergeFiles(org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles) Serializable(java.io.Serializable) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) DependencyCollectionTask(org.apache.hadoop.hive.ql.exec.DependencyCollectionTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) ArrayList(java.util.ArrayList) ConditionalWork(org.apache.hadoop.hive.ql.plan.ConditionalWork) ConditionalResolverMergeFilesCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx)

Aggregations

Serializable (java.io.Serializable)1 ArrayList (java.util.ArrayList)1 ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask)1 DependencyCollectionTask (org.apache.hadoop.hive.ql.exec.DependencyCollectionTask)1 MoveTask (org.apache.hadoop.hive.ql.exec.MoveTask)1 Task (org.apache.hadoop.hive.ql.exec.Task)1 MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask)1 SparkTask (org.apache.hadoop.hive.ql.exec.spark.SparkTask)1 ConditionalResolverMergeFiles (org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles)1 ConditionalResolverMergeFilesCtx (org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx)1 ConditionalWork (org.apache.hadoop.hive.ql.plan.ConditionalWork)1 MoveWork (org.apache.hadoop.hive.ql.plan.MoveWork)1