Search in sources :

Example 16 with MoveWork

use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

the class GenMapRedUtils method findMoveTask.

/**
 * Looks up the move task that consumes the output of the given file sink operator.
 *
 * A task matches when the source path of its load-file work (or, failing that, its
 * load-table work) equals the final directory name of the operator's configuration.
 *
 * @param mvTasks candidate move tasks, examined in iteration order
 * @param fsOp the file sink operator whose final directory is being matched
 * @return the first matching move task, or {@code null} when none matches
 */
public static Task<MoveWork> findMoveTask(List<Task<MoveWork>> mvTasks, FileSinkOperator fsOp) {
    for (Task<MoveWork> candidate : mvTasks) {
        MoveWork work = candidate.getWork();
        Path source;
        if (work.getLoadFileWork() != null) {
            // Load-file work takes precedence over load-table work.
            source = work.getLoadFileWork().getSourcePath();
        } else if (work.getLoadTableWork() != null) {
            source = work.getLoadTableWork().getSourcePath();
        } else {
            // Neither kind of load work: nothing to compare against.
            continue;
        }
        if (source == null) {
            continue;
        }
        if (source.equals(fsOp.getConf().getFinalDirName())) {
            return candidate;
        }
    }
    return null;
}
Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) Path(org.apache.hadoop.fs.Path)

Example 17 with MoveWork

use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

the class GenMapRedUtils method mergeMovePaths.

/**
   * Collapses the ConditionalTask input path and the linked MoveWork into a single MoveWork.
   * This is an optimization for BlobStore systems that avoids an unnecessary second rename or copy.
   *
   * @param condInputPath A path that the ConditionalTask uses as input for its sub-tasks.
   * @param linkedMoveWork A MoveWork that the ConditionalTask uses to link to its sub-tasks.
   * @return A copy of linkedMoveWork whose load-file or load-table descriptor reads from condInputPath.
   * @throws IllegalArgumentException if linkedMoveWork has neither load-file nor load-table work.
   */
@VisibleForTesting
protected static MoveWork mergeMovePaths(Path condInputPath, MoveWork linkedMoveWork) {
    MoveWork merged = new MoveWork(linkedMoveWork);

    if (linkedMoveWork.getLoadFileWork() != null) {
        // Copy the file descriptor and point it at the conditional input path.
        LoadFileDesc redirectedFileDesc = new LoadFileDesc(linkedMoveWork.getLoadFileWork());
        redirectedFileDesc.setSourcePath(condInputPath);
        merged.setLoadFileWork(redirectedFileDesc);
        merged.setLoadTableWork(null);
        return merged;
    }

    if (linkedMoveWork.getLoadTableWork() != null) {
        // Copy the table descriptor and point it at the conditional input path.
        LoadTableDesc redirectedTableDesc = new LoadTableDesc(linkedMoveWork.getLoadTableWork());
        redirectedTableDesc.setSourcePath(condInputPath);
        merged.setLoadFileWork(null);
        merged.setLoadTableWork(redirectedTableDesc);
        return merged;
    }

    throw new IllegalArgumentException("Merging a path with a MoveWork with multi-files work is not allowed.");
}
Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) LoadFileDesc(org.apache.hadoop.hive.ql.plan.LoadFileDesc) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 18 with MoveWork

use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

the class GenMapRedUtils method createCondTask.

/**
   * Construct a conditional task given the current leaf task, the MoveWork and the MapredWork.
   *
   * The resulting ConditionalTask carries three candidate sub-tasks, in this order:
   * a move-only task, a merge-only task, and a merge task that has a move task as its
   * dependent. A ConditionalResolverMergeFiles, configured with that task list and the
   * conditional input path, is installed as the resolver. The conditional task is added
   * as a dependent of currTask before this method returns.
   *
   * When moveTaskToLink is non-null, dependencyTask is null and shouldMergeMovePaths(...)
   * returns true, the move-only task uses a MoveWork produced by mergeMovePaths(...) and
   * takes over the dependent tasks of moveTaskToLink directly; otherwise it uses
   * dummyMoveWork and is wired through addDependentMoveTasks(...).
   *
   * @param conf
   *          HiveConf
   * @param currTask
   *          current leaf task; the conditional task is added as its dependent
   * @param dummyMoveWork
   *          MoveWork for the move task
   * @param mergeWork
   *          work for the merge task (declared as Serializable; the original comment
   *          describes it as a MapredWork)
   * @param condInputPath
   *          the input directory of the merge/move task
   * @param condOutputPath
   *          the output directory of the merge/move task; in this method it is only
   *          passed to shouldMergeMovePaths(...)
   * @param moveTaskToLink
   *          a MoveTask that may be linked to the conditional sub-tasks; may be null
   * @param dependencyTask
   *          a dependency task that may be linked to the conditional sub-tasks; may be null
   * @return The conditional task
   */
private static ConditionalTask createCondTask(HiveConf conf, Task<? extends Serializable> currTask, MoveWork dummyMoveWork, Serializable mergeWork, Path condInputPath, Path condOutputPath, Task<MoveWork> moveTaskToLink, DependencyCollectionTask dependencyTask) {
    // Merging of move paths is only considered when there is a move task to link and no
    // dependency-collection task; the final decision is delegated to shouldMergeMovePaths(...).
    boolean shouldMergeMovePaths = (moveTaskToLink != null && dependencyTask == null && shouldMergeMovePaths(conf, condInputPath, condOutputPath, moveTaskToLink.getWork()));
    MoveWork workForMoveOnlyTask;
    if (shouldMergeMovePaths) {
        // Fold the linked move into the move-only branch so it reads straight from condInputPath.
        workForMoveOnlyTask = mergeMovePaths(condInputPath, moveTaskToLink.getWork());
    } else {
        workForMoveOnlyTask = dummyMoveWork;
    }
    // There are 3 options for this ConditionalTask:
    // 1) Merge the partitions
    // 2) Move the partitions (i.e. don't merge the partitions)
    // 3) Merge some partitions and move other partitions (i.e. merge some partitions and don't
    // merge others) in this case the merge is done first followed by the move to prevent
    // conflicts.
    // Note that mergeWork backs two distinct task objects, and that the merge-and-move
    // branch always moves with dummyMoveWork, never with the merged MoveWork.
    Task<? extends Serializable> mergeOnlyMergeTask = TaskFactory.get(mergeWork, conf);
    Task<? extends Serializable> moveOnlyMoveTask = TaskFactory.get(workForMoveOnlyTask, conf);
    Task<? extends Serializable> mergeAndMoveMergeTask = TaskFactory.get(mergeWork, conf);
    Task<? extends Serializable> mergeAndMoveMoveTask = TaskFactory.get(dummyMoveWork, conf);
    // NOTE! It is necessary merge task is the parent of the move task, and not
    // the other way around, for the proper execution of the execute method of
    // ConditionalTask
    mergeAndMoveMergeTask.addDependentTask(mergeAndMoveMoveTask);
    // The ConditionalWork lists only two works (move-only and merge); the task list below has three entries.
    List<Serializable> listWorks = new ArrayList<Serializable>();
    listWorks.add(workForMoveOnlyTask);
    listWorks.add(mergeWork);
    ConditionalWork cndWork = new ConditionalWork(listWorks);
    // Order of the sub-tasks: move-only, merge-only, merge-then-move.
    // NOTE(review): the resolver presumably relies on this ordering — confirm in ConditionalResolverMergeFiles.
    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
    listTasks.add(moveOnlyMoveTask);
    listTasks.add(mergeOnlyMergeTask);
    listTasks.add(mergeAndMoveMergeTask);
    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, conf);
    cndTsk.setListTasks(listTasks);
    // create resolver
    cndTsk.setResolver(new ConditionalResolverMergeFiles());
    ConditionalResolverMergeFilesCtx mrCtx = new ConditionalResolverMergeFilesCtx(listTasks, condInputPath.toString());
    cndTsk.setResolverCtx(mrCtx);
    // make the conditional task as the child of the current leaf task
    currTask.addDependentTask(cndTsk);
    if (shouldMergeMovePaths) {
        // If a new MoveWork was created, then we should link all dependent tasks from the MoveWork to link.
        // The move-only task replaces moveTaskToLink on this branch, so it inherits that task's dependents.
        if (moveTaskToLink.getDependentTasks() != null) {
            // NOTE(review): raw Task type in the loop variable; a parameterized type would avoid an unchecked call.
            for (Task dependentTask : moveTaskToLink.getDependentTasks()) {
                moveOnlyMoveTask.addDependentTask(dependentTask);
            }
        }
    } else {
        // moveTaskToLink and dependencyTask may each be null here; addDependentMoveTasks is
        // expected to cope with that — confirm against its implementation.
        addDependentMoveTasks(moveTaskToLink, conf, moveOnlyMoveTask, dependencyTask);
    }
    // The merge-only task and the move task at the end of the merge-then-move chain are
    // always wired through addDependentMoveTasks, regardless of shouldMergeMovePaths.
    addDependentMoveTasks(moveTaskToLink, conf, mergeOnlyMergeTask, dependencyTask);
    addDependentMoveTasks(moveTaskToLink, conf, mergeAndMoveMoveTask, dependencyTask);
    return cndTsk;
}
Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) ConditionalResolverMergeFiles(org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles) Serializable(java.io.Serializable) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) DependencyCollectionTask(org.apache.hadoop.hive.ql.exec.DependencyCollectionTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) ArrayList(java.util.ArrayList) ConditionalWork(org.apache.hadoop.hive.ql.plan.ConditionalWork) ConditionalResolverMergeFilesCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverMergeFiles.ConditionalResolverMergeFilesCtx)

Example 19 with MoveWork

use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

the class GenMapRedUtils method findMoveTaskForFsopOutput.

/**
 * Finds the move task whose source directory matches the final directory of a file sink.
 *
 * For each candidate, the source path is taken from its load-file work if present,
 * otherwise from its load-table work. When {@code isMmFsop} is true and the path came
 * from load-file work, the parent of that path is compared instead of the path itself.
 *
 * @param mvTasks candidate move tasks, examined in iteration order
 * @param fsopFinalDir the file sink's final directory to match against
 * @param isMmFsop whether the load-file source path should be compared by its parent directory
 * @return the first move task whose (possibly adjusted) source directory equals
 *         {@code fsopFinalDir}, or {@code null} when none matches
 */
public static Task<MoveWork> findMoveTaskForFsopOutput(List<Task<MoveWork>> mvTasks, Path fsopFinalDir, boolean isMmFsop) {
    // find the move task
    for (Task<MoveWork> mvTsk : mvTasks) {
        MoveWork mvWork = mvTsk.getWork();
        Path srcDir = null;
        boolean isLfd = false;
        if (mvWork.getLoadFileWork() != null) {
            srcDir = mvWork.getLoadFileWork().getSourcePath();
            isLfd = true;
            // Guard against a missing source path: the match below already treats a null
            // srcDir as "no match", so a null here must not blow up with an NPE first.
            if (isMmFsop && srcDir != null) {
                srcDir = srcDir.getParent();
            }
        } else if (mvWork.getLoadTableWork() != null) {
            srcDir = mvWork.getLoadTableWork().getSourcePath();
        }
        if (Utilities.FILE_OP_LOGGER.isTraceEnabled()) {
            Utilities.FILE_OP_LOGGER.trace("Observing MoveWork " + System.identityHashCode(mvWork) + " with " + srcDir + "(from " + (isLfd ? "LFD" : "LTD") + ") while looking for " + fsopFinalDir + "(mm = " + isMmFsop + ")");
        }
        // srcDir is null when the work has no load descriptor, no source path, or (for the
        // parent lookup) when the source path has no parent.
        if ((srcDir != null) && srcDir.equals(fsopFinalDir)) {
            return mvTsk;
        }
    }
    return null;
}
Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) Path(org.apache.hadoop.fs.Path)

Example 20 with MoveWork

use of org.apache.hadoop.hive.ql.plan.MoveWork in project hive by apache.

the class LoadPartitions method movePartitionTask.

/**
 * Builds the task that moves a partition's data from its temporary path into the table.
 * The load type is REPLACE_ALL when the event's replication spec asks for a replace,
 * and OVERWRITE_EXISTING otherwise.
 */
private Task<?> movePartitionTask(Table table, AddPartitionDesc.OnePartitionDesc partSpec, Path tmpPath) {
    // Note: this sets LoadFileType incorrectly for ACID; is that relevant for load?
    // See setLoadFileType and setIsAcidIow calls elsewhere for an example.
    LoadTableDesc partitionLoad = new LoadTableDesc(
        tmpPath,
        Utilities.getTableDesc(table),
        partSpec.getPartSpec(),
        event.replicationSpec().isReplace() ? LoadFileType.REPLACE_ALL : LoadFileType.OVERWRITE_EXISTING,
        SessionState.get().getTxnMgr().getCurrentTxnId());
    partitionLoad.setInheritTableSpecs(false);

    // Wrap the load descriptor in a MoveWork with empty input/output sets and no file load.
    MoveWork moveWork = new MoveWork(new HashSet<>(), new HashSet<>(), partitionLoad, null, false);
    return TaskFactory.get(moveWork, context.hiveConf);
}
Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc)

Aggregations

MoveWork (org.apache.hadoop.hive.ql.plan.MoveWork)29 Path (org.apache.hadoop.fs.Path)21 LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc)16 LoadFileDesc (org.apache.hadoop.hive.ql.plan.LoadFileDesc)9 Test (org.junit.Test)7 ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask)6 HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)6 TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)6 Task (org.apache.hadoop.hive.ql.exec.Task)5 DDLWork (org.apache.hadoop.hive.ql.plan.DDLWork)5 Serializable (java.io.Serializable)4 ArrayList (java.util.ArrayList)4 Context (org.apache.hadoop.hive.ql.Context)4 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)4 MoveTask (org.apache.hadoop.hive.ql.exec.MoveTask)4 MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask)4 Partition (org.apache.hadoop.hive.ql.metadata.Partition)4 BasicStatsWork (org.apache.hadoop.hive.ql.plan.BasicStatsWork)4 StatsWork (org.apache.hadoop.hive.ql.plan.StatsWork)4 URISyntaxException (java.net.URISyntaxException)3