Example 1 with ReplRemoveFirstIncLoadPendFlagDesc

Use of org.apache.hadoop.hive.ql.ddl.misc.flags.ReplRemoveFirstIncLoadPendFlagDesc in project hive by apache.

From the class IncrementalLoadTasksBuilder, the method build:

public Task<?> build(Context context, Hive hive, Logger log, TaskTracker tracker) throws Exception {
    long builderStartTime = System.currentTimeMillis();
    Task<?> evTaskRoot = TaskFactory.get(new DependencyCollectionWork());
    Task<?> taskChainTail = evTaskRoot;
    Long lastReplayedEvent = null;
    this.log = log;
    numIteration++;
    this.log.debug("Iteration num " + numIteration);
    while (iterator.hasNext() && tracker.canAddMoreTasks()) {
        FileStatus dir = iterator.next();
        String location = dir.getPath().toUri().toString();
        DumpMetaData eventDmd = new DumpMetaData(new Path(location), conf);
        if (!shouldReplayEvent(dir, eventDmd.getDumpType(), dbName)) {
            this.log.debug("Skipping event {} from {} for DB {} maxTasks: {}", eventDmd.getDumpType(), dir.getPath().toUri(), dbName, tracker.numberOfTasks());
            continue;
        }
        this.log.debug("Loading event {} from {} for DB {} maxTasks: {}", eventDmd.getDumpType(), dir.getPath().toUri(), dbName, tracker.numberOfTasks());
        // Event loads behave similarly to table loads, with one crucial difference:
        // event order is strict, and each event must be processed after the previous one.
        // The way we handle this strict order is as follows:
        // First, we start with a taskChainTail which is a dummy no-op task (a DependencyCollectionTask)
        // at the head of our event chain. For each event we process, we tell analyzeTableLoad to
        // create tasks that use the taskChainTail as a dependency. Then, we collect all those tasks
        // and introduce a new barrier task(also a DependencyCollectionTask) which depends on all
        // these tasks. Then, this barrier task becomes our new taskChainTail. Thus, we get a set of
        // tasks as follows:
        // 
        //          --->ev1.task1--                          --->ev2.task1--
        //         /               \                        /               \
        // evTaskRoot-->*---->ev1.task2---*--> ev1.barrierTask-->*---->ev2.task2---*->evTaskChainTail
        //         \               /                        \               /
        //          --->ev1.task3--                          --->ev2.task3--
        //
        // Once this entire chain is generated, we add evTaskRoot to rootTasks, so as to execute the
        // entire chain
        MessageHandler.Context mhContext = new MessageHandler.Context(dbName, location, taskChainTail, eventDmd, conf, hive, context, this.log, dumpDirectory, metricCollector);
        List<Task<?>> evTasks = analyzeEventLoad(mhContext);
        if ((evTasks != null) && (!evTasks.isEmpty())) {
            ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger, metricCollector, dir.getPath().getName(), eventDmd.getDumpType().toString(), dumpDirectory);
            Task<?> barrierTask = TaskFactory.get(replStateLogWork, conf);
            AddDependencyToLeaves function = new AddDependencyToLeaves(barrierTask);
            DAGTraversal.traverse(evTasks, function);
            this.log.debug("Updated taskChainTail from {}:{} to {}:{}", taskChainTail.getClass(), taskChainTail.getId(), barrierTask.getClass(), barrierTask.getId());
            tracker.addTaskList(taskChainTail.getChildTasks());
            taskChainTail = barrierTask;
        }
        lastReplayedEvent = eventDmd.getEventTo();
    }
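    // When no events remain to be loaded, clear the "first incremental load pending"
    // flag on the target database and append a final replication-state logging task
    // that records the last replayed event id.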
    if (!hasMoreWork()) {
        ReplRemoveFirstIncLoadPendFlagDesc desc = new ReplRemoveFirstIncLoadPendFlagDesc(dbName);
        Task<?> updateIncPendTask = TaskFactory.get(new DDLWork(inputs, outputs, desc, true, dumpDirectory, this.metricCollector), conf);
        taskChainTail.addDependentTask(updateIncPendTask);
        taskChainTail = updateIncPendTask;
        Map<String, String> dbProps = new HashMap<>();
        dbProps.put(ReplicationSpec.KEY.CURR_STATE_ID_SOURCE.toString(), String.valueOf(lastReplayedEvent));
        ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger, dbProps, dumpDirectory, metricCollector, shouldFailover);
        Task<?> barrierTask = TaskFactory.get(replStateLogWork, conf);
        taskChainTail.addDependentTask(barrierTask);
        this.log.debug("Added {}:{} as a precursor of barrier task {}:{}", taskChainTail.getClass(), taskChainTail.getId(), barrierTask.getClass(), barrierTask.getId());
    }
    this.log.info("REPL_INCREMENTAL_LOAD task-builder iteration #{}, duration : {} ms", numIteration, System.currentTimeMillis() - builderStartTime);
    return evTaskRoot;
}
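
The barrier-chaining pattern described in the comments can be shown in isolation. The helper below is a hypothetical sketch, not part of IncrementalLoadTasksBuilder: the chainEvent method and its explicit addDependentTask wiring stand in for what analyzeEventLoad does internally, and the import path assumed for DAGTraversal (org.apache.hadoop.hive.ql.exec.util) follows the Hive source layout. The calls themselves (TaskFactory.get, addDependentTask, DAGTraversal.traverse, AddDependencyToLeaves) are the same ones used in build above.

import java.util.List;

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.Task;
import org.apache.hadoop.hive.ql.exec.TaskFactory;
import org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves;
import org.apache.hadoop.hive.ql.exec.util.DAGTraversal;
import org.apache.hadoop.hive.ql.plan.DependencyCollectionWork;

// Hypothetical illustration only: chains one event's tasks between the current
// chain tail and a new barrier task, mirroring the pattern used in build().
public class BarrierChainSketch {

    static Task<?> chainEvent(Task<?> taskChainTail, List<Task<?>> eventTasks, HiveConf conf) {
        // Every task of this event depends on the previous barrier, so nothing
        // from this event starts before the previous event has fully completed.
        for (Task<?> eventTask : eventTasks) {
            taskChainTail.addDependentTask(eventTask);
        }
        // A fresh no-op barrier (a DependencyCollectionTask) is attached to every
        // leaf of this event's task DAG and becomes the tail for the next event.
        Task<?> barrierTask = TaskFactory.get(new DependencyCollectionWork(), conf);
        DAGTraversal.traverse(eventTasks, new AddDependencyToLeaves(barrierTask));
        return barrierTask;
    }
}

Starting from evTaskRoot as the first tail and calling chainEvent once per event reproduces the evTaskRoot --> ev1.barrierTask --> ... --> evTaskChainTail shape from the diagram above.
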
Also used : Path(org.apache.hadoop.fs.Path) Context(org.apache.hadoop.hive.ql.Context) Task(org.apache.hadoop.hive.ql.exec.Task) FileStatus(org.apache.hadoop.fs.FileStatus) MessageHandler(org.apache.hadoop.hive.ql.parse.repl.load.message.MessageHandler) ReplStateLogWork(org.apache.hadoop.hive.ql.exec.repl.ReplStateLogWork) HashMap(java.util.HashMap) DumpMetaData(org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData) DependencyCollectionWork(org.apache.hadoop.hive.ql.plan.DependencyCollectionWork) DDLWork(org.apache.hadoop.hive.ql.ddl.DDLWork) ReplRemoveFirstIncLoadPendFlagDesc(org.apache.hadoop.hive.ql.ddl.misc.flags.ReplRemoveFirstIncLoadPendFlagDesc) AddDependencyToLeaves(org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves)

Aggregations

HashMap (java.util.HashMap) 1
FileStatus (org.apache.hadoop.fs.FileStatus) 1
Path (org.apache.hadoop.fs.Path) 1
Context (org.apache.hadoop.hive.ql.Context) 1
DDLWork (org.apache.hadoop.hive.ql.ddl.DDLWork) 1
ReplRemoveFirstIncLoadPendFlagDesc (org.apache.hadoop.hive.ql.ddl.misc.flags.ReplRemoveFirstIncLoadPendFlagDesc) 1
Task (org.apache.hadoop.hive.ql.exec.Task) 1
ReplStateLogWork (org.apache.hadoop.hive.ql.exec.repl.ReplStateLogWork) 1
AddDependencyToLeaves (org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves) 1
DumpMetaData (org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData) 1
MessageHandler (org.apache.hadoop.hive.ql.parse.repl.load.message.MessageHandler) 1
DependencyCollectionWork (org.apache.hadoop.hive.ql.plan.DependencyCollectionWork) 1