Example 1 with AddDependencyToLeaves

Use of org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves in project hive by apache.

In class ReplLoadTask, method createEndReplLogTask.

private void createEndReplLogTask(Context context, Scope scope, ReplLogger replLogger) throws SemanticException {
    Map<String, String> dbProps;
    if (work.isIncrementalLoad()) {
        dbProps = new HashMap<>();
        dbProps.put(ReplicationSpec.KEY.CURR_STATE_ID_SOURCE.toString(), work.incrementalLoadTasksBuilder().eventTo().toString());
    } else {
        Database dbInMetadata = work.databaseEvent(context.hiveConf).dbInMetadata(work.dbNameToLoadIn);
        dbProps = dbInMetadata.getParameters();
    }
    ReplStateLogWork replLogWork = new ReplStateLogWork(replLogger, dbProps, (new Path(work.dumpDirectory).getParent()).toString(), work.getMetricCollector(), work.shouldFailover());
    Task<ReplStateLogWork> replLogTask = TaskFactory.get(replLogWork, conf);
    if (scope.rootTasks.isEmpty()) {
        scope.rootTasks.add(replLogTask);
    } else {
        DAGTraversal.traverse(scope.rootTasks, new AddDependencyToLeaves(Collections.singletonList(replLogTask)));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) LoadDatabase(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadDatabase) Database(org.apache.hadoop.hive.metastore.api.Database) AlterDatabase(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadDatabase.AlterDatabase) AddDependencyToLeaves(org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves)
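
All of these examples rely on the same mechanic: DAGTraversal walks the task DAG and AddDependencyToLeaves appends a given task to every leaf, so the new task only runs after all existing work has finished. The following self-contained sketch illustrates that idea with a toy TaskNode class; it is not Hive's real Task or DAGTraversal API, just a minimal breadth-first reimplementation of the leaf-attachment pattern.

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Deque;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class AddDependencyToLeavesSketch {

    // Toy stand-in for Hive's task DAG node; the real class is org.apache.hadoop.hive.ql.exec.Task.
    static class TaskNode {
        final String name;
        final List<TaskNode> children = new ArrayList<>();
        TaskNode(String name) { this.name = name; }
        void addDependentTask(TaskNode child) { children.add(child); }
    }

    // Breadth-first walk over the DAG: every node without children gets the new task appended.
    // This is roughly what DAGTraversal.traverse(roots, new AddDependencyToLeaves(task)) does.
    static void addDependencyToLeaves(List<TaskNode> roots, TaskNode toAttach) {
        Deque<TaskNode> queue = new ArrayDeque<>(roots);
        Set<TaskNode> visited = new HashSet<>();
        while (!queue.isEmpty()) {
            TaskNode current = queue.poll();
            if (!visited.add(current)) {
                continue; // a node can be reachable through several parents
            }
            if (current.children.isEmpty()) {
                current.addDependentTask(toAttach); // leaf: the new task runs after it
            } else {
                queue.addAll(current.children);
            }
        }
    }

    public static void main(String[] args) {
        TaskNode root = new TaskNode("createDb");
        TaskNode loadA = new TaskNode("loadTableA");
        TaskNode loadB = new TaskNode("loadTableB");
        root.addDependentTask(loadA);
        root.addDependentTask(loadB);

        TaskNode replLogTask = new TaskNode("replStateLog");
        addDependencyToLeaves(List.of(root), replLogTask);

        // Both leaves now point at replLogTask, so the log task only runs after all load work.
        System.out.println(loadA.children.get(0).name + ", " + loadB.children.get(0).name);
    }
}

The sketch also shows why createEndReplLogTask adds replLogTask directly to rootTasks when scope.rootTasks is empty: with no existing DAG there are no leaves to attach the log task to.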

Example 2 with AddDependencyToLeaves

Use of org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves in project hive by apache.

In class ReplLoadTask, method createBuilderTask.

private void createBuilderTask(List<Task<?>> rootTasks) {
    // Use loadTask as dependencyCollection
    Task<ReplLoadWork> loadTask = TaskFactory.get(work, conf);
    DAGTraversal.traverse(rootTasks, new AddDependencyToLeaves(loadTask));
}
Also used : AddDependencyToLeaves(org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves)

Example 3 with AddDependencyToLeaves

Use of org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves in project hive by apache.

In class ReplLoadTask, method executeIncrementalLoad.

private int executeIncrementalLoad(long loadStartTime) throws Exception {
    // If the replication policy changed between the previous and the current dump, drop the tables
    // that are excluded in the new replication policy.
    if (work.replScopeModified) {
        dropTablesExcludedInReplScope(work.currentReplScope);
    }
    Database targetDb = getHive().getDatabase(work.dbNameToLoadIn);
    Map<String, String> props = new HashMap<>();
    // Check if this is an optimised bootstrap failover.
    if (work.isFirstFailover) {
        // The database must not already be marked as a target of replication, and it must have been a source of replication.
        if (MetaStoreUtils.isTargetOfReplication(targetDb)) {
            LOG.error("The database {} is already marked as target for replication", targetDb.getName());
            throw new Exception("Failover target is already marked as target");
        }
        if (!ReplChangeManager.isSourceOfReplication(targetDb)) {
            LOG.error("The database {} is already source of replication.", targetDb.getName());
            throw new Exception("Failover target was not source of replication");
        }
        boolean isTableDiffPresent = checkFileExists(new Path(work.dumpDirectory).getParent(), conf, TABLE_DIFF_COMPLETE_DIRECTORY);
        Long eventId = Long.parseLong(getEventIdFromFile(new Path(work.dumpDirectory).getParent(), conf)[0]);
        if (!isTableDiffPresent) {
            prepareTableDiffFile(eventId, getHive(), work, conf);
            if (this.childTasks == null) {
                this.childTasks = new ArrayList<>();
            }
            createReplLoadCompleteAckTask();
            return 0;
        }
    } else if (work.isSecondFailover) {
        // DROP the tables to be bootstrapped.
        Hive db = getHive();
        for (String table : work.tablesToBootstrap) {
            db.dropTable(work.dbNameToLoadIn + "." + table, true);
        }
    }
    if (!MetaStoreUtils.isTargetOfReplication(targetDb)) {
        props.put(ReplConst.TARGET_OF_REPLICATION, ReplConst.TRUE);
    }
    if (!work.shouldFailover() && MetaStoreUtils.isDbBeingFailedOver(targetDb)) {
        props.put(ReplConst.REPL_FAILOVER_ENDPOINT, "");
    }
    if (!props.isEmpty()) {
        AlterDatabaseSetPropertiesDesc setTargetDesc = new AlterDatabaseSetPropertiesDesc(work.dbNameToLoadIn, props, null);
        Task<?> addReplTargetPropTask = TaskFactory.get(new DDLWork(new HashSet<>(), new HashSet<>(), setTargetDesc, true, work.dumpDirectory, work.getMetricCollector()), conf);
        if (this.childTasks == null) {
            this.childTasks = new ArrayList<>();
        }
        this.childTasks.add(addReplTargetPropTask);
    }
    IncrementalLoadTasksBuilder builder = work.incrementalLoadTasksBuilder();
    // If all incremental events are already applied, check whether any tables still need to be bootstrapped.
    if (!builder.hasMoreWork() && work.isLastReplIDUpdated()) {
        if (work.hasBootstrapLoadTasks()) {
            LOG.debug("Current incremental dump have tables to be bootstrapped. Switching to bootstrap " + "mode after applying all events.");
            return executeBootStrapLoad();
        }
    }
    List<Task<?>> childTasks = new ArrayList<>();
    int maxTasks = conf.getIntVar(HiveConf.ConfVars.REPL_APPROX_MAX_LOAD_TASKS);
    TaskTracker tracker = new TaskTracker(maxTasks);
    addLazyDataCopyTask(tracker, builder.getReplLogger());
    childTasks.add(builder.build(context, getHive(), LOG, tracker));
    // Update the database's last repl id so that the next
    // incremental cycle won't consider the events in this dump again if it starts from this id.
    if (!builder.hasMoreWork()) {
        // The name of the database to be loaded into is either specified directly in the REPL LOAD
        // command (i.e. when dbNameToLoadIn holds a valid database name) or is available through the
        // dump metadata during table-level replication.
        String dbName = work.dbNameToLoadIn;
        if (dbName == null || StringUtils.isBlank(dbName)) {
            if (work.currentReplScope != null) {
                String replScopeDbName = work.currentReplScope.getDbName();
                if (replScopeDbName != null && !"*".equals(replScopeDbName)) {
                    dbName = replScopeDbName;
                }
            }
        }
        // Update the database's last repl id, which is only possible when a single target database name is known.
        if (StringUtils.isNotBlank(dbName)) {
            String lastEventid = builder.eventTo().toString();
            Map<String, String> mapProp = new HashMap<>();
            mapProp.put(ReplicationSpec.KEY.CURR_STATE_ID_SOURCE.toString(), lastEventid);
            AlterDatabaseSetPropertiesDesc alterDbDesc = new AlterDatabaseSetPropertiesDesc(dbName, mapProp, new ReplicationSpec(lastEventid, lastEventid));
            Task<?> updateReplIdTask = TaskFactory.get(new DDLWork(new HashSet<>(), new HashSet<>(), alterDbDesc, true, (new Path(work.dumpDirectory).getParent()).toString(), work.getMetricCollector()), conf);
            DAGTraversal.traverse(childTasks, new AddDependencyToLeaves(updateReplIdTask));
            work.setLastReplIDUpdated(true);
            LOG.debug("Added task to set last repl id of db " + dbName + " to " + lastEventid);
        }
    }
    // Once all the incremental events are applied, enable bootstrap of tables if any exist.
    if (builder.hasMoreWork() || work.hasBootstrapLoadTasks()) {
        DAGTraversal.traverse(childTasks, new AddDependencyToLeaves(TaskFactory.get(work, conf)));
    }
    if (this.childTasks == null) {
        this.childTasks = new ArrayList<>();
    }
    this.childTasks.addAll(childTasks);
    createReplLoadCompleteAckTask();
    // Clean-up snapshots
    if (conf.getBoolVar(REPL_SNAPSHOT_DIFF_FOR_EXTERNAL_TABLE_COPY)) {
        cleanupSnapshots(new Path(work.getDumpDirectory()).getParent().getParent().getParent(), work.getSourceDbName().toLowerCase(), conf, null, true);
    }
    // pass the current time at the end of repl-load stage as the starting time of the first event.
    long currentTimestamp = System.currentTimeMillis();
    ((IncrementalLoadLogger) work.incrementalLoadTasksBuilder().getReplLogger()).initiateEventTimestamp(currentTimestamp);
    LOG.info("REPL_INCREMENTAL_LOAD stage duration : {} ms", currentTimestamp - loadStartTime);
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) TaskTracker(org.apache.hadoop.hive.ql.exec.repl.util.TaskTracker) Task(org.apache.hadoop.hive.ql.exec.Task) ReplicationSpec(org.apache.hadoop.hive.ql.parse.ReplicationSpec) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) ArrayList(java.util.ArrayList) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) TException(org.apache.thrift.TException) IOException(java.io.IOException) LoadConstraint(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadConstraint) IncrementalLoadLogger(org.apache.hadoop.hive.ql.parse.repl.load.log.IncrementalLoadLogger) Hive(org.apache.hadoop.hive.ql.metadata.Hive) IncrementalLoadTasksBuilder(org.apache.hadoop.hive.ql.exec.repl.incremental.IncrementalLoadTasksBuilder) DDLWork(org.apache.hadoop.hive.ql.ddl.DDLWork) LoadDatabase(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadDatabase) Database(org.apache.hadoop.hive.metastore.api.Database) AlterDatabase(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadDatabase.AlterDatabase) AlterDatabaseSetPropertiesDesc(org.apache.hadoop.hive.ql.ddl.database.alter.poperties.AlterDatabaseSetPropertiesDesc) HashSet(java.util.HashSet) AddDependencyToLeaves(org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves)
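
Setting the failover checks aside, the core of the property handling above is: mark the database as a replication target if it is not already, clear a stale failover endpoint once failover is no longer in progress, and apply whatever changed through a single ALTER DATABASE task. Below is a hedged, self-contained sketch of that decision logic; the property keys and the helper method are illustrative stand-ins, not Hive's actual ReplConst values or MetaStoreUtils methods.

import java.util.HashMap;
import java.util.Map;

public class TargetDbPropsSketch {

    // Illustrative key names; the real constants live in ReplConst and may differ.
    static final String TARGET_OF_REPLICATION = "repl.target.for";
    static final String REPL_FAILOVER_ENDPOINT = "repl.failover.endpoint";

    // Returns the properties an ALTER DATABASE ... SET DBPROPERTIES task would need to apply.
    static Map<String, String> propsToApply(Map<String, String> currentDbParams, boolean failoverInProgress) {
        Map<String, String> props = new HashMap<>();
        // Mark the database as a replication target unless it is already marked.
        if (!"true".equalsIgnoreCase(currentDbParams.get(TARGET_OF_REPLICATION))) {
            props.put(TARGET_OF_REPLICATION, "true");
        }
        // Clear a stale failover endpoint once failover is no longer in progress.
        String endpoint = currentDbParams.get(REPL_FAILOVER_ENDPOINT);
        if (!failoverInProgress && endpoint != null && !endpoint.isEmpty()) {
            props.put(REPL_FAILOVER_ENDPOINT, "");
        }
        // An empty map means no ALTER DATABASE task needs to be scheduled at all.
        return props;
    }

    public static void main(String[] args) {
        Map<String, String> dbParams = new HashMap<>();
        dbParams.put(REPL_FAILOVER_ENDPOINT, "TARGET");
        // Expect both keys in the result: the target marker set to true and the endpoint emptied.
        System.out.println(propsToApply(dbParams, false));
    }
}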

Example 4 with AddDependencyToLeaves

Use of org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves in project hive by apache.

In class ReplDumpTask, method initiateDataCopyTasks.

private void initiateDataCopyTasks() throws HiveException, IOException {
    TaskTracker taskTracker = new TaskTracker(conf.getIntVar(HiveConf.ConfVars.REPL_APPROX_MAX_LOAD_TASKS));
    if (childTasks == null) {
        childTasks = new ArrayList<>();
    }
    List<Task<?>> externalTableCopyTasks = work.externalTableCopyTasks(taskTracker, conf);
    childTasks.addAll(externalTableCopyTasks);
    LOG.debug("Scheduled {} external table copy tasks", externalTableCopyTasks.size());
    // If external table data copy tasks are present, add a task to mark the end of the data copy.
    if (!externalTableCopyTasks.isEmpty() && !work.getExternalTblCopyPathIterator().hasNext()) {
        ReplUtils.addLoggerTask(work.getReplLogger(), childTasks, conf);
    }
    childTasks.addAll(work.managedTableCopyTasks(taskTracker, conf));
    childTasks.addAll(work.functionsBinariesCopyTasks(taskTracker, conf));
    if (childTasks.isEmpty()) {
        // All table data copy work finished.
        finishRemainingTasks();
    } else {
        DAGTraversal.traverse(childTasks, new AddDependencyToLeaves(TaskFactory.get(work, conf)));
    }
}
Also used : TaskTracker(org.apache.hadoop.hive.ql.exec.repl.util.TaskTracker) Task(org.apache.hadoop.hive.ql.exec.Task) AddDependencyToLeaves(org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves)
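
The pattern worth noting in initiateDataCopyTasks is bounded batching: the TaskTracker caps how many copy tasks one execution may schedule, and if work remains, a fresh task for the same work object is attached to the leaves so the next execution resumes the copy. The sketch below mimics that control flow with a made-up list of copy jobs and a batch size; none of the names are Hive APIs.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class BoundedCopySchedulingSketch {

    // Pull at most maxTasksPerCycle jobs off the shared iterator; the iterator keeps the rest.
    static List<String> nextBatch(Iterator<String> pending, int maxTasksPerCycle) {
        List<String> batch = new ArrayList<>();
        while (pending.hasNext() && batch.size() < maxTasksPerCycle) {
            batch.add(pending.next());
        }
        return batch;
    }

    public static void main(String[] args) {
        // Stand-ins for the external/managed table copy work; in Hive this comes from the dump work.
        Iterator<String> pending = List.of("copy tableA", "copy tableB", "copy tableC").iterator();
        int maxTasksPerCycle = 2; // plays the role of REPL_APPROX_MAX_LOAD_TASKS

        List<String> batch = nextBatch(pending, maxTasksPerCycle);
        if (batch.isEmpty()) {
            // Nothing left to copy: run the final bookkeeping (finishRemainingTasks in Hive).
            System.out.println("all copy work finished");
        } else if (pending.hasNext()) {
            // More work remains: in Hive a new task for the same work object is hung off the
            // leaves of this batch (AddDependencyToLeaves), so the next cycle picks up from here.
            System.out.println("scheduled " + batch.size() + " copies, re-queue the dump work after them");
        } else {
            System.out.println("scheduled the final " + batch.size() + " copies");
        }
    }
}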

Example 5 with AddDependencyToLeaves

Use of org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves in project hive by apache.

In class IncrementalLoadTasksBuilder, method build.

public Task<?> build(Context context, Hive hive, Logger log, TaskTracker tracker) throws Exception {
    long builderStartTime = System.currentTimeMillis();
    Task<?> evTaskRoot = TaskFactory.get(new DependencyCollectionWork());
    Task<?> taskChainTail = evTaskRoot;
    Long lastReplayedEvent = null;
    this.log = log;
    numIteration++;
    this.log.debug("Iteration num " + numIteration);
    while (iterator.hasNext() && tracker.canAddMoreTasks()) {
        FileStatus dir = iterator.next();
        String location = dir.getPath().toUri().toString();
        DumpMetaData eventDmd = new DumpMetaData(new Path(location), conf);
        if (!shouldReplayEvent(dir, eventDmd.getDumpType(), dbName)) {
            this.log.debug("Skipping event {} from {} for DB {} maxTasks: {}", eventDmd.getDumpType(), dir.getPath().toUri(), dbName, tracker.numberOfTasks());
            continue;
        }
        this.log.debug("Loading event {} from {} for DB {} maxTasks: {}", eventDmd.getDumpType(), dir.getPath().toUri(), dbName, tracker.numberOfTasks());
        // Event loads behave similarly to table loads, with one crucial difference:
        // the precursor order is strict, and each event must be processed after the previous one.
        // The way we handle this strict order is as follows:
        // First, we start with a taskChainTail which is a dummy noop task (a DependencyCollectionTask)
        // at the head of our event chain. For each event we process, we tell analyzeTableLoad to
        // create tasks that use the taskChainTail as a dependency. Then, we collect all those tasks
        // and introduce a new barrier task (also a DependencyCollectionTask) which depends on all
        // these tasks. Then, this barrier task becomes our new taskChainTail. Thus, we get a set of
        // tasks as follows:
        //
        //          --->ev1.task1--                          --->ev2.task1--
        //         /               \                        /               \
        // evTaskRoot-->*---->ev1.task2---*--> ev1.barrierTask-->*---->ev2.task2---*->evTaskChainTail
        //         \               /
        //          --->ev1.task3--
        //
        // Once this entire chain is generated, we add evTaskRoot to rootTasks, so as to execute the
        // entire chain.
        MessageHandler.Context mhContext = new MessageHandler.Context(dbName, location, taskChainTail, eventDmd, conf, hive, context, this.log, dumpDirectory, metricCollector);
        List<Task<?>> evTasks = analyzeEventLoad(mhContext);
        if ((evTasks != null) && (!evTasks.isEmpty())) {
            ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger, metricCollector, dir.getPath().getName(), eventDmd.getDumpType().toString(), dumpDirectory);
            Task<?> barrierTask = TaskFactory.get(replStateLogWork, conf);
            AddDependencyToLeaves function = new AddDependencyToLeaves(barrierTask);
            DAGTraversal.traverse(evTasks, function);
            this.log.debug("Updated taskChainTail from {}:{} to {}:{}", taskChainTail.getClass(), taskChainTail.getId(), barrierTask.getClass(), barrierTask.getId());
            tracker.addTaskList(taskChainTail.getChildTasks());
            taskChainTail = barrierTask;
        }
        lastReplayedEvent = eventDmd.getEventTo();
    }
    if (!hasMoreWork()) {
        ReplRemoveFirstIncLoadPendFlagDesc desc = new ReplRemoveFirstIncLoadPendFlagDesc(dbName);
        Task<?> updateIncPendTask = TaskFactory.get(new DDLWork(inputs, outputs, desc, true, dumpDirectory, this.metricCollector), conf);
        taskChainTail.addDependentTask(updateIncPendTask);
        taskChainTail = updateIncPendTask;
        Map<String, String> dbProps = new HashMap<>();
        dbProps.put(ReplicationSpec.KEY.CURR_STATE_ID_SOURCE.toString(), String.valueOf(lastReplayedEvent));
        ReplStateLogWork replStateLogWork = new ReplStateLogWork(replLogger, dbProps, dumpDirectory, metricCollector, shouldFailover);
        Task<?> barrierTask = TaskFactory.get(replStateLogWork, conf);
        taskChainTail.addDependentTask(barrierTask);
        this.log.debug("Added {}:{} as a precursor of barrier task {}:{}", taskChainTail.getClass(), taskChainTail.getId(), barrierTask.getClass(), barrierTask.getId());
    }
    this.log.info("REPL_INCREMENTAL_LOAD task-builder iteration #{}, duration : {} ms", numIteration, System.currentTimeMillis() - builderStartTime);
    return evTaskRoot;
}
Also used : Path(org.apache.hadoop.fs.Path) Context(org.apache.hadoop.hive.ql.Context) Task(org.apache.hadoop.hive.ql.exec.Task) FileStatus(org.apache.hadoop.fs.FileStatus) MessageHandler(org.apache.hadoop.hive.ql.parse.repl.load.message.MessageHandler) ReplStateLogWork(org.apache.hadoop.hive.ql.exec.repl.ReplStateLogWork) HashMap(java.util.HashMap) DumpMetaData(org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData) DependencyCollectionWork(org.apache.hadoop.hive.ql.plan.DependencyCollectionWork) DDLWork(org.apache.hadoop.hive.ql.ddl.DDLWork) ReplRemoveFirstIncLoadPendFlagDesc(org.apache.hadoop.hive.ql.ddl.misc.flags.ReplRemoveFirstIncLoadPendFlagDesc) AddDependencyToLeaves(org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves)
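
The comment block in build describes the barrier-task chaining in words; the toy program below replays it concretely. TaskNode, the event names, and the chain construction are purely illustrative, but the shape matches the diagram: each event's tasks depend on the current chain tail, a barrier task depends on all of them, and the barrier becomes the new tail.

import java.util.ArrayList;
import java.util.List;

public class EventChainSketch {

    // Toy stand-in for Hive's Task; not the real class.
    static class TaskNode {
        final String name;
        final List<TaskNode> children = new ArrayList<>();
        TaskNode(String name) { this.name = name; }
        void addDependentTask(TaskNode child) { children.add(child); }
    }

    public static void main(String[] args) {
        TaskNode evTaskRoot = new TaskNode("noop-root"); // plays the role of DependencyCollectionWork
        TaskNode taskChainTail = evTaskRoot;

        String[][] events = { { "ev1.task1", "ev1.task2", "ev1.task3" }, { "ev2.task1", "ev2.task2" } };
        for (int i = 0; i < events.length; i++) {
            // Every task of this event hangs off the current tail, so nothing for event i+1
            // can start before event i has fully finished.
            List<TaskNode> evTasks = new ArrayList<>();
            for (String name : events[i]) {
                TaskNode task = new TaskNode(name);
                taskChainTail.addDependentTask(task);
                evTasks.add(task);
            }
            // A barrier task depends on all of this event's tasks and becomes the new tail;
            // in Hive this is what DAGTraversal plus AddDependencyToLeaves accomplish.
            TaskNode barrier = new TaskNode("ev" + (i + 1) + ".barrierTask");
            for (TaskNode task : evTasks) {
                task.addDependentTask(barrier);
            }
            taskChainTail = barrier;
        }
        System.out.println("chain tail is now: " + taskChainTail.name); // ev2.barrierTask
    }
}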

Aggregations

AddDependencyToLeaves (org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves) 11
Task (org.apache.hadoop.hive.ql.exec.Task) 6
Path (org.apache.hadoop.fs.Path) 5
HashMap (java.util.HashMap) 4
AlterDatabase (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadDatabase.AlterDatabase) 4
ArrayList (java.util.ArrayList) 3
Database (org.apache.hadoop.hive.metastore.api.Database) 3
LoadDatabase (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadDatabase) 3
TaskTracker (org.apache.hadoop.hive.ql.exec.repl.util.TaskTracker) 3
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException) 3
DependencyCollectionWork (org.apache.hadoop.hive.ql.plan.DependencyCollectionWork) 3
LinkedHashMap (java.util.LinkedHashMap) 2
Map (java.util.Map) 2
DDLWork (org.apache.hadoop.hive.ql.ddl.DDLWork) 2
ReplStateLogWork (org.apache.hadoop.hive.ql.exec.repl.ReplStateLogWork) 2
Hive (org.apache.hadoop.hive.ql.metadata.Hive) 2
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 2
TException (org.apache.thrift.TException) 2
FileNotFoundException (java.io.FileNotFoundException) 1
IOException (java.io.IOException) 1