
Example 1 with TaskTracker

use of org.apache.hadoop.hive.ql.exec.repl.util.TaskTracker in project hive by apache.

the class ReplLoadTask method executeBootStrapLoad.

private int executeBootStrapLoad() throws Exception {
    int maxTasks = conf.getIntVar(HiveConf.ConfVars.REPL_APPROX_MAX_LOAD_TASKS);
    Context loadContext = new Context(work.dumpDirectory, conf, getHive(), work.sessionStateLineageState, context);
    TaskTracker loadTaskTracker = new TaskTracker(maxTasks);
    BootstrapEventsIterator iterator = work.bootstrapIterator();
    addLazyDataCopyTask(loadTaskTracker, iterator.replLogger());
    /*
        For now, for simplicity, we handle just one directory (one database). Revisit support for
        multiple databases once the basic flow for chaining task creation for a single
        database (directory) is in place.
    */
    ConstraintEventsIterator constraintIterator = work.constraintsIterator();
    /*
    These trackers exist only to hold a reference during the current round of task creation. They are
    initialized with zero tasks so that they are inconsequential in any operations that compose
    task trackers.
     */
    TaskTracker dbTracker = new TaskTracker(ZERO_TASKS);
    TaskTracker tableTracker = new TaskTracker(ZERO_TASKS);
    Scope scope = new Scope();
    boolean loadingConstraint = false;
    if (!iterator.hasNext() && constraintIterator.hasNext()) {
        loadingConstraint = true;
    }
    boolean dbEventFound = false;
    while ((iterator.hasNext() || (loadingConstraint && constraintIterator.hasNext())) && loadTaskTracker.canAddMoreTasks()) {
        BootstrapEvent next;
        if (!loadingConstraint) {
            next = iterator.next();
        } else {
            next = constraintIterator.next();
        }
        switch(next.eventType()) {
            case Database:
                DatabaseEvent dbEvent = (DatabaseEvent) next;
                dbTracker = new LoadDatabase(loadContext, dbEvent, work.dbNameToLoadIn, loadTaskTracker, work.getMetricCollector()).tasks();
                loadTaskTracker.update(dbTracker);
                if (work.hasDbState()) {
                    loadTaskTracker.update(updateDatabaseLastReplID(maxTasks, loadContext, scope));
                } else {
                    // Scope might have been set to database in a previous iteration of the loop, so reset
                    // it to false if the database tracker has no tasks.
                    scope.database = false;
                }
                work.updateDbEventState(dbEvent.toState());
                if (dbTracker.hasTasks()) {
                    scope.rootTasks.addAll(dbTracker.tasks());
                    scope.database = true;
                    dbEventFound = true;
                }
                dbTracker.debugLog("database");
                break;
            case Table:
                /*
                  The implicit assumption here is that the database level is processed before the
                  table level, which depends on the iterator used: it should yield the higher-level
                  directory listing before the lower-level one. This is also required so that the
                  dbTracker and tableTracker are always set up correctly.
                */
                TableContext tableContext = new TableContext(dbTracker, work.dbNameToLoadIn);
                FSTableEvent tableEvent = (FSTableEvent) next;
                if (TableType.VIRTUAL_VIEW.name().equals(tableEvent.getMetaData().getTable().getTableType())) {
                    tableTracker = new TaskTracker(1);
                    tableTracker.addTask(createViewTask(tableEvent.getMetaData(), work.dbNameToLoadIn, conf, (new Path(work.dumpDirectory).getParent()).toString(), work.getMetricCollector()));
                } else {
                    LoadTable loadTable = new LoadTable(tableEvent, loadContext, iterator.replLogger(), tableContext, loadTaskTracker, work.getMetricCollector());
                    tableTracker = loadTable.tasks(work.isIncrementalLoad());
                }
                setUpDependencies(dbTracker, tableTracker);
                if (!scope.database && tableTracker.hasTasks()) {
                    scope.rootTasks.addAll(tableTracker.tasks());
                    scope.table = true;
                } else {
                    // Scope might have been set to table in a previous iteration of the loop, so reset
                    // it to false if the table tracker has no tasks.
                    scope.table = false;
                }
                if (!TableType.VIRTUAL_VIEW.name().equals(tableEvent.getMetaData().getTable().getTableType())) {
                    /*
                      For table replication, if we reach the maximum number of tasks, the next run will
                      try to reload the same table. This keeps the code easier to follow: it avoids
                      distinguishing between "load this table's partitions" (when creating the table hit
                      the task limit) and "load the next table" (when the current table simply has no
                      partitions).
                    */
                    // For a table we explicitly try to load partitions, as there are no separate partition events.
                    LoadPartitions loadPartitions = new LoadPartitions(loadContext, iterator.replLogger(), loadTaskTracker, tableEvent, work.dbNameToLoadIn, tableContext, work.getMetricCollector());
                    TaskTracker partitionsTracker = loadPartitions.tasks();
                    partitionsPostProcessing(iterator, scope, loadTaskTracker, tableTracker, partitionsTracker);
                    tableTracker.debugLog("table");
                    partitionsTracker.debugLog("partitions for table");
                }
                break;
            case Partition:
                /*
                  This happens only when, while loading tables, we reach the limit on the number of
                  tasks we can create; hence we know the table already exists and there is a
                  lastPartitionName.
                */
                addLoadPartitionTasks(loadContext, next, dbTracker, iterator, scope, loadTaskTracker, tableTracker);
                break;
            case Function:
                loadTaskTracker.update(addLoadFunctionTasks(loadContext, iterator, next, dbTracker, scope));
                break;
            case Constraint:
                loadTaskTracker.update(addLoadConstraintsTasks(loadContext, next, dbTracker, scope));
                break;
            default:
                break;
        }
        if (!loadingConstraint && !iterator.currentDbHasNext()) {
            createEndReplLogTask(loadContext, scope, iterator.replLogger());
        }
        if (dbEventFound && conf.getBoolVar(HiveConf.ConfVars.REPL_RETAIN_CUSTOM_LOCATIONS_FOR_DB_ON_TARGET)) {
            // Force the database creation before other events like table/partition etc., so that
            // data copy path creation can be achieved.
            LOG.info("Database event found, will be processed exclusively");
            break;
        }
    }
    boolean addAnotherLoadTask = iterator.hasNext() || loadTaskTracker.hasReplicationState() || constraintIterator.hasNext();
    if (addAnotherLoadTask) {
        createBuilderTask(scope.rootTasks);
    }
    // If the bootstrap is complete, update the last repl ID of the database.
    if (!iterator.hasNext() && !constraintIterator.hasNext() && !work.isIncrementalLoad()) {
        loadTaskTracker.update(updateDatabaseLastReplID(maxTasks, loadContext, scope));
        work.updateDbEventState(null);
    }
    if (childTasks == null) {
        childTasks = new ArrayList<>();
    }
    childTasks.addAll(scope.rootTasks);
    /*
    Since there can be multiple rounds of this run, all tied to the same query id (generated in the
    compile phase), an additional UUID is appended so that each run is printed to a separate file.
     */
    LOG.info("Root Tasks / Total Tasks : {} / {} ", childTasks.size(), loadTaskTracker.numberOfTasks());
    // Populate the driver context with the scratch dir info from the repl context, so that the
    // temp dirs will be cleaned up later
    context.getFsScratchDirs().putAll(loadContext.pathInfo.getFsScratchDirs());
    if (!HiveConf.getBoolVar(conf, REPL_DUMP_SKIP_IMMUTABLE_DATA_COPY)) {
        createReplLoadCompleteAckTask();
    }
    LOG.info("completed load task run : {}", work.executedLoadTask());
    if (conf.getBoolVar(REPL_SNAPSHOT_DIFF_FOR_EXTERNAL_TABLE_COPY)) {
        Path snapPath = SnapshotUtils.getSnapshotFileListPath(new Path(work.dumpDirectory));
        try {
            SnapshotUtils.getDFS(getExternalTableBaseDir(conf), conf).rename(new Path(snapPath, EximUtil.FILE_LIST_EXTERNAL_SNAPSHOT_CURRENT), new Path(snapPath, EximUtil.FILE_LIST_EXTERNAL_SNAPSHOT_OLD), Options.Rename.OVERWRITE);
        } catch (FileNotFoundException fnf) {
        // Ignore if no file.
        }
    }
    return 0;
}
Also used :
Context (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.util.Context)
TableContext (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.table.TableContext)
JobContext (org.apache.hadoop.mapreduce.JobContext)
TaskTracker (org.apache.hadoop.hive.ql.exec.repl.util.TaskTracker)
Path (org.apache.hadoop.fs.Path)
BootstrapEvent (org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.BootstrapEvent)
LoadPartitions (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.table.LoadPartitions)
FSTableEvent (org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.filesystem.FSTableEvent)
FileNotFoundException (java.io.FileNotFoundException)
BootstrapEventsIterator (org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.filesystem.BootstrapEventsIterator)
LoadTable (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.table.LoadTable)
LoadConstraint (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadConstraint)
ReplScope (org.apache.hadoop.hive.common.repl.ReplScope)
LoadDatabase (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadDatabase)
ConstraintEventsIterator (org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.filesystem.ConstraintEventsIterator)
DatabaseEvent (org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.DatabaseEvent)
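
The pattern worth noting in executeBootStrapLoad is the per-run task budget: REPL_APPROX_MAX_LOAD_TASKS caps how many tasks a single REPL LOAD cycle may generate, TaskTracker.canAddMoreTasks() gates the event loop, and createBuilderTask chains another load task when events remain so the next cycle resumes from the saved state. Below is a minimal, self-contained sketch of that budgeting idea; SimpleTaskTracker and the sample event strings are hypothetical stand-ins for illustration, not Hive's actual classes.

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.List;
import java.util.Queue;

// Illustrative only: a bounded tracker in the spirit of Hive's TaskTracker.
class SimpleTaskTracker {
    private final int maxTasks;
    private final List<String> tasks = new ArrayList<>();

    SimpleTaskTracker(int maxTasks) { this.maxTasks = maxTasks; }

    boolean canAddMoreTasks() { return tasks.size() < maxTasks; }
    void addTask(String task) { tasks.add(task); }
    List<String> tasks() { return tasks; }
}

public class BoundedLoadSketch {
    public static void main(String[] args) {
        Queue<String> events = new ArrayDeque<>(
                List.of("db:sales", "table:orders", "table:customers", "function:udf1"));
        SimpleTaskTracker tracker = new SimpleTaskTracker(3); // budget for this load cycle

        // Consume events only while the per-cycle budget allows more tasks.
        while (!events.isEmpty() && tracker.canAddMoreTasks()) {
            tracker.addTask("load-" + events.poll());
        }

        // If events remain, the real implementation chains another load task
        // (cf. createBuilderTask) so the next cycle resumes where this one stopped.
        boolean needAnotherCycle = !events.isEmpty();
        System.out.println("Tasks this cycle: " + tracker.tasks());
        System.out.println("Need another load cycle: " + needAnotherCycle);
    }
}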

Example 2 with TaskTracker

use of org.apache.hadoop.hive.ql.exec.repl.util.TaskTracker in project hive by apache.

the class ReplLoadTask method addLoadFunctionTasks.

private TaskTracker addLoadFunctionTasks(Context loadContext, BootstrapEventsIterator iterator, BootstrapEvent next, TaskTracker dbTracker, Scope scope) throws IOException, SemanticException {
    LoadFunction loadFunction = new LoadFunction(loadContext, iterator.replLogger(), (FunctionEvent) next, work.dbNameToLoadIn, dbTracker, (new Path(work.dumpDirectory)).getParent().toString(), work.getMetricCollector());
    TaskTracker functionsTracker = loadFunction.tasks();
    if (!scope.database) {
        scope.rootTasks.addAll(functionsTracker.tasks());
    } else {
        setUpDependencies(dbTracker, functionsTracker);
    }
    functionsTracker.debugLog("functions");
    return functionsTracker;
}
Also used : Path(org.apache.hadoop.fs.Path) TaskTracker(org.apache.hadoop.hive.ql.exec.repl.util.TaskTracker) LoadFunction(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadFunction)
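
The branch on scope.database above captures the general wiring rule in these loaders: when database-level tasks are part of the current cycle, the newly built tasks are attached after them via setUpDependencies; otherwise they are promoted to root tasks. A hedged sketch of that rule, using a hypothetical Node type in place of Hive's Task<?>:

import java.util.ArrayList;
import java.util.List;

// Hypothetical task node, standing in for Hive's Task<?>.
class Node {
    final String name;
    final List<Node> children = new ArrayList<>();
    Node(String name) { this.name = name; }
}

public class DependencyWiringSketch {
    // Either chain the new tasks under the parent tasks, or promote them to roots.
    static void wire(List<Node> rootTasks, List<Node> parentTasks, List<Node> newTasks) {
        if (parentTasks.isEmpty()) {
            rootTasks.addAll(newTasks);          // no parent in this cycle: new tasks become roots
        } else {
            for (Node parent : parentTasks) {    // cf. setUpDependencies(dbTracker, functionsTracker)
                parent.children.addAll(newTasks);
            }
        }
    }

    public static void main(String[] args) {
        List<Node> rootTasks = new ArrayList<>();
        List<Node> dbTasks = List.of(new Node("create-db"));
        List<Node> fnTasks = List.of(new Node("create-function"));
        wire(rootTasks, dbTasks, fnTasks);
        rootTasks.addAll(dbTasks);
        System.out.println(rootTasks.get(0).name + " -> " + rootTasks.get(0).children.get(0).name);
    }
}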

Example 3 with TaskTracker

use of org.apache.hadoop.hive.ql.exec.repl.util.TaskTracker in project hive by apache.

the class ReplLoadTask method addLoadPartitionTasks.

private TaskTracker addLoadPartitionTasks(Context loadContext, BootstrapEvent next, TaskTracker dbTracker, BootstrapEventsIterator iterator, Scope scope, TaskTracker loadTaskTracker, TaskTracker tableTracker) throws Exception {
    PartitionEvent event = (PartitionEvent) next;
    TableContext tableContext = new TableContext(dbTracker, work.dbNameToLoadIn);
    LoadPartitions loadPartitions = new LoadPartitions(loadContext, iterator.replLogger(), tableContext, loadTaskTracker, event.asTableEvent(), work.dbNameToLoadIn, event.lastPartitionReplicated(), work.getMetricCollector(), event.lastPartSpecReplicated(), event.lastStageReplicated());
    /*
             the tableTracker here should be a new instance and not an existing one as this can
             only happen when we break in between loading partitions.
         */
    TaskTracker partitionsTracker = loadPartitions.tasks();
    partitionsPostProcessing(iterator, scope, loadTaskTracker, tableTracker, partitionsTracker);
    partitionsTracker.debugLog("partitions");
    return partitionsTracker;
}
Also used : TaskTracker(org.apache.hadoop.hive.ql.exec.repl.util.TaskTracker) TableContext(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.table.TableContext) LoadPartitions(org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.table.LoadPartitions) PartitionEvent(org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.PartitionEvent)
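
A Partition event is only encountered when a previous cycle exhausted its task budget while loading a table's partitions, so the loader resumes from the last partition that was replicated (lastPartitionReplicated). The following is a minimal sketch of that resume-from-checkpoint idea; the partition names and budget are illustrative, not Hive's API:

import java.util.List;

public class PartitionResumeSketch {
    public static void main(String[] args) {
        List<String> partitions = List.of("p=2021", "p=2022", "p=2023", "p=2024");
        String lastReplicated = "p=2022";   // checkpoint saved by the previous cycle
        int budget = 2;                     // tasks this cycle may still create

        int start = partitions.indexOf(lastReplicated) + 1;  // resume just after the checkpoint
        for (int i = start; i < partitions.size() && budget > 0; i++, budget--) {
            System.out.println("schedule load for partition " + partitions.get(i));
        }
        // Any remaining partitions are picked up by the next cycle, again via the checkpoint.
    }
}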

Example 4 with TaskTracker

use of org.apache.hadoop.hive.ql.exec.repl.util.TaskTracker in project hive by apache.

the class ReplLoadTask method executeIncrementalLoad.

private int executeIncrementalLoad(long loadStartTime) throws Exception {
    // If the replication scope was modified, drop the tables that are excluded in the new replication policy.
    if (work.replScopeModified) {
        dropTablesExcludedInReplScope(work.currentReplScope);
    }
    Database targetDb = getHive().getDatabase(work.dbNameToLoadIn);
    Map<String, String> props = new HashMap<>();
    // Check if it is an optimised bootstrap failover.
    if (work.isFirstFailover) {
        // The database must not already be marked as a target of replication, and it must have been a source of replication.
        if (MetaStoreUtils.isTargetOfReplication(targetDb)) {
            LOG.error("The database {} is already marked as target for replication", targetDb.getName());
            throw new Exception("Failover target is already marked as target");
        }
        if (!ReplChangeManager.isSourceOfReplication(targetDb)) {
            LOG.error("The database {} is already source of replication.", targetDb.getName());
            throw new Exception("Failover target was not source of replication");
        }
        boolean isTableDiffPresent = checkFileExists(new Path(work.dumpDirectory).getParent(), conf, TABLE_DIFF_COMPLETE_DIRECTORY);
        Long eventId = Long.parseLong(getEventIdFromFile(new Path(work.dumpDirectory).getParent(), conf)[0]);
        if (!isTableDiffPresent) {
            prepareTableDiffFile(eventId, getHive(), work, conf);
            if (this.childTasks == null) {
                this.childTasks = new ArrayList<>();
            }
            createReplLoadCompleteAckTask();
            return 0;
        }
    } else if (work.isSecondFailover) {
        // DROP the tables to be bootstrapped.
        Hive db = getHive();
        for (String table : work.tablesToBootstrap) {
            db.dropTable(work.dbNameToLoadIn + "." + table, true);
        }
    }
    if (!MetaStoreUtils.isTargetOfReplication(targetDb)) {
        props.put(ReplConst.TARGET_OF_REPLICATION, ReplConst.TRUE);
    }
    if (!work.shouldFailover() && MetaStoreUtils.isDbBeingFailedOver(targetDb)) {
        props.put(ReplConst.REPL_FAILOVER_ENDPOINT, "");
    }
    if (!props.isEmpty()) {
        AlterDatabaseSetPropertiesDesc setTargetDesc = new AlterDatabaseSetPropertiesDesc(work.dbNameToLoadIn, props, null);
        Task<?> addReplTargetPropTask = TaskFactory.get(new DDLWork(new HashSet<>(), new HashSet<>(), setTargetDesc, true, work.dumpDirectory, work.getMetricCollector()), conf);
        if (this.childTasks == null) {
            this.childTasks = new ArrayList<>();
        }
        this.childTasks.add(addReplTargetPropTask);
    }
    IncrementalLoadTasksBuilder builder = work.incrementalLoadTasksBuilder();
    // If the incremental events have already been applied, check whether any tables need to be bootstrapped and, if so, switch to bootstrap load.
    if (!builder.hasMoreWork() && work.isLastReplIDUpdated()) {
        if (work.hasBootstrapLoadTasks()) {
            LOG.debug("Current incremental dump have tables to be bootstrapped. Switching to bootstrap " + "mode after applying all events.");
            return executeBootStrapLoad();
        }
    }
    List<Task<?>> childTasks = new ArrayList<>();
    int maxTasks = conf.getIntVar(HiveConf.ConfVars.REPL_APPROX_MAX_LOAD_TASKS);
    TaskTracker tracker = new TaskTracker(maxTasks);
    addLazyDataCopyTask(tracker, builder.getReplLogger());
    childTasks.add(builder.build(context, getHive(), LOG, tracker));
    // Update the database's last repl id to the last event in this dump, so that the next
    // incremental cycle won't consider the events in this dump again if it starts from this id.
    if (!builder.hasMoreWork()) {
        // The name of the database to load into is either specified directly in the REPL LOAD
        // command (when dbNameToLoadIn holds a valid database name) or is available through the
        // dump metadata during table-level replication.
        String dbName = work.dbNameToLoadIn;
        if (dbName == null || StringUtils.isBlank(dbName)) {
            if (work.currentReplScope != null) {
                String replScopeDbName = work.currentReplScope.getDbName();
                if (replScopeDbName != null && !"*".equals(replScopeDbName)) {
                    dbName = replScopeDbName;
                }
            }
        }
        // The last repl id is updated only when a single target database name could be resolved.
        if (StringUtils.isNotBlank(dbName)) {
            String lastEventid = builder.eventTo().toString();
            Map<String, String> mapProp = new HashMap<>();
            mapProp.put(ReplicationSpec.KEY.CURR_STATE_ID_SOURCE.toString(), lastEventid);
            AlterDatabaseSetPropertiesDesc alterDbDesc = new AlterDatabaseSetPropertiesDesc(dbName, mapProp, new ReplicationSpec(lastEventid, lastEventid));
            Task<?> updateReplIdTask = TaskFactory.get(new DDLWork(new HashSet<>(), new HashSet<>(), alterDbDesc, true, (new Path(work.dumpDirectory).getParent()).toString(), work.getMetricCollector()), conf);
            DAGTraversal.traverse(childTasks, new AddDependencyToLeaves(updateReplIdTask));
            work.setLastReplIDUpdated(true);
            LOG.debug("Added task to set last repl id of db " + dbName + " to " + lastEventid);
        }
    }
    // Once all the incremental events are applied, enable bootstrap of tables if any exist.
    if (builder.hasMoreWork() || work.hasBootstrapLoadTasks()) {
        DAGTraversal.traverse(childTasks, new AddDependencyToLeaves(TaskFactory.get(work, conf)));
    }
    if (this.childTasks == null) {
        this.childTasks = new ArrayList<>();
    }
    this.childTasks.addAll(childTasks);
    createReplLoadCompleteAckTask();
    // Clean-up snapshots
    if (conf.getBoolVar(REPL_SNAPSHOT_DIFF_FOR_EXTERNAL_TABLE_COPY)) {
        cleanupSnapshots(new Path(work.getDumpDirectory()).getParent().getParent().getParent(), work.getSourceDbName().toLowerCase(), conf, null, true);
    }
    // Pass the current time at the end of the repl-load stage as the starting time of the first event.
    long currentTimestamp = System.currentTimeMillis();
    ((IncrementalLoadLogger) work.incrementalLoadTasksBuilder().getReplLogger()).initiateEventTimestamp(currentTimestamp);
    LOG.info("REPL_INCREMENTAL_LOAD stage duration : {} ms", currentTimestamp - loadStartTime);
    return 0;
}
Also used :
Path (org.apache.hadoop.fs.Path)
TaskTracker (org.apache.hadoop.hive.ql.exec.repl.util.TaskTracker)
Task (org.apache.hadoop.hive.ql.exec.Task)
ReplicationSpec (org.apache.hadoop.hive.ql.parse.ReplicationSpec)
HashMap (java.util.HashMap)
LinkedHashMap (java.util.LinkedHashMap)
ArrayList (java.util.ArrayList)
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException)
FileNotFoundException (java.io.FileNotFoundException)
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)
TException (org.apache.thrift.TException)
IOException (java.io.IOException)
LoadConstraint (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadConstraint)
IncrementalLoadLogger (org.apache.hadoop.hive.ql.parse.repl.load.log.IncrementalLoadLogger)
Hive (org.apache.hadoop.hive.ql.metadata.Hive)
IncrementalLoadTasksBuilder (org.apache.hadoop.hive.ql.exec.repl.incremental.IncrementalLoadTasksBuilder)
DDLWork (org.apache.hadoop.hive.ql.ddl.DDLWork)
LoadDatabase (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadDatabase)
Database (org.apache.hadoop.hive.metastore.api.Database)
AlterDatabase (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadDatabase.AlterDatabase)
AlterDatabaseSetPropertiesDesc (org.apache.hadoop.hive.ql.ddl.database.alter.poperties.AlterDatabaseSetPropertiesDesc)
HashSet (java.util.HashSet)
AddDependencyToLeaves (org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves)
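
Both the repl-id update task and the follow-up bootstrap task are appended with DAGTraversal.traverse(childTasks, new AddDependencyToLeaves(...)), i.e. the new task is hung off every leaf of the existing task DAG so it runs only after everything already scheduled. A small sketch of that leaf-append idea, with a hypothetical TaskNode type rather than Hive's Task<?>:

import java.util.ArrayList;
import java.util.List;

// Hypothetical DAG node; Hive's real classes are Task<?>, DAGTraversal and AddDependencyToLeaves.
class TaskNode {
    final String name;
    final List<TaskNode> children = new ArrayList<>();
    TaskNode(String name) { this.name = name; }
}

public class AddToLeavesSketch {
    // Depth-first walk: attach 'tail' to every node that currently has no children.
    static void addDependencyToLeaves(List<TaskNode> roots, TaskNode tail) {
        for (TaskNode node : roots) {
            if (node.children.isEmpty()) {
                node.children.add(tail);
            } else {
                addDependencyToLeaves(node.children, tail);
            }
        }
    }

    public static void main(String[] args) {
        TaskNode first = new TaskNode("apply-event-1");
        TaskNode second = new TaskNode("apply-event-2");
        first.children.add(second);
        List<TaskNode> roots = new ArrayList<>(List.of(first));

        addDependencyToLeaves(roots, new TaskNode("update-last-repl-id"));
        System.out.println(second.children.get(0).name);  // runs only after the last event task
    }
}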

Example 5 with TaskTracker

use of org.apache.hadoop.hive.ql.exec.repl.util.TaskTracker in project hive by apache.

the class ReplDumpTask method initiateDataCopyTasks.

private void initiateDataCopyTasks() throws HiveException, IOException {
    TaskTracker taskTracker = new TaskTracker(conf.getIntVar(HiveConf.ConfVars.REPL_APPROX_MAX_LOAD_TASKS));
    if (childTasks == null) {
        childTasks = new ArrayList<>();
    }
    List<Task<?>> externalTableCopyTasks = work.externalTableCopyTasks(taskTracker, conf);
    childTasks.addAll(externalTableCopyTasks);
    LOG.debug("Scheduled {} external table copy tasks", externalTableCopyTasks.size());
    // If external table data copy tasks are present, add a task to mark the end of data copy.
    if (!externalTableCopyTasks.isEmpty() && !work.getExternalTblCopyPathIterator().hasNext()) {
        ReplUtils.addLoggerTask(work.getReplLogger(), childTasks, conf);
    }
    childTasks.addAll(work.managedTableCopyTasks(taskTracker, conf));
    childTasks.addAll(work.functionsBinariesCopyTasks(taskTracker, conf));
    if (childTasks.isEmpty()) {
        // All table data copy work finished.
        finishRemainingTasks();
    } else {
        DAGTraversal.traverse(childTasks, new AddDependencyToLeaves(TaskFactory.get(work, conf)));
    }
}
Also used : TaskTracker(org.apache.hadoop.hive.ql.exec.repl.util.TaskTracker) Task(org.apache.hadoop.hive.ql.exec.Task) AddDependencyToLeaves(org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves)
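
Note that a single TaskTracker budget is shared across the three producers here (external-table copies, managed-table copies, function binaries), so the combined number of copy tasks in one cycle stays within REPL_APPROX_MAX_LOAD_TASKS; anything left over is handled by the follow-up task chained onto the leaves. A hedged sketch of sharing one budget across several producers, with hypothetical inputs:

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

public class SharedBudgetSketch {
    // Drain an iterator into copy tasks, but never past the remaining shared budget.
    static int drain(Iterator<String> source, List<String> tasks, int remaining) {
        while (remaining > 0 && source.hasNext()) {
            tasks.add("copy-" + source.next());
            remaining--;
        }
        return remaining;
    }

    public static void main(String[] args) {
        List<String> tasks = new ArrayList<>();
        int budget = 4; // shared across all producers, cf. one TaskTracker instance

        budget = drain(List.of("ext1", "ext2").iterator(), tasks, budget);
        budget = drain(List.of("managed1", "managed2", "managed3").iterator(), tasks, budget);
        budget = drain(List.of("udf-jar").iterator(), tasks, budget);

        System.out.println(tasks);                          // only 4 copy tasks scheduled this cycle
        System.out.println("exhausted=" + (budget == 0));   // leftovers go to a chained follow-up task
    }
}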

Aggregations

TaskTracker (org.apache.hadoop.hive.ql.exec.repl.util.TaskTracker) 8
Path (org.apache.hadoop.fs.Path) 4
LoadConstraint (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadConstraint) 3
AddDependencyToLeaves (org.apache.hadoop.hive.ql.exec.repl.util.AddDependencyToLeaves) 3
FileNotFoundException (java.io.FileNotFoundException) 2
Task (org.apache.hadoop.hive.ql.exec.Task) 2
LoadDatabase (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadDatabase) 2
AlterDatabase (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.LoadDatabase.AlterDatabase) 2
LoadPartitions (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.table.LoadPartitions) 2
TableContext (org.apache.hadoop.hive.ql.exec.repl.bootstrap.load.table.TableContext) 2
IOException (java.io.IOException) 1
ArrayList (java.util.ArrayList) 1
HashMap (java.util.HashMap) 1
HashSet (java.util.HashSet) 1
LinkedHashMap (java.util.LinkedHashMap) 1
ReplScope (org.apache.hadoop.hive.common.repl.ReplScope) 1
Database (org.apache.hadoop.hive.metastore.api.Database) 1
DDLWork (org.apache.hadoop.hive.ql.ddl.DDLWork) 1
AlterDatabaseSetPropertiesDesc (org.apache.hadoop.hive.ql.ddl.database.alter.poperties.AlterDatabaseSetPropertiesDesc) 1
BootstrapEvent (org.apache.hadoop.hive.ql.exec.repl.bootstrap.events.BootstrapEvent) 1