
Example 1 with FileList

Use of org.apache.hadoop.hive.ql.exec.repl.util.FileList in project hive by apache.

From the class ReplDumpTask, the method incrementalDump.

private Long incrementalDump(Path dumpRoot, DumpMetaData dmd, Path cmRoot, Hive hiveDb) throws Exception {
    // get list of events matching dbPattern & tblPattern
    Long lastReplId;
    // go through each event, and dump out each event to an event-level dump dir inside dumpRoot
    String validTxnList = null;
    long waitUntilTime = 0;
    long bootDumpBeginReplId = -1;
    List<String> tableList = work.replScope.includeAllTables() ? null : new ArrayList<>();
    SnapshotUtils.ReplSnapshotCount snapshotCount = null;
    // If ACID tables are being bootstrapped as part of this incremental dump, record the open-txn
    // timeout deadline now and wait only for the remaining time, if any, when the bootstrap phase runs.
    if (needBootstrapAcidTablesDuringIncrementalDump()) {
        work.setBootstrap(true);
        bootDumpBeginReplId = queryState.getConf().getLong(ReplUtils.LAST_REPL_ID_KEY, -1L);
        assert (bootDumpBeginReplId >= 0);
        LOG.info("Dump for bootstrapping ACID tables during an incremental dump for db {}", work.dbNameOrPattern);
        long timeoutInMs = HiveConf.getTimeVar(conf, HiveConf.ConfVars.REPL_BOOTSTRAP_DUMP_OPEN_TXN_TIMEOUT, TimeUnit.MILLISECONDS);
        waitUntilTime = System.currentTimeMillis() + timeoutInMs;
    }
    // TODO : instead of simply restricting by message format, we should eventually
    // move to a jdbc-driver-style registering of message format, and picking a message
    // factory per event to decode. For now, however, since all messages have the
    // same factory, restricting by message format is effectively a guard against
    // older leftover data that would cause us problems.
    String dbName = work.dbNameOrPattern;
    Database db = hiveDb.getDatabase(dbName);
    if (!HiveConf.getBoolVar(conf, REPL_DUMP_METADATA_ONLY)) {
        setReplSourceFor(hiveDb, dbName, db);
    }
    if (shouldFailover()) {
        if (!MetaStoreUtils.isDbBeingFailedOver(db)) {
            setReplFailoverEnabledAtSource(db);
        }
        fetchFailoverMetadata(hiveDb);
        assert work.getFailoverMetadata().isValidMetadata();
        work.overrideLastEventToDump(hiveDb, bootDumpBeginReplId, work.getFailoverMetadata().getFailoverEventId());
    } else {
        work.overrideLastEventToDump(hiveDb, bootDumpBeginReplId, -1);
    }
    IMetaStoreClient.NotificationFilter evFilter = new AndFilter(new ReplEventFilter(work.replScope), new CatalogFilter(MetaStoreUtils.getDefaultCatalog(conf)), new EventBoundaryFilter(work.eventFrom, work.eventTo));
    EventUtils.MSClientNotificationFetcher evFetcher = new EventUtils.MSClientNotificationFetcher(hiveDb);
    int maxEventLimit = getMaxEventAllowed(work.maxEventLimit());
    EventUtils.NotificationEventIterator evIter = new EventUtils.NotificationEventIterator(evFetcher, work.eventFrom, maxEventLimit, evFilter);
    lastReplId = work.eventTo;
    Path ackFile = new Path(dumpRoot, ReplAck.EVENTS_DUMP.toString());
    long resumeFrom = Utils.fileExists(ackFile, conf) ? getResumeFrom(ackFile) : work.eventFrom;
    long estimatedNumEvents = evFetcher.getDbNotificationEventsCount(work.eventFrom, dbName, work.eventTo, maxEventLimit);
    try {
        IncrementalDumpLogger replLogger = new IncrementalDumpLogger(dbName, dumpRoot.toString(), estimatedNumEvents, work.eventFrom, work.eventTo, maxEventLimit);
        work.setReplLogger(replLogger);
        replLogger.startLog();
        Map<String, Long> metricMap = new HashMap<>();
        metricMap.put(ReplUtils.MetricName.EVENTS.name(), estimatedNumEvents);
        if (conf.getBoolVar(HiveConf.ConfVars.HIVE_REPL_FAILOVER_START)) {
            work.getMetricCollector().reportFailoverStart(getName(), metricMap, work.getFailoverMetadata());
        } else {
            work.getMetricCollector().reportStageStart(getName(), metricMap);
        }
        long dumpedCount = resumeFrom - work.eventFrom;
        if (dumpedCount > 0) {
            LOG.info("Event id {} to {} are already dumped, skipping {} events", work.eventFrom, resumeFrom, dumpedCount);
        }
        cleanFailedEventDirIfExists(dumpRoot, resumeFrom);
        while (evIter.hasNext()) {
            NotificationEvent ev = evIter.next();
            lastReplId = ev.getEventId();
            if (ev.getEventId() <= resumeFrom) {
                continue;
            }
            // disable materialized-view replication if not configured
            if (!isMaterializedViewsReplEnabled()) {
                String tblName = ev.getTableName();
                if (tblName != null) {
                    try {
                        Table table = hiveDb.getTable(dbName, tblName);
                        if (table != null && TableType.MATERIALIZED_VIEW.equals(table.getTableType())) {
                            LOG.info("Attempt to dump materialized view : " + tblName);
                            continue;
                        }
                    } catch (InvalidTableException te) {
                        LOG.debug(te.getMessage());
                    }
                }
            }
            Path evRoot = new Path(dumpRoot, String.valueOf(lastReplId));
            dumpEvent(ev, evRoot, dumpRoot, cmRoot, hiveDb);
            Utils.writeOutput(String.valueOf(lastReplId), ackFile, conf);
        }
        replLogger.endLog(lastReplId.toString());
        LOG.info("Done dumping events, preparing to return {},{}", dumpRoot.toUri(), lastReplId);
    } finally {
        // write the dmd always irrespective of success/failure to enable checkpointing in table level replication
        long executionId = conf.getLong(Constants.SCHEDULED_QUERY_EXECUTIONID, 0L);
        dmd.setDump(DumpType.INCREMENTAL, work.eventFrom, lastReplId, cmRoot, executionId, previousReplScopeModified());
        // If repl policy is changed (oldReplScope is set), then pass the current replication policy,
        // so that REPL LOAD would drop the tables which are not included in current policy.
        dmd.setReplScope(work.replScope);
        dmd.write(true);
    }
    // Get snapshot related configurations for external data copy.
    boolean isSnapshotEnabled = conf.getBoolVar(REPL_SNAPSHOT_DIFF_FOR_EXTERNAL_TABLE_COPY);
    String snapshotPrefix = dbName.toLowerCase();
    ArrayList<String> prevSnaps = new ArrayList<>();
    try (FileList managedTblList = createTableFileList(dumpRoot, EximUtil.FILE_LIST, conf);
        FileList extTableFileList = createTableFileList(dumpRoot, EximUtil.FILE_LIST_EXTERNAL, conf);
        FileList snapPathFileList = isSnapshotEnabled ? createTableFileList(SnapshotUtils.getSnapshotFileListPath(dumpRoot), EximUtil.FILE_LIST_EXTERNAL_SNAPSHOT_CURRENT, conf) : null) {
        // Examine all the tables if required.
        if (shouldExamineTablesToDump() || (tableList != null)) {
            // If required wait more for any transactions open at the time of starting the ACID bootstrap.
            if (needBootstrapAcidTablesDuringIncrementalDump()) {
                assert (waitUntilTime > 0);
                validTxnList = getValidTxnListForReplDump(hiveDb, waitUntilTime);
            }
            /* When the same dump dir is resumed because of check-pointing, we need to clear the existing metadata.
               We need to rewrite the metadata as the write id list will be changed.
               We can't reuse the previous write id as it might be invalid due to compaction. */
            Path bootstrapRoot = new Path(dumpRoot, ReplUtils.INC_BOOTSTRAP_ROOT_DIR_NAME);
            Path metadataPath = new Path(bootstrapRoot, EximUtil.METADATA_PATH_NAME);
            FileSystem fs = FileSystem.get(metadataPath.toUri(), conf);
            try {
                fs.delete(metadataPath, true);
            } catch (FileNotFoundException e) {
            // no worries
            }
            Path dbRootMetadata = new Path(metadataPath, dbName);
            Path dbRootData = new Path(bootstrapRoot, EximUtil.DATA_PATH_NAME + File.separator + dbName);
            boolean dataCopyAtLoad = conf.getBoolVar(HiveConf.ConfVars.REPL_RUN_DATA_COPY_TASKS_ON_TARGET);
            ReplExternalTables externalTablesWriter = new ReplExternalTables(conf);
            boolean isSingleTaskForExternalDb = conf.getBoolVar(REPL_EXTERNAL_WAREHOUSE_SINGLE_COPY_TASK) && work.replScope.includeAllTables();
            HashMap<String, Boolean> singleCopyPaths = getNonTableLevelCopyPaths(db, isSingleTaskForExternalDb);
            boolean isExternalTablePresent = false;
            if (isSnapshotEnabled) {
                snapshotCount = new SnapshotUtils.ReplSnapshotCount();
                if (snapPathFileList.hasNext()) {
                    prevSnaps = getListFromFileList(snapPathFileList);
                }
            }
            for (String matchedDbName : Utils.matchesDb(hiveDb, work.dbNameOrPattern)) {
                for (String tableName : Utils.matchesTbl(hiveDb, matchedDbName, work.replScope)) {
                    try {
                        Table table = hiveDb.getTable(matchedDbName, tableName);
                        // Dump external table locations if required.
                        if (TableType.EXTERNAL_TABLE.equals(table.getTableType()) && shouldDumpExternalTableLocation(conf)) {
                            externalTablesWriter.dataLocationDump(table, extTableFileList, singleCopyPaths, !isSingleTaskForExternalDb, conf);
                            isExternalTablePresent = true;
                        }
                        // Dump the table to be bootstrapped if required.
                        if (shouldBootstrapDumpTable(table)) {
                            HiveWrapper.Tuple<Table> tableTuple = new HiveWrapper(hiveDb, matchedDbName).table(table);
                            dumpTable(matchedDbName, tableName, validTxnList, dbRootMetadata, dbRootData, bootDumpBeginReplId, hiveDb, tableTuple, managedTblList, dataCopyAtLoad);
                        }
                        if (tableList != null && isTableSatifiesConfig(table)) {
                            tableList.add(tableName);
                        }
                    } catch (InvalidTableException te) {
                        // Repl dump shouldn't fail if the table is dropped/renamed while dumping it.
                        // Just log a debug message and skip it.
                        LOG.debug(te.getMessage());
                    }
                }
                // Dump the non-table-level copy paths: the database default location and the paths configured.
                if (isExternalTablePresent && shouldDumpExternalTableLocation(conf) && isSingleTaskForExternalDb) {
                    externalTablesWriter.dumpNonTableLevelCopyPaths(singleCopyPaths, extTableFileList, conf, isSnapshotEnabled, snapshotPrefix, snapshotCount, snapPathFileList, prevSnaps, false);
                }
            }
            dumpTableListToDumpLocation(tableList, dumpRoot, dbName, conf);
        }
        setDataCopyIterators(extTableFileList, managedTblList);
        work.getMetricCollector().reportStageEnd(getName(), Status.SUCCESS, lastReplId, snapshotCount, null);
        // Clean-up snapshots
        if (isSnapshotEnabled) {
            cleanupSnapshots(SnapshotUtils.getSnapshotFileListPath(dumpRoot), work.dbNameOrPattern.toLowerCase(), conf, snapshotCount, false);
        }
        return lastReplId;
    }
}
Also used : EventBoundaryFilter(org.apache.hadoop.hive.metastore.messaging.event.filters.EventBoundaryFilter) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) HiveWrapper(org.apache.hadoop.hive.ql.parse.repl.dump.HiveWrapper) ArrayList(java.util.ArrayList) FileNotFoundException(java.io.FileNotFoundException) ReplChangeManager.getReplPolicyIdString(org.apache.hadoop.hive.metastore.ReplChangeManager.getReplPolicyIdString) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) SnapshotUtils(org.apache.hadoop.hive.ql.exec.repl.util.SnapshotUtils) InvalidTableException(org.apache.hadoop.hive.ql.metadata.InvalidTableException) FileSystem(org.apache.hadoop.fs.FileSystem) OptimisedBootstrapUtils.getReplEventIdFromDatabase(org.apache.hadoop.hive.ql.exec.repl.OptimisedBootstrapUtils.getReplEventIdFromDatabase) Database(org.apache.hadoop.hive.metastore.api.Database) Path(org.apache.hadoop.fs.Path) Table(org.apache.hadoop.hive.ql.metadata.Table) FileList(org.apache.hadoop.hive.ql.exec.repl.util.FileList) SnapshotUtils.getListFromFileList(org.apache.hadoop.hive.ql.exec.repl.util.SnapshotUtils.getListFromFileList) EventUtils(org.apache.hadoop.hive.ql.metadata.events.EventUtils) NotificationEvent(org.apache.hadoop.hive.metastore.api.NotificationEvent) ReplEventFilter(org.apache.hadoop.hive.metastore.messaging.event.filters.ReplEventFilter) AndFilter(org.apache.hadoop.hive.metastore.messaging.event.filters.AndFilter) CatalogFilter(org.apache.hadoop.hive.metastore.messaging.event.filters.CatalogFilter) IncrementalDumpLogger(org.apache.hadoop.hive.ql.parse.repl.dump.log.IncrementalDumpLogger)

Example 2 with FileList

Use of org.apache.hadoop.hive.ql.exec.repl.util.FileList in project hive by apache.

From the class ReplDumpTask, the method bootStrapDump.

Long bootStrapDump(Path dumpRoot, DumpMetaData dmd, Path cmRoot, Hive hiveDb) throws Exception {
    // bootstrap case
    // Last repl id would've been captured during compile phase in queryState configs before opening txn.
    // This is needed as we dump data on ACID/MM tables based on read snapshot or else we may lose data from
    // concurrent txns when bootstrap dump in progress. If it is not available, then get it from metastore.
    Long bootDumpBeginReplId = queryState.getConf().getLong(ReplUtils.LAST_REPL_ID_KEY, -1L);
    assert (bootDumpBeginReplId >= 0L);
    List<String> tableList;
    SnapshotUtils.ReplSnapshotCount replSnapshotCount = null;
    LOG.info("Bootstrap Dump for db {}", work.dbNameOrPattern);
    long timeoutInMs = HiveConf.getTimeVar(conf, HiveConf.ConfVars.REPL_BOOTSTRAP_DUMP_OPEN_TXN_TIMEOUT, TimeUnit.MILLISECONDS);
    long waitUntilTime = System.currentTimeMillis() + timeoutInMs;
    String validTxnList = getValidTxnListForReplDump(hiveDb, waitUntilTime);
    Path metadataPath = new Path(dumpRoot, EximUtil.METADATA_PATH_NAME);
    if (shouldResumePreviousDump(dmd)) {
        // clear the metadata. We need to rewrite the metadata as the write id list will be changed
        // We can't reuse the previous write id as it might be invalid due to compaction
        metadataPath.getFileSystem(conf).delete(metadataPath, true);
    }
    List<EximUtil.DataCopyPath> functionsBinaryCopyPaths = Collections.emptyList();
    boolean isSnapshotEnabled = conf.getBoolVar(REPL_SNAPSHOT_DIFF_FOR_EXTERNAL_TABLE_COPY);
    // Create SnapPathFileList only if snapshots are enabled.
    try (FileList managedTblList = createTableFileList(dumpRoot, EximUtil.FILE_LIST, conf);
        FileList extTableFileList = createTableFileList(dumpRoot, EximUtil.FILE_LIST_EXTERNAL, conf);
        FileList snapPathFileList = isSnapshotEnabled ? createTableFileList(SnapshotUtils.getSnapshotFileListPath(dumpRoot), EximUtil.FILE_LIST_EXTERNAL_SNAPSHOT_CURRENT, conf) : null) {
        for (String dbName : Utils.matchesDb(hiveDb, work.dbNameOrPattern)) {
            LOG.debug("Dumping db: " + dbName);
            // TODO : Currently we don't support separate table list for each database.
            tableList = work.replScope.includeAllTables() ? null : new ArrayList<>();
            Database db = hiveDb.getDatabase(dbName);
            if ((db != null) && (ReplUtils.isFirstIncPending(db.getParameters()))) {
                // A replicated database with its first incremental dump still pending is not yet
                // in a consistent state. Avoid allowing replicating this database to a new target.
                throw new HiveException("Replication dump not allowed for replicated database" + " with first incremental dump pending : " + dbName);
            }
            if (db != null && !HiveConf.getBoolVar(conf, REPL_DUMP_METADATA_ONLY)) {
                setReplSourceFor(hiveDb, dbName, db);
            }
            int estimatedNumTables = Utils.getAllTables(hiveDb, dbName, work.replScope).size();
            int estimatedNumFunctions = hiveDb.getFunctions(dbName, "*").size();
            BootstrapDumpLogger replLogger = new BootstrapDumpLogger(dbName, dumpRoot.toString(), estimatedNumTables, estimatedNumFunctions);
            work.setReplLogger(replLogger);
            replLogger.startLog();
            Map<String, Long> metricMap = new HashMap<>();
            metricMap.put(ReplUtils.MetricName.TABLES.name(), (long) estimatedNumTables);
            metricMap.put(ReplUtils.MetricName.FUNCTIONS.name(), (long) estimatedNumFunctions);
            work.getMetricCollector().reportStageStart(getName(), metricMap);
            Path dbRoot = dumpDbMetadata(dbName, metadataPath, bootDumpBeginReplId, hiveDb);
            Path dbDataRoot = new Path(new Path(dumpRoot, EximUtil.DATA_PATH_NAME), dbName);
            boolean dataCopyAtLoad = conf.getBoolVar(HiveConf.ConfVars.REPL_RUN_DATA_COPY_TASKS_ON_TARGET);
            functionsBinaryCopyPaths = dumpFunctionMetadata(dbName, dbRoot, dbDataRoot, hiveDb, dataCopyAtLoad);
            String uniqueKey = Utils.setDbBootstrapDumpState(hiveDb, dbName);
            Exception caught = null;
            try {
                ReplExternalTables externalTablesWriter = new ReplExternalTables(conf);
                boolean isSingleTaskForExternalDb = conf.getBoolVar(REPL_EXTERNAL_WAREHOUSE_SINGLE_COPY_TASK) && work.replScope.includeAllTables();
                // Generate snapshot related configurations for external table data copy.
                HashMap<String, Boolean> singleCopyPaths = getNonTableLevelCopyPaths(db, isSingleTaskForExternalDb);
                boolean isExternalTablePresent = false;
                String snapshotPrefix = dbName.toLowerCase();
                // Will stay empty in case of bootstrap
                ArrayList<String> prevSnaps = new ArrayList<>();
                if (isSnapshotEnabled) {
                    // Delete any old existing snapshot file; we always start fresh in case of bootstrap.
                    FileUtils.deleteIfExists(getDFS(SnapshotUtils.getSnapshotFileListPath(dumpRoot), conf), new Path(SnapshotUtils.getSnapshotFileListPath(dumpRoot), EximUtil.FILE_LIST_EXTERNAL_SNAPSHOT_CURRENT));
                    FileUtils.deleteIfExists(getDFS(SnapshotUtils.getSnapshotFileListPath(dumpRoot), conf), new Path(SnapshotUtils.getSnapshotFileListPath(dumpRoot), EximUtil.FILE_LIST_EXTERNAL_SNAPSHOT_OLD));
                    // Get the counter to store the snapshots created & deleted at source.
                    replSnapshotCount = new SnapshotUtils.ReplSnapshotCount();
                }
                for (String tblName : Utils.matchesTbl(hiveDb, dbName, work.replScope)) {
                    Table table = null;
                    try {
                        HiveWrapper.Tuple<Table> tableTuple = new HiveWrapper(hiveDb, dbName).table(tblName, conf);
                        table = tableTuple != null ? tableTuple.object : null;
                        // disable materialized-view replication if not configured
                        if (tableTuple != null && !isMaterializedViewsReplEnabled() && TableType.MATERIALIZED_VIEW.equals(tableTuple.object.getTableType())) {
                            LOG.info("Attempt to dump materialized view : " + tblName);
                            continue;
                        }
                        LOG.debug("Dumping table: " + tblName + " to db root " + dbRoot.toUri());
                        if (shouldDumpExternalTableLocation(conf) && TableType.EXTERNAL_TABLE.equals(tableTuple.object.getTableType())) {
                            LOG.debug("Adding table {} to external tables list", tblName);
                            externalTablesWriter.dataLocationDump(tableTuple.object, extTableFileList, singleCopyPaths, !isSingleTaskForExternalDb, conf);
                            isExternalTablePresent = true;
                        }
                        dumpTable(dbName, tblName, validTxnList, dbRoot, dbDataRoot, bootDumpBeginReplId, hiveDb, tableTuple, managedTblList, dataCopyAtLoad);
                    } catch (InvalidTableException te) {
                        // Bootstrap dump shouldn't fail if the table is dropped/renamed while dumping it.
                        // Just log a debug message and skip it.
                        LOG.debug(te.getMessage());
                    }
                    dumpConstraintMetadata(dbName, tblName, dbRoot, hiveDb, table != null ? table.getTTable().getId() : -1);
                    if (tableList != null && isTableSatifiesConfig(table)) {
                        tableList.add(tblName);
                    }
                }
                // Dump the non-table-level copy paths: the database default location and the configured paths for external tables.
                if (isExternalTablePresent && shouldDumpExternalTableLocation(conf) && isSingleTaskForExternalDb) {
                    externalTablesWriter.dumpNonTableLevelCopyPaths(singleCopyPaths, extTableFileList, conf, isSnapshotEnabled, snapshotPrefix, replSnapshotCount, snapPathFileList, prevSnaps, true);
                }
                dumpTableListToDumpLocation(tableList, dumpRoot, dbName, conf);
            } catch (Exception e) {
                caught = e;
            } finally {
                try {
                    Utils.resetDbBootstrapDumpState(hiveDb, dbName, uniqueKey);
                } catch (Exception e) {
                    if (caught == null) {
                        throw e;
                    } else {
                        LOG.error("failed to reset the db state for " + uniqueKey + " on failure of repl dump", e);
                        throw caught;
                    }
                }
                if (caught != null) {
                    throw caught;
                }
            }
            replLogger.endLog(bootDumpBeginReplId.toString());
            work.getMetricCollector().reportStageEnd(getName(), Status.SUCCESS, bootDumpBeginReplId, replSnapshotCount, replLogger.getReplStatsTracker());
        }
        work.setFunctionCopyPathIterator(functionsBinaryCopyPaths.iterator());
        setDataCopyIterators(extTableFileList, managedTblList);
        LOG.info("Preparing to return {},{}->{}", dumpRoot.toUri(), bootDumpBeginReplId, currentNotificationId(hiveDb));
        return bootDumpBeginReplId;
    } finally {
        // write the dmd always irrespective of success/failure to enable checkpointing in table level replication
        Long bootDumpEndReplId = currentNotificationId(hiveDb);
        long executorId = conf.getLong(Constants.SCHEDULED_QUERY_EXECUTIONID, 0L);
        dmd.setDump(DumpType.BOOTSTRAP, bootDumpBeginReplId, bootDumpEndReplId, cmRoot, executorId, previousReplScopeModified());
        dmd.setReplScope(work.replScope);
        dmd.write(true);
    }
}
Also used : HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) BootstrapDumpLogger(org.apache.hadoop.hive.ql.parse.repl.dump.log.BootstrapDumpLogger) HiveWrapper(org.apache.hadoop.hive.ql.parse.repl.dump.HiveWrapper) ArrayList(java.util.ArrayList) ReplChangeManager.getReplPolicyIdString(org.apache.hadoop.hive.metastore.ReplChangeManager.getReplPolicyIdString) SnapshotUtils(org.apache.hadoop.hive.ql.exec.repl.util.SnapshotUtils) InvalidTableException(org.apache.hadoop.hive.ql.metadata.InvalidTableException) OptimisedBootstrapUtils.getReplEventIdFromDatabase(org.apache.hadoop.hive.ql.exec.repl.OptimisedBootstrapUtils.getReplEventIdFromDatabase) Database(org.apache.hadoop.hive.metastore.api.Database) Path(org.apache.hadoop.fs.Path) Table(org.apache.hadoop.hive.ql.metadata.Table) FileList(org.apache.hadoop.hive.ql.exec.repl.util.FileList) SnapshotUtils.getListFromFileList(org.apache.hadoop.hive.ql.exec.repl.util.SnapshotUtils.getListFromFileList) LockException(org.apache.hadoop.hive.ql.lockmgr.LockException) TException(org.apache.thrift.TException) IOException(java.io.IOException) NoSuchObjectException(org.apache.hadoop.hive.metastore.api.NoSuchObjectException) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) SnapshotException(org.apache.hadoop.hdfs.protocol.SnapshotException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) InvalidTableException(org.apache.hadoop.hive.ql.metadata.InvalidTableException)
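
A minimal sketch of the write side of this pattern, assuming only calls visible in these examples: the FileList(Path, conf) construction used in Example 4 below and the add(String) call used in Example 5 below. Examples 1 and 2 above create their lists through ReplDumpTask's createTableFileList helper instead; the backing-file name and paths here are placeholders, not real Hive constants.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.repl.util.FileList;

public class FileListWriterSketch {
    public static void main(String[] args) throws Exception {
        HiveConf conf = new HiveConf();
        // Placeholder dump root and backing-file name, analogous to EximUtil.FILE_LIST_EXTERNAL above.
        Path dumpRoot = new Path("/tmp/repl/dump1");
        Path backingFile = new Path(dumpRoot, "_file_list_external_demo");
        // Same construction as in ReplLoadTask.addLazyDataCopyTask (Example 4); closed by
        // try-with-resources, as in the dump tasks above.
        try (FileList fileList = new FileList(backingFile, conf)) {
            // In the dump tasks the entries are DataCopyPath.convertToString() values; here
            // opaque placeholder strings stand in just to show the call shape.
            fileList.add("placeholder-copy-path-entry-1");
            fileList.add("placeholder-copy-path-entry-2");
        }
    }
}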

Example 3 with FileList

Use of org.apache.hadoop.hive.ql.exec.repl.util.FileList in project hive by apache.

From the class TestReplDumpTask, the method removeDBPropertyToPreventRenameWhenBootstrapDumpOfTableFails.

@Test(expected = TestException.class)
public void removeDBPropertyToPreventRenameWhenBootstrapDumpOfTableFails() throws Exception {
    List<String> tableList = Arrays.asList("a1", "a2");
    String dbRandomKey = "akeytoberandom";
    ReplScope replScope = new ReplScope("default");
    mockStatic(Utils.class);
    when(Utils.matchesDb(same(hive), eq("default"))).thenReturn(Collections.singletonList("default"));
    when(Utils.getAllTables(same(hive), eq("default"), eq(replScope))).thenReturn(tableList);
    when(Utils.setDbBootstrapDumpState(same(hive), eq("default"))).thenReturn(dbRandomKey);
    when(Utils.matchesTbl(same(hive), eq("default"), eq(replScope))).thenReturn(tableList);
    when(hive.getAllFunctions()).thenReturn(Collections.emptyList());
    when(queryState.getConf()).thenReturn(conf);
    when(conf.getLong("hive.repl.last.repl.id", -1L)).thenReturn(1L);
    when(conf.getBoolVar(HiveConf.ConfVars.REPL_INCLUDE_EXTERNAL_TABLES)).thenReturn(false);
    when(HiveConf.getVar(conf, HiveConf.ConfVars.REPL_BOOTSTRAP_DUMP_OPEN_TXN_TIMEOUT)).thenReturn("1h");
    whenNew(HiveWrapper.class).withAnyArguments().thenReturn(mock(HiveWrapper.class));
    ReplDumpTask task = new StubReplDumpTask() {

        private int tableDumpCount = 0;

        @Override
        void dumpTable(String dbName, String tblName, String validTxnList, Path dbRootMetadata, Path dbRootData, long lastReplId, Hive hiveDb, HiveWrapper.Tuple<Table> tuple, FileList managedTableDirFileList, boolean dataCopyAtLoad) throws Exception {
            tableDumpCount++;
            if (tableDumpCount > 1) {
                throw new TestException();
            }
        }
    };
    task.initialize(queryState, null, null, null);
    ReplDumpWork replDumpWork = new ReplDumpWork(replScope, "", "");
    replDumpWork.setMetricCollector(metricCollector);
    task.setWork(replDumpWork);
    try {
        task.bootStrapDump(new Path("mock"), new DumpMetaData(new Path("mock"), conf), mock(Path.class), hive);
    } finally {
        Utils.resetDbBootstrapDumpState(same(hive), eq("default"), eq(dbRandomKey));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ReplScope(org.apache.hadoop.hive.common.repl.ReplScope) FileList(org.apache.hadoop.hive.ql.exec.repl.util.FileList) HiveWrapper(org.apache.hadoop.hive.ql.parse.repl.dump.HiveWrapper) DumpMetaData(org.apache.hadoop.hive.ql.parse.repl.load.DumpMetaData) Hive(org.apache.hadoop.hive.ql.metadata.Hive) PrepareForTest(org.powermock.core.classloader.annotations.PrepareForTest) Test(org.junit.Test)

Example 4 with FileList

Use of org.apache.hadoop.hive.ql.exec.repl.util.FileList in project hive by apache.

From the class ReplLoadTask, the method addLazyDataCopyTask.

private void addLazyDataCopyTask(TaskTracker loadTaskTracker, ReplLogger replLogger) throws IOException {
    boolean dataCopyAtLoad = conf.getBoolVar(HiveConf.ConfVars.REPL_RUN_DATA_COPY_TASKS_ON_TARGET);
    if (dataCopyAtLoad) {
        if (work.getExternalTableDataCopyItr() == null) {
            Path extTableBackingFile = new Path(work.dumpDirectory, EximUtil.FILE_LIST_EXTERNAL);
            try (FileList fileList = new FileList(extTableBackingFile, conf)) {
                work.setExternalTableDataCopyItr(fileList);
            }
        }
        if (childTasks == null) {
            childTasks = new ArrayList<>();
        }
        List<Task<?>> externalTableCopyTasks = work.externalTableCopyTasks(loadTaskTracker, conf);
        LOG.debug("Scheduled {} external table copy tasks", externalTableCopyTasks.size());
        childTasks.addAll(externalTableCopyTasks);
        // If external table data copy tasks are present add a task to mark the end of data copy
        if (!externalTableCopyTasks.isEmpty() && !work.getExternalTableDataCopyItr().hasNext()) {
            ReplUtils.addLoggerTask(replLogger, childTasks, conf);
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Task(org.apache.hadoop.hive.ql.exec.Task) FileList(org.apache.hadoop.hive.ql.exec.repl.util.FileList)
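
And a minimal sketch of the consuming side. The hasNext() call appears above (Example 1's snapPathFileList and this example's getExternalTableDataCopyItr()); treating FileList as an iterator whose next() returns each serialized entry is an assumption here, as is the placeholder backing-file path.

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.exec.repl.util.FileList;

public class FileListReaderSketch {
    public static void main(String[] args) throws Exception {
        HiveConf conf = new HiveConf();
        // Placeholder path; in Example 4 the backing file is <dumpDirectory>/EximUtil.FILE_LIST_EXTERNAL.
        Path backingFile = new Path("/tmp/repl/dump1/_file_list_external_demo");
        try (FileList fileList = new FileList(backingFile, conf)) {
            while (fileList.hasNext()) {
                // Assumption: next() returns one entry in the same form it was add()-ed.
                String entry = fileList.next();
                System.out.println(entry);
            }
        }
    }
}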

Example 5 with FileList

Use of org.apache.hadoop.hive.ql.exec.repl.util.FileList in project hive by apache.

From the class PartitionExport, the method write.

List<DataCopyPath> write(final ReplicationSpec forReplicationSpec, boolean isExportTask, FileList fileList, boolean dataCopyAtLoad) throws InterruptedException, HiveException {
    List<Future<?>> futures = new LinkedList<>();
    List<DataCopyPath> managedTableCopyPaths = new LinkedList<>();
    ExecutorService producer = Executors.newFixedThreadPool(1, new ThreadFactoryBuilder().setNameFormat("partition-submitter-thread-%d").build());
    futures.add(producer.submit(() -> {
        SessionState.setCurrentSessionState(callersSession);
        for (Partition partition : partitionIterable) {
            try {
                queue.put(partition);
            } catch (InterruptedException e) {
                throw new RuntimeException("Error while queuing up the partitions for export of data files", e);
            }
        }
    }));
    producer.shutdown();
    ThreadFactory namingThreadFactory = new ThreadFactoryBuilder().setNameFormat("partition-dump-thread-%d").build();
    ExecutorService consumer = Executors.newFixedThreadPool(nThreads, namingThreadFactory);
    while (!producer.isTerminated() || !queue.isEmpty()) {
        /*
          The queue is drained with a poll because the partitions iterator can be empty: the producer
          and consumer are started simultaneously, so the while loop executes while the producer is
          not yet terminated even though it won't produce anything more. The queue will then stay
          empty, and we should wait only a bounded time before continuing, at which point the loop
          condition fails and the loop exits.
        */
        Partition partition = queue.poll(1, TimeUnit.SECONDS);
        if (partition == null) {
            continue;
        }
        LOG.debug("scheduling partition dump {}", partition.getName());
        futures.add(consumer.submit(() -> {
            String partitionName = partition.getName();
            String threadName = Thread.currentThread().getName();
            LOG.debug("Thread: {}, start partition dump {}", threadName, partitionName);
            try {
                // Data Copy in case of ExportTask or when dataCopyAtLoad is true
                List<Path> dataPathList = Utils.getDataPathList(partition.getDataLocation(), forReplicationSpec, hiveConf);
                Path rootDataDumpDir = isExportTask ? paths.partitionMetadataExportDir(partitionName) : paths.partitionDataExportDir(partitionName);
                new FileOperations(dataPathList, rootDataDumpDir, distCpDoAsUser, hiveConf, mmCtx).export(isExportTask, dataCopyAtLoad);
                Path dataDumpDir = new Path(paths.dataExportRootDir(), partitionName);
                LOG.debug("Thread: {}, finish partition dump {}", threadName, partitionName);
                if (!(isExportTask || dataCopyAtLoad)) {
                    fileList.add(new DataCopyPath(forReplicationSpec, partition.getDataLocation(), dataDumpDir).convertToString());
                }
            } catch (Exception e) {
                throw new RuntimeException(e.getMessage(), e);
            }
        }));
    }
    consumer.shutdown();
    for (Future<?> future : futures) {
        try {
            future.get();
        } catch (Exception e) {
            LOG.error("failed", e.getCause());
            throw new HiveException(e.getCause().getMessage(), e.getCause());
        }
    }
    // Maybe drive this via configuration as well.
    consumer.awaitTermination(Long.MAX_VALUE, TimeUnit.SECONDS);
    return managedTableCopyPaths;
}
Also used : DataCopyPath(org.apache.hadoop.hive.ql.parse.EximUtil.DataCopyPath) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) ThreadFactory(java.util.concurrent.ThreadFactory) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FileOperations(org.apache.hadoop.hive.ql.parse.repl.dump.io.FileOperations) LinkedList(java.util.LinkedList) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) DataCopyPath(org.apache.hadoop.hive.ql.parse.EximUtil.DataCopyPath) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) FileList(org.apache.hadoop.hive.ql.exec.repl.util.FileList) List(java.util.List) LinkedList(java.util.LinkedList)
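
The write() method above also shows a producer/consumer shape worth noting: one submitter thread fills a queue while a worker pool drains it with a timed poll, so an empty queue with a still-running producer neither deadlocks nor busy-spins the loop. A standalone sketch of just that shape (plain JDK, no Hive types; all names are illustrative):

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;

public class QueuePollSketch {
    public static void main(String[] args) throws Exception {
        BlockingQueue<String> queue = new LinkedBlockingQueue<>(4);
        ExecutorService producer = Executors.newSingleThreadExecutor();
        ExecutorService consumers = Executors.newFixedThreadPool(2);
        List<Future<?>> futures = new ArrayList<>();

        futures.add(producer.submit(() -> {
            for (int i = 0; i < 10; i++) {
                try {
                    queue.put("partition-" + i); // blocks if the bounded queue is full
                } catch (InterruptedException e) {
                    throw new RuntimeException(e);
                }
            }
        }));
        producer.shutdown();

        // Mirror of the while (!producer.isTerminated() || !queue.isEmpty()) loop above:
        // poll with a timeout so an empty queue with a live producer does not block forever.
        while (!producer.isTerminated() || !queue.isEmpty()) {
            String item = queue.poll(1, TimeUnit.SECONDS);
            if (item == null) {
                continue;
            }
            futures.add(consumers.submit(() -> System.out.println("dumping " + item)));
        }
        consumers.shutdown();

        for (Future<?> future : futures) {
            future.get(); // surface any worker failure, like the future.get() loop above
        }
        consumers.awaitTermination(1, TimeUnit.MINUTES);
    }
}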

Aggregations

Path (org.apache.hadoop.fs.Path) 5
FileList (org.apache.hadoop.hive.ql.exec.repl.util.FileList) 5
HiveWrapper (org.apache.hadoop.hive.ql.parse.repl.dump.HiveWrapper) 3
FileNotFoundException (java.io.FileNotFoundException) 2
ArrayList (java.util.ArrayList) 2
HashMap (java.util.HashMap) 2
LinkedHashMap (java.util.LinkedHashMap) 2
ReplChangeManager.getReplPolicyIdString (org.apache.hadoop.hive.metastore.ReplChangeManager.getReplPolicyIdString) 2
Database (org.apache.hadoop.hive.metastore.api.Database) 2
OptimisedBootstrapUtils.getReplEventIdFromDatabase (org.apache.hadoop.hive.ql.exec.repl.OptimisedBootstrapUtils.getReplEventIdFromDatabase) 2
SnapshotUtils (org.apache.hadoop.hive.ql.exec.repl.util.SnapshotUtils) 2
SnapshotUtils.getListFromFileList (org.apache.hadoop.hive.ql.exec.repl.util.SnapshotUtils.getListFromFileList) 2
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 2
InvalidTableException (org.apache.hadoop.hive.ql.metadata.InvalidTableException) 2
Table (org.apache.hadoop.hive.ql.metadata.Table) 2
ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder) 1
IOException (java.io.IOException) 1
LinkedList (java.util.LinkedList) 1
List (java.util.List) 1
ExecutorService (java.util.concurrent.ExecutorService) 1