Example 1 with HoodieTableMetadata

Use of org.apache.hudi.metadata.HoodieTableMetadata in project hudi by apache.

From the class HoodieClientTestHarness, method validateMetadata:

/**
 * Validate the metadata table's contents to ensure they match what is on the file system.
 */
public void validateMetadata(HoodieTestTable testTable, List<String> inflightCommits, HoodieWriteConfig writeConfig, String metadataTableBasePath, boolean doFullValidation) throws IOException {
    HoodieTableMetadata tableMetadata = metadata(writeConfig, context);
    assertNotNull(tableMetadata, "MetadataReader should have been initialized");
    if (!writeConfig.isMetadataTableEnabled()) {
        return;
    }
    if (!tableMetadata.getSyncedInstantTime().isPresent() || tableMetadata instanceof FileSystemBackedTableMetadata) {
        throw new IllegalStateException("Metadata should have synced some commits and tableMetadata should not be an instance of FileSystemBackedTableMetadata");
    }
    assertEquals(inflightCommits, testTable.inflightCommits());
    HoodieTimer timer = new HoodieTimer().startTimer();
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    // Partitions should match
    List<java.nio.file.Path> fsPartitionPaths = testTable.getAllPartitionPaths();
    List<String> fsPartitions = new ArrayList<>();
    fsPartitionPaths.forEach(entry -> fsPartitions.add(entry.getFileName().toString()));
    if (fsPartitions.isEmpty()) {
        fsPartitions.add("");
    }
    List<String> metadataPartitions = tableMetadata.getAllPartitionPaths();
    Collections.sort(fsPartitions);
    Collections.sort(metadataPartitions);
    assertEquals(fsPartitions.size(), metadataPartitions.size(), "Partitions should match");
    assertEquals(fsPartitions, metadataPartitions, "Partitions should match");
    // Files within each partition should match
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable table = HoodieSparkTable.create(writeConfig, engineContext, true);
    TableFileSystemView tableView = table.getHoodieView();
    List<String> fullPartitionPaths = fsPartitions.stream().map(partition -> basePath + "/" + partition).collect(Collectors.toList());
    Map<String, FileStatus[]> partitionToFilesMap = tableMetadata.getAllFilesInPartitions(fullPartitionPaths);
    assertEquals(fsPartitions.size(), partitionToFilesMap.size());
    fsPartitions.forEach(partition -> {
        try {
            validateFilesPerPartition(testTable, tableMetadata, tableView, partitionToFilesMap, partition);
        } catch (IOException e) {
            fail("Exception should not be raised: " + e);
        }
    });
    if (doFullValidation) {
        runFullValidation(writeConfig, metadataTableBasePath, engineContext);
    }
    LOG.info("Validation time=" + timer.endTimer());
}
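The metadata(writeConfig, context) call above is test-harness glue. For orientation only, here is a minimal sketch (class and method names are hypothetical) of obtaining the same kind of reader through the HoodieTableMetadata.create factory, with the same arguments the archiver in Example 4 below passes:

import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.table.view.FileSystemViewStorageConfig;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.metadata.HoodieTableMetadata;

public class MetadataReaderSketch {
    // Hypothetical helper: open a metadata reader for the table at writeConfig.getBasePath(),
    // spilling to the default spillable directory, as in Example 4 below.
    static HoodieTableMetadata openMetadata(HoodieWriteConfig writeConfig, HoodieEngineContext context) {
        return HoodieTableMetadata.create(
                context,
                writeConfig.getMetadataConfig(),
                writeConfig.getBasePath(),
                FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue());
    }
}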

Example 2 with HoodieTableMetadata

Use of org.apache.hudi.metadata.HoodieTableMetadata in project hudi by apache.

From the class HoodieClientTestHarness, method validateFilesPerPartition:

protected void validateFilesPerPartition(HoodieTestTable testTable, HoodieTableMetadata tableMetadata, TableFileSystemView tableView, Map<String, FileStatus[]> partitionToFilesMap, String partition) throws IOException {
    Path partitionPath;
    if (partition.isEmpty()) {
        // The non-partitioned case
        partitionPath = new Path(basePath);
    } else {
        partitionPath = new Path(basePath, partition);
    }
    FileStatus[] fsStatuses = testTable.listAllFilesInPartition(partition);
    FileStatus[] metaStatuses = tableMetadata.getAllFilesInPartition(partitionPath);
    List<String> fsFileNames = Arrays.stream(fsStatuses).map(s -> s.getPath().getName()).collect(Collectors.toList());
    List<String> metadataFilenames = Arrays.stream(metaStatuses).map(s -> s.getPath().getName()).collect(Collectors.toList());
    Collections.sort(fsFileNames);
    Collections.sort(metadataFilenames);
    assertLinesMatch(fsFileNames, metadataFilenames);
    assertEquals(fsStatuses.length, partitionToFilesMap.get(partitionPath.toString()).length);
    // Block sizes should be valid
    Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getBlockSize() > 0));
    List<Long> fsBlockSizes = Arrays.stream(fsStatuses).map(FileStatus::getBlockSize).sorted().collect(Collectors.toList());
    List<Long> metadataBlockSizes = Arrays.stream(metaStatuses).map(FileStatus::getBlockSize).sorted().collect(Collectors.toList());
    assertEquals(fsBlockSizes, metadataBlockSizes);
    assertEquals(fsFileNames.size(), metadataFilenames.size(), "Files within partition " + partition + " should match");
    assertEquals(fsFileNames, metadataFilenames, "Files within partition " + partition + " should match");
    // FileSystemView should expose the same data
    List<HoodieFileGroup> fileGroups = tableView.getAllFileGroups(partition).collect(Collectors.toList());
    fileGroups.addAll(tableView.getAllReplacedFileGroups(partition).collect(Collectors.toList()));
    Logger log = LogManager.getLogger(getClass());
    fileGroups.forEach(g -> log.info(g));
    fileGroups.forEach(g -> g.getAllBaseFiles().forEach(b -> log.info(b)));
    fileGroups.forEach(g -> g.getAllFileSlices().forEach(s -> log.info(s)));
    long numFiles = fileGroups.stream().mapToLong(g -> g.getAllBaseFiles().count() + g.getAllFileSlices().mapToLong(s -> s.getLogFiles().count()).sum()).sum();
    assertEquals(metadataFilenames.size(), numFiles);
}
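A minimal usage sketch of the listing call validated above (class and method names are hypothetical, paths are placeholders): fetch one partition's files through the metadata table and print them, resolving the empty partition string to the base path for the non-partitioned case, exactly as the method does:

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.metadata.HoodieTableMetadata;

public class PartitionListingSketch {
    // Hypothetical helper: print the files the metadata table reports for one partition.
    static void printPartitionFiles(HoodieTableMetadata tableMetadata, String basePath, String partition)
            throws IOException {
        Path partitionPath = partition.isEmpty() ? new Path(basePath) : new Path(basePath, partition);
        FileStatus[] statuses = tableMetadata.getAllFilesInPartition(partitionPath);
        Arrays.stream(statuses)
                .forEach(s -> System.out.println(s.getPath().getName() + " blockSize=" + s.getBlockSize()));
    }
}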

Example 3 with HoodieTableMetadata

Use of org.apache.hudi.metadata.HoodieTableMetadata in project hudi by apache.

From the class TestHoodieBackedMetadata, method testMetadataTableWithPendingCompaction:

/**
 * Tests metadata-based file listing while a compaction in the metadata table is pending,
 * with and without a simulated compaction failure.
 */
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testMetadataTableWithPendingCompaction(boolean simulateFailedCompaction) throws Exception {
    HoodieTableType tableType = COPY_ON_WRITE;
    init(tableType, false);
    writeConfig = getWriteConfigBuilder(true, true, false).withMetadataConfig(HoodieMetadataConfig.newBuilder().enable(true).enableFullScan(true).enableMetrics(false).withMaxNumDeltaCommitsBeforeCompaction(3).build()).build();
    initWriteConfigAndMetatableWriter(writeConfig, true);
    doWriteOperation(testTable, "0000001", INSERT);
    // Create an inflight compaction in the metadata table. It is not easy to create an inflight
    // instant there directly, so let the compaction succeed and then delete the completed instant.
    // The next write is expected to trigger the metadata table compaction.
    String commitInstant = "0000002";
    doWriteOperation(testTable, commitInstant, INSERT);
    doWriteOperation(testTable, "0000003", INSERT);
    HoodieTableMetadata tableMetadata = metadata(writeConfig, context);
    String metadataCompactionInstant = commitInstant + "001";
    assertTrue(tableMetadata.getLatestCompactionTime().isPresent());
    assertEquals(tableMetadata.getLatestCompactionTime().get(), metadataCompactionInstant);
    validateMetadata(testTable);
    // Fetch the compaction commit file and rename it to a temporary name. The completed compaction
    // meta file carries serialized info that the table needs for future upserts, so it is renamed
    // back to its original name later.
    java.nio.file.Path parentPath = Paths.get(metadataTableBasePath, HoodieTableMetaClient.METAFOLDER_NAME);
    java.nio.file.Path metaFilePath = parentPath.resolve(metadataCompactionInstant + HoodieTimeline.COMMIT_EXTENSION);
    java.nio.file.Path tempFilePath = FileCreateUtils.renameFileToTemp(metaFilePath, metadataCompactionInstant);
    metaClient.reloadActiveTimeline();
    testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter);
    // This validation exercises the code path where a compaction is inflight in the metadata table;
    // metadata-based file listing should still match the non-metadata-based listing.
    validateMetadata(testTable);
    if (simulateFailedCompaction) {
        // This write should retry the compaction in the metadata table.
        doWriteOperation(testTable, "0000004", INSERT);
    } else {
        // Let the compaction succeed in the metadata table; validation should then succeed.
        FileCreateUtils.renameTempToMetaFile(tempFilePath, metaFilePath);
    }
    validateMetadata(testTable);
    // Add a few more writes and validate.
    doWriteOperation(testTable, "0000005", INSERT);
    doWriteOperation(testTable, "0000006", UPSERT);
    validateMetadata(testTable);
    if (simulateFailedCompaction) {
        // trigger another compaction failure.
        metadataCompactionInstant = "0000005001";
        tableMetadata = metadata(writeConfig, context);
        assertTrue(tableMetadata.getLatestCompactionTime().isPresent());
        assertEquals(tableMetadata.getLatestCompactionTime().get(), metadataCompactionInstant);
        // Fetch the compaction commit file and rename it to a temporary name, as above; the completed
        // compaction meta file is needed for future upserts.
        parentPath = Paths.get(metadataTableBasePath, HoodieTableMetaClient.METAFOLDER_NAME);
        metaFilePath = parentPath.resolve(metadataCompactionInstant + HoodieTimeline.COMMIT_EXTENSION);
        tempFilePath = FileCreateUtils.renameFileToTemp(metaFilePath, metadataCompactionInstant);
        validateMetadata(testTable);
        // This write should retry the failed compaction in the metadata table.
        doWriteOperation(testTable, "0000007", INSERT);
        validateMetadata(testTable);
        // Add a few more writes and validate.
        doWriteOperation(testTable, "0000008", INSERT);
        doWriteOperation(testTable, "0000009", UPSERT);
        validateMetadata(testTable);
    }
}
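The assertion pattern the test repeats can be distilled into a small helper. This is a sketch only (class and method names are hypothetical), and the "<commit>001" instant suffix is a convention taken from this test, not an API contract:

import org.apache.hudi.common.util.Option;
import org.apache.hudi.metadata.HoodieTableMetadata;

public class CompactionTimeCheckSketch {
    // Hypothetical helper: check that the metadata table's latest compaction
    // instant is the one expected for the given data-table commit.
    static boolean compactedAt(HoodieTableMetadata tableMetadata, String commitInstant) throws Exception {
        Option<String> latest = tableMetadata.getLatestCompactionTime();
        return latest.isPresent() && latest.get().equals(commitInstant + "001");
    }
}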

Example 4 with HoodieTableMetadata

Use of org.apache.hudi.metadata.HoodieTableMetadata in project hudi by apache.

From the class HoodieTimelineArchiver, method getInstantsToArchive:

private Stream<HoodieInstant> getInstantsToArchive() {
    Stream<HoodieInstant> instants = Stream.concat(getCleanInstantsToArchive(), getCommitInstantsToArchive());
    // For archiving and cleaning instants, we need to include intermediate state files if they exist
    HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false);
    Map<Pair<String, String>, List<HoodieInstant>> groupByTsAction = rawActiveTimeline.getInstants().collect(Collectors.groupingBy(i -> Pair.of(i.getTimestamp(), HoodieInstant.getComparableAction(i.getAction()))));
    // If the metadata table is enabled, do not archive instants that are newer than the latest
    // compaction on the metadata table.
    if (config.isMetadataTableEnabled()) {
        try (HoodieTableMetadata tableMetadata = HoodieTableMetadata.create(table.getContext(), config.getMetadataConfig(), config.getBasePath(), FileSystemViewStorageConfig.SPILLABLE_DIR.defaultValue())) {
            Option<String> latestCompactionTime = tableMetadata.getLatestCompactionTime();
            if (!latestCompactionTime.isPresent()) {
                LOG.info("Not archiving as there is no compaction yet on the metadata table");
                instants = Stream.empty();
            } else {
                LOG.info("Limiting archiving of instants to latest compaction on metadata table at " + latestCompactionTime.get());
                instants = instants.filter(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.LESSER_THAN, latestCompactionTime.get()));
            }
        } catch (Exception e) {
            throw new HoodieException("Error limiting instant archival based on metadata table", e);
        }
    }
    return instants.flatMap(hoodieInstant -> groupByTsAction.get(Pair.of(hoodieInstant.getTimestamp(), HoodieInstant.getComparableAction(hoodieInstant.getAction()))).stream());
}
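The cutoff filter above is the key safety check. A minimal sketch isolating it (class and method names are hypothetical): keep only instants whose timestamp is strictly older than the metadata table's latest compaction time.

import java.util.stream.Stream;

import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;

public class ArchivalCutoffSketch {
    // Hypothetical helper mirroring the filter in getInstantsToArchive() above.
    static Stream<HoodieInstant> olderThanCompaction(Stream<HoodieInstant> instants, String latestCompactionTime) {
        return instants.filter(instant -> HoodieTimeline.compareTimestamps(
                instant.getTimestamp(), HoodieTimeline.LESSER_THAN, latestCompactionTime));
    }
}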

Example 5 with HoodieTableMetadata

Use of org.apache.hudi.metadata.HoodieTableMetadata in project hudi by apache.

From the class HoodieDataTableValidator, method doDataTableValidation:

public void doDataTableValidation() {
    boolean finalResult = true;
    metaClient.reloadActiveTimeline();
    String basePath = metaClient.getBasePath();
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    try {
        HoodieTableMetadata tableMetadata = new FileSystemBackedTableMetadata(engineContext, engineContext.getHadoopConf(), cfg.basePath, cfg.assumeDatePartitioning);
        List<Path> allDataFilePaths = HoodieDataTableUtils.getBaseAndLogFilePathsFromFileSystem(tableMetadata, cfg.basePath);
        // Verify that no data files are present with a commit time earlier than the earliest commit in the active timeline.
        if (metaClient.getActiveTimeline().firstInstant().isPresent()) {
            String earliestInstant = metaClient.getActiveTimeline().firstInstant().get().getTimestamp();
            List<Path> danglingFilePaths = allDataFilePaths.stream().filter(path -> {
                String instantTime = FSUtils.getCommitTime(path.getName());
                return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN, earliestInstant);
            }).collect(Collectors.toList());
            if (!danglingFilePaths.isEmpty()) {
                LOG.error("Data table validation failed due to dangling files count " + danglingFilePaths.size() + ", found before active timeline");
                danglingFilePaths.forEach(entry -> LOG.error("Dangling file: " + entry.toString()));
                finalResult = false;
                if (!cfg.ignoreFailed) {
                    throw new HoodieValidationException("Data table validation failed due to dangling files " + danglingFilePaths.size());
                }
            }
            // Verify that for every completed commit in active timeline, there are no extra files found apart from what is present in
            // commit metadata.
            Map<String, List<String>> instantToFilesMap = RepairUtils.tagInstantsOfBaseAndLogFiles(metaClient.getBasePath(), allDataFilePaths);
            HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
            List<HoodieInstant> hoodieInstants = activeTimeline.filterCompletedInstants().getInstants().collect(Collectors.toList());
            List<String> danglingFiles = engineContext.flatMap(hoodieInstants, instant -> {
                Option<Set<String>> filesFromTimeline = RepairUtils.getBaseAndLogFilePathsFromTimeline(activeTimeline, instant);
                List<String> baseAndLogFilesFromFs = instantToFilesMap.containsKey(instant.getTimestamp()) ? instantToFilesMap.get(instant.getTimestamp()) : Collections.emptyList();
                if (!baseAndLogFilesFromFs.isEmpty()) {
                    Set<String> danglingInstantFiles = new HashSet<>(baseAndLogFilesFromFs);
                    if (filesFromTimeline.isPresent()) {
                        danglingInstantFiles.removeAll(filesFromTimeline.get());
                    }
                    return new ArrayList<>(danglingInstantFiles).stream();
                } else {
                    return Stream.empty();
                }
            }, hoodieInstants.size()).stream().collect(Collectors.toList());
            if (!danglingFiles.isEmpty()) {
                LOG.error("Data table validation failed due to extra files found for completed commits " + danglingFiles.size());
                danglingFiles.forEach(entry -> LOG.error("Dangling file: " + entry.toString()));
                finalResult = false;
                if (!cfg.ignoreFailed) {
                    throw new HoodieValidationException("Data table validation failed due to dangling files " + danglingFiles.size());
                }
            }
        }
    } catch (Exception e) {
        LOG.error("Data table validation failed due to " + e.getMessage(), e);
        if (!cfg.ignoreFailed) {
            throw new HoodieValidationException("Data table validation failed due to " + e.getMessage(), e);
        }
    }
    if (finalResult) {
        LOG.info("Data table validation succeeded.");
    } else {
        LOG.warn("Data table validation failed.");
    }
}
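Note that this validator deliberately uses the file-system-backed implementation rather than the metadata table. A minimal sketch of that construction (class and method names are hypothetical; assumeDatePartitioning is shown as false for illustration):

import java.util.List;

import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.metadata.FileSystemBackedTableMetadata;
import org.apache.hudi.metadata.HoodieTableMetadata;

public class FsBackedListingSketch {
    // Hypothetical helper: list partitions straight from the file system,
    // with no metadata table involved, as the validator above does.
    static List<String> listPartitions(HoodieSparkEngineContext engineContext, String basePath) throws Exception {
        HoodieTableMetadata tableMetadata = new FileSystemBackedTableMetadata(
                engineContext, engineContext.getHadoopConf(), basePath, false);
        return tableMetadata.getAllPartitionPaths();
    }
}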

Aggregations

HoodieTableMetadata (org.apache.hudi.metadata.HoodieTableMetadata): 8
HoodieTableType (org.apache.hudi.common.model.HoodieTableType): 7
Path (org.apache.hadoop.fs.Path): 6
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 6
ArrayList (java.util.ArrayList): 5
List (java.util.List): 5
Map (java.util.Map): 5
Collectors (java.util.stream.Collectors): 5
FSUtils (org.apache.hudi.common.fs.FSUtils): 5
IOException (java.io.IOException): 4
Arrays (java.util.Arrays): 4
Collections (java.util.Collections): 4
HashMap (java.util.HashMap): 4
ExecutorService (java.util.concurrent.ExecutorService): 4
Executors (java.util.concurrent.Executors): 4
FileStatus (org.apache.hadoop.fs.FileStatus): 4
HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext): 4
FileSlice (org.apache.hudi.common.model.FileSlice): 4
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline): 3
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 3