
Example 21 with HoodieFileGroup

Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.

From the class AbstractTableFileSystemView, the method ensurePartitionLoadedCorrectly:

/**
 * Allows lazily loading the partitions if needed.
 *
 * @param partition partition to be loaded if not present
 */
private void ensurePartitionLoadedCorrectly(String partition) {
    ValidationUtils.checkArgument(!isClosed(), "View is already closed");
    // ensure we list files only once even in the face of concurrency
    addedPartitions.computeIfAbsent(partition, (partitionPathStr) -> {
        long beginTs = System.currentTimeMillis();
        if (!isPartitionAvailableInStore(partitionPathStr)) {
            // Not loaded yet
            try {
                LOG.info("Building file system view for partition (" + partitionPathStr + ")");
                Path partitionPath = FSUtils.getPartitionPath(metaClient.getBasePath(), partitionPathStr);
                long beginLsTs = System.currentTimeMillis();
                FileStatus[] statuses = listPartition(partitionPath);
                long endLsTs = System.currentTimeMillis();
                LOG.debug("#files found in partition (" + partitionPathStr + ") =" + statuses.length + ", Time taken =" + (endLsTs - beginLsTs));
                List<HoodieFileGroup> groups = addFilesToView(statuses);
                if (groups.isEmpty()) {
                    storePartitionView(partitionPathStr, new ArrayList<>());
                }
            } catch (IOException e) {
                throw new HoodieIOException("Failed to list base files in partition " + partitionPathStr, e);
            }
        } else {
            LOG.debug("View already built for Partition :" + partitionPathStr + ", FOUND is ");
        }
        long endTs = System.currentTimeMillis();
        LOG.debug("Time to load partition (" + partitionPathStr + ") =" + (endTs - beginTs));
        return true;
    });
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieIOException(org.apache.hudi.exception.HoodieIOException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup)
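
The technique worth noting here is thread-safe lazy loading via ConcurrentHashMap.computeIfAbsent: even with concurrent callers, the expensive partition listing runs at most once per partition. Below is a minimal, Hudi-independent sketch of the same pattern using only the JDK; LazyPartitionCache, loadPartition, and the String-based file names are hypothetical placeholders, not Hudi APIs.

import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ConcurrentHashMap;

public class LazyPartitionCache {

    // Tracks partitions whose listing has already been built; computeIfAbsent
    // guarantees the mapping function runs at most once per key, even under concurrency.
    private final ConcurrentHashMap<String, Boolean> addedPartitions = new ConcurrentHashMap<>();
    private final ConcurrentHashMap<String, List<String>> partitionToFiles = new ConcurrentHashMap<>();

    public void ensurePartitionLoaded(String partition) {
        addedPartitions.computeIfAbsent(partition, p -> {
            // The expensive listing happens only on the first call for this partition.
            partitionToFiles.put(p, listPartition(p));
            return Boolean.TRUE;
        });
    }

    // Hypothetical stand-in for the actual file-system listing.
    private List<String> listPartition(String partition) {
        return Arrays.asList(partition + "/file1.parquet", partition + "/file2.parquet");
    }
}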

Example 22 with HoodieFileGroup

Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.

From the class TestHoodieBackedMetadata, the method validateMetadata:

private void validateMetadata(SparkRDDWriteClient testClient) throws IOException {
    HoodieWriteConfig config = testClient.getConfig();
    SparkRDDWriteClient client;
    if (config.isEmbeddedTimelineServerEnabled()) {
        testClient.close();
        client = new SparkRDDWriteClient(testClient.getEngineContext(), testClient.getConfig());
    } else {
        client = testClient;
    }
    HoodieTableMetadata tableMetadata = metadata(client);
    assertNotNull(tableMetadata, "MetadataReader should have been initialized");
    if (!config.isMetadataTableEnabled()) {
        return;
    }
    HoodieTimer timer = new HoodieTimer().startTimer();
    HoodieSparkEngineContext engineContext = new HoodieSparkEngineContext(jsc);
    // Partitions should match
    FileSystemBackedTableMetadata fsBackedTableMetadata = new FileSystemBackedTableMetadata(engineContext, new SerializableConfiguration(hadoopConf), config.getBasePath(), config.shouldAssumeDatePartitioning());
    List<String> fsPartitions = fsBackedTableMetadata.getAllPartitionPaths();
    List<String> metadataPartitions = tableMetadata.getAllPartitionPaths();
    Collections.sort(fsPartitions);
    Collections.sort(metadataPartitions);
    assertEquals(fsPartitions.size(), metadataPartitions.size(), "Partitions should match");
    assertTrue(fsPartitions.equals(metadataPartitions), "Partitions should match");
    // Files within each partition should match
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable table = HoodieSparkTable.create(config, engineContext);
    TableFileSystemView tableView = table.getHoodieView();
    List<String> fullPartitionPaths = fsPartitions.stream().map(partition -> basePath + "/" + partition).collect(Collectors.toList());
    Map<String, FileStatus[]> partitionToFilesMap = tableMetadata.getAllFilesInPartitions(fullPartitionPaths);
    assertEquals(fsPartitions.size(), partitionToFilesMap.size());
    fsPartitions.forEach(partition -> {
        try {
            Path partitionPath;
            if (partition.equals("")) {
                // Should be the non-partitioned case
                partitionPath = new Path(basePath);
            } else {
                partitionPath = new Path(basePath, partition);
            }
            FileStatus[] fsStatuses = FSUtils.getAllDataFilesInPartition(fs, partitionPath);
            FileStatus[] metaStatuses = tableMetadata.getAllFilesInPartition(partitionPath);
            List<String> fsFileNames = Arrays.stream(fsStatuses).map(s -> s.getPath().getName()).collect(Collectors.toList());
            List<String> metadataFilenames = Arrays.stream(metaStatuses).map(s -> s.getPath().getName()).collect(Collectors.toList());
            Collections.sort(fsFileNames);
            Collections.sort(metadataFilenames);
            assertEquals(fsStatuses.length, partitionToFilesMap.get(partitionPath.toString()).length);
            // File sizes should be valid
            Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getLen() > 0));
            if ((fsFileNames.size() != metadataFilenames.size()) || (!fsFileNames.equals(metadataFilenames))) {
                LOG.info("*** File system listing = " + Arrays.toString(fsFileNames.toArray()));
                LOG.info("*** Metadata listing = " + Arrays.toString(metadataFilenames.toArray()));
                for (String fileName : fsFileNames) {
                    if (!metadataFilenames.contains(fileName)) {
                        LOG.error(partition + ": FS file " + fileName + " not found in the metadata listing");
                    }
                }
                for (String fileName : metadataFilenames) {
                    if (!fsFileNames.contains(fileName)) {
                        LOG.error(partition + ": metadata file " + fileName + " not found on the file system");
                    }
                }
            }
            // Block sizes should be valid
            Arrays.stream(metaStatuses).forEach(s -> assertTrue(s.getBlockSize() > 0));
            List<Long> fsBlockSizes = Arrays.stream(fsStatuses).map(FileStatus::getBlockSize).collect(Collectors.toList());
            Collections.sort(fsBlockSizes);
            List<Long> metadataBlockSizes = Arrays.stream(metaStatuses).map(FileStatus::getBlockSize).collect(Collectors.toList());
            Collections.sort(metadataBlockSizes);
            assertEquals(fsBlockSizes, metadataBlockSizes);
            assertEquals(fsFileNames.size(), metadataFilenames.size(), "Files within partition " + partition + " should match");
            assertTrue(fsFileNames.equals(metadataFilenames), "Files within partition " + partition + " should match");
            // FileSystemView should expose the same data
            List<HoodieFileGroup> fileGroups = tableView.getAllFileGroups(partition).collect(Collectors.toList());
            fileGroups.addAll(tableView.getAllReplacedFileGroups(partition).collect(Collectors.toList()));
            fileGroups.forEach(g -> LogManager.getLogger(TestHoodieBackedMetadata.class).info(g));
            fileGroups.forEach(g -> g.getAllBaseFiles().forEach(b -> LogManager.getLogger(TestHoodieBackedMetadata.class).info(b)));
            fileGroups.forEach(g -> g.getAllFileSlices().forEach(s -> LogManager.getLogger(TestHoodieBackedMetadata.class).info(s)));
            long numFiles = fileGroups.stream().mapToLong(g -> g.getAllBaseFiles().count() + g.getAllFileSlices().mapToLong(s -> s.getLogFiles().count()).sum()).sum();
            assertEquals(metadataFilenames.size(), numFiles);
        } catch (IOException e) {
            e.printStackTrace();
            assertTrue(false, "Exception should not be raised: " + e);
        }
    });
    HoodieBackedTableMetadataWriter metadataWriter = metadataWriter(client);
    assertNotNull(metadataWriter, "MetadataWriter should have been initialized");
    // Validate write config for metadata table
    HoodieWriteConfig metadataWriteConfig = metadataWriter.getWriteConfig();
    assertFalse(metadataWriteConfig.isMetadataTableEnabled(), "The metadata table itself should not have a metadata table enabled");
    // Metadata table should be in sync with the dataset
    HoodieTableMetaClient metadataMetaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(metadataTableBasePath).build();
    // Metadata table is MOR
    assertEquals(metadataMetaClient.getTableType(), HoodieTableType.MERGE_ON_READ, "Metadata Table should be MOR");
    // Metadata table is HFile format
    assertEquals(metadataMetaClient.getTableConfig().getBaseFileFormat(), HoodieFileFormat.HFILE, "Metadata Table base file format should be HFile");
    // Metadata table has a fixed number of partitions
    // Cannot use FSUtils.getAllFoldersWithPartitionMetaFile for this, as that function filters out all directories
    // in the .hoodie folder.
    List<String> metadataTablePartitions = FSUtils.getAllPartitionPaths(engineContext, HoodieTableMetadata.getMetadataTableBasePath(basePath), false, false);
    assertEquals(metadataWriter.getEnabledPartitionTypes().size(), metadataTablePartitions.size());
    final Map<String, MetadataPartitionType> metadataEnabledPartitionTypes = new HashMap<>();
    metadataWriter.getEnabledPartitionTypes().forEach(e -> metadataEnabledPartitionTypes.put(e.getPartitionPath(), e));
    // Metadata table should automatically compact and clean
    // versions are +1 because auto-clean / compaction happens at the end of each commit
    int numFileVersions = metadataWriteConfig.getCleanerFileVersionsRetained() + 1;
    HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metadataMetaClient, metadataMetaClient.getActiveTimeline());
    metadataTablePartitions.forEach(partition -> {
        List<FileSlice> latestSlices = fsView.getLatestFileSlices(partition).collect(Collectors.toList());
        assertTrue(latestSlices.stream().map(FileSlice::getBaseFile).count() <= metadataEnabledPartitionTypes.get(partition).getFileGroupCount(), "Should have a single latest base file per file group");
        assertTrue(latestSlices.size() <= metadataEnabledPartitionTypes.get(partition).getFileGroupCount(), "Should have a single latest file slice per file group");
        assertTrue(latestSlices.size() <= (numFileVersions * metadataEnabledPartitionTypes.get(partition).getFileGroupCount()), "Should limit file slice to " + numFileVersions + " per file group, but was " + latestSlices.size());
    });
    LOG.info("Validation time=" + timer.endTimer());
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) Arrays(java.util.Arrays) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) FileStatus(org.apache.hadoop.fs.FileStatus) Disabled(org.junit.jupiter.api.Disabled) Collections.singletonList(java.util.Collections.singletonList) Future(java.util.concurrent.Future) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) Arrays.asList(java.util.Arrays.asList) Map(java.util.Map) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) WriteConcurrencyMode(org.apache.hudi.common.model.WriteConcurrencyMode) Tag(org.junit.jupiter.api.Tag) FileSystemViewStorageType(org.apache.hudi.common.table.view.FileSystemViewStorageType) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) Pair(org.apache.hadoop.hbase.util.Pair) Schema(org.apache.avro.Schema) Set(java.util.Set) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieIndex(org.apache.hudi.index.HoodieIndex) Executors(java.util.concurrent.Executors) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodieMetadataMergedLogRecordReader(org.apache.hudi.metadata.HoodieMetadataMergedLogRecordReader) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) FileSystemBackedTableMetadata(org.apache.hudi.metadata.FileSystemBackedTableMetadata) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieMetadataMetrics(org.apache.hudi.metadata.HoodieMetadataMetrics) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) Assertions.assertDoesNotThrow(org.junit.jupiter.api.Assertions.assertDoesNotThrow) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) HoodieBackedTableMetadataWriter(org.apache.hudi.metadata.HoodieBackedTableMetadataWriter) Assertions.assertNull(org.junit.jupiter.api.Assertions.assertNull) Option(org.apache.hudi.common.util.Option) ArrayList(java.util.ArrayList) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) MetadataPartitionType(org.apache.hudi.metadata.MetadataPartitionType) DELETE(org.apache.hudi.common.model.WriteOperationType.DELETE) Registry(org.apache.hudi.common.metrics.Registry) ExternalSpillableMap(org.apache.hudi.common.util.collection.ExternalSpillableMap) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) TimelineLayoutVersion(org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion) ValueSource(org.junit.jupiter.params.provider.ValueSource) ConsistencyGuardConfig(org.apache.hudi.common.fs.ConsistencyGuardConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) Files(java.nio.file.Files) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) 
MERGE_ON_READ(org.apache.hudi.common.model.HoodieTableType.MERGE_ON_READ) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieTableVersion(org.apache.hudi.common.table.HoodieTableVersion) INSERT(org.apache.hudi.common.model.WriteOperationType.INSERT) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) HoodieMetadataRecord(org.apache.hudi.avro.model.HoodieMetadataRecord) HoodieHFileReader(org.apache.hudi.io.storage.HoodieHFileReader) Paths(java.nio.file.Paths) HoodieKey(org.apache.hudi.common.model.HoodieKey) UPSERT(org.apache.hudi.common.model.WriteOperationType.UPSERT) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) COPY_ON_WRITE(org.apache.hudi.common.model.HoodieTableType.COPY_ON_WRITE) HoodieMetadataTestTable(org.apache.hudi.common.testutils.HoodieMetadataTestTable) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Path(org.apache.hadoop.fs.Path) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) MethodSource(org.junit.jupiter.params.provider.MethodSource) TRIP_EXAMPLE_SCHEMA(org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) LockConfiguration(org.apache.hudi.common.config.LockConfiguration) Collections.emptyList(java.util.Collections.emptyList) SparkUpgradeDowngradeHelper(org.apache.hudi.table.upgrade.SparkUpgradeDowngradeHelper) HoodieMetadataPayload(org.apache.hudi.metadata.HoodieMetadataPayload) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) CacheConfig(org.apache.hadoop.hbase.io.hfile.CacheConfig) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) InProcessLockProvider(org.apache.hudi.client.transaction.lock.InProcessLockProvider) FileSlice(org.apache.hudi.common.model.FileSlice) EnumSource(org.junit.jupiter.params.provider.EnumSource) HashMap(java.util.HashMap) HashSet(java.util.HashSet) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) UpgradeDowngrade(org.apache.hudi.table.upgrade.UpgradeDowngrade) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) LinkedList(java.util.LinkedList) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) ExecutorService(java.util.concurrent.ExecutorService) 
GenericRecord(org.apache.avro.generic.GenericRecord) FILESYSTEM_LOCK_PATH_PROP_KEY(org.apache.hudi.common.config.LockConfiguration.FILESYSTEM_LOCK_PATH_PROP_KEY) FileCreateUtils(org.apache.hudi.common.testutils.FileCreateUtils) HoodieLockConfig(org.apache.hudi.config.HoodieLockConfig) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) Time(org.apache.hadoop.util.Time) HoodieClusteringConfig(org.apache.hudi.config.HoodieClusteringConfig) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) MetadataPartitionType(org.apache.hudi.metadata.MetadataPartitionType) FileStatus(org.apache.hadoop.fs.FileStatus) HashMap(java.util.HashMap) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) Path(org.apache.hadoop.fs.Path) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) IOException(java.io.IOException) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) HoodieBackedTableMetadataWriter(org.apache.hudi.metadata.HoodieBackedTableMetadataWriter) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) FileSystemBackedTableMetadata(org.apache.hudi.metadata.FileSystemBackedTableMetadata) HoodieTable(org.apache.hudi.table.HoodieTable)
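
Stripped of the Hudi-specific setup, the per-partition check above follows one pattern: sort both listings, assert equality, and log the asymmetric differences before failing so mismatches are easy to debug. A compact, framework-free sketch of that comparison (the class and method names are illustrative, not part of Hudi):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class ListingValidator {

    // Returns true when both listings contain exactly the same file names.
    public static boolean listingsMatch(List<String> fsFileNames, List<String> metadataFileNames) {
        List<String> fs = new ArrayList<>(fsFileNames);
        List<String> meta = new ArrayList<>(metadataFileNames);
        Collections.sort(fs);
        Collections.sort(meta);
        if (!fs.equals(meta)) {
            // Report files seen on the file system but missing from metadata, and vice versa.
            fs.stream().filter(f -> !meta.contains(f))
                    .forEach(f -> System.err.println("Missing from metadata: " + f));
            meta.stream().filter(f -> !fs.contains(f))
                    .forEach(f -> System.err.println("Missing from file system: " + f));
            return false;
        }
        return true;
    }
}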

Example 23 with HoodieFileGroup

Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.

From the class TestCleaner, the method testInsertAndCleanByCommits:

/**
 * Test helper for the cleaning-by-commits logic from the HoodieWriteClient API perspective.
 *
 * @param insertFn Insert API to be tested
 * @param upsertFn Upsert API to be tested
 * @param isPreppedAPI Flag to indicate if a prepped-version API is used. If true, a wrapper function will be used during
 *        record generation to also tag the records (de-dupe is implicit as we use unique record-gen APIs)
 * @throws Exception in case of errors
 */
private void testInsertAndCleanByCommits(Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> insertFn, Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> upsertFn, boolean isPreppedAPI) throws Exception {
    // keep up to 3 commits from the past
    int maxCommits = 3;
    HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(maxCommits).build()).withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1).withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()).build();
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction = generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts);
    final Function2<List<HoodieRecord>, String, Integer> recordUpsertGenWrappedFunction = generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateUniqueUpdates);
    insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
    // Keep doing some writes and clean inline. Make sure we have expected number of files remaining.
    makeIncrementalCommitTimes(8).forEach(newCommitTime -> {
        try {
            client.startCommitWithTime(newCommitTime);
            List<HoodieRecord> records = recordUpsertGenWrappedFunction.apply(newCommitTime, 100);
            List<WriteStatus> statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newCommitTime).collect();
            // Verify there are no errors
            assertNoWriteErrors(statuses);
            metaClient = HoodieTableMetaClient.reload(metaClient);
            HoodieTable table1 = HoodieSparkTable.create(cfg, context, metaClient);
            HoodieTimeline activeTimeline = table1.getCompletedCommitsTimeline();
            // NOTE: See CleanPlanner#getFilesToCleanKeepingLatestCommits. We explicitly keep one commit before earliest
            // commit
            Option<HoodieInstant> earliestRetainedCommit = activeTimeline.nthFromLastInstant(maxCommits);
            Set<HoodieInstant> acceptableCommits = activeTimeline.getInstants().collect(Collectors.toSet());
            if (earliestRetainedCommit.isPresent()) {
                acceptableCommits.removeAll(activeTimeline.findInstantsInRange("000", earliestRetainedCommit.get().getTimestamp()).getInstants().collect(Collectors.toSet()));
                acceptableCommits.add(earliestRetainedCommit.get());
            }
            TableFileSystemView fsView = table1.getFileSystemView();
            // Need to ensure the following
            for (String partitionPath : dataGen.getPartitionPaths()) {
                List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList());
                for (HoodieFileGroup fileGroup : fileGroups) {
                    Set<String> commitTimes = new HashSet<>();
                    fileGroup.getAllBaseFiles().forEach(value -> {
                        LOG.debug("Data File - " + value);
                        commitTimes.add(value.getCommitTime());
                    });
                    assertEquals(acceptableCommits.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toSet()), commitTimes, "Only files from acceptable commits should be present");
                }
            }
        } catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }
    });
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) ArrayList(java.util.ArrayList) List(java.util.List) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) WriteStatus(org.apache.hudi.client.WriteStatus) HashSet(java.util.HashSet)
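
The key ingredient in this test is the cleaner configuration: HoodieCleaningPolicy.KEEP_LATEST_COMMITS with retainCommits(3) keeps the data files needed to serve the last three commits (plus one extra commit, per CleanPlanner#getFilesToCleanKeepingLatestCommits). A minimal sketch of just that configuration, mirroring the builder calls used above; it assumes HoodieWriteConfig.newBuilder().withPath(...) is the standard entry point in this Hudi version:

import org.apache.hudi.common.model.HoodieCleaningPolicy;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;

public class CommitRetentionConfigSketch {

    // Hedged sketch: only the compaction/cleaner builder calls are taken from the test above.
    static HoodieWriteConfig cleanByCommitsConfig(String basePath) {
        return HoodieWriteConfig.newBuilder()
                .withPath(basePath)
                .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
                        // Files referenced by the latest 3 commits survive cleaning.
                        .retainCommits(3)
                        .build())
                .build();
    }
}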

Example 24 with HoodieFileGroup

Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.

From the class TestCleaner, the method testInsertAndCleanByVersions:

/**
 * Test helper for the cleaning-by-versions logic from the HoodieWriteClient API perspective.
 *
 * @param insertFn Insert API to be tested
 * @param upsertFn Upsert API to be tested
 * @param isPreppedAPI Flag to indicate if a prepped-version API is used. If true, a wrapper function will be used during
 *        record generation to also tag the records (de-dupe is implicit as we use unique record-gen APIs)
 * @throws Exception in case of errors
 */
private void testInsertAndCleanByVersions(Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> insertFn, Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> upsertFn, boolean isPreppedAPI) throws Exception {
    // keep up to 2 versions for each file
    int maxVersions = 2;
    HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(maxVersions).build()).withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1).withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()).build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction = generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts);
        final Function2<List<HoodieRecord>, String, Integer> recordUpsertGenWrappedFunction = generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateUniqueUpdates);
        insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS);
        Map<HoodieFileGroupId, FileSlice> compactionFileIdToLatestFileSlice = new HashMap<>();
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieTable table = HoodieSparkTable.create(getConfig(), context, metaClient);
        for (String partitionPath : dataGen.getPartitionPaths()) {
            TableFileSystemView fsView = table.getFileSystemView();
            Option<Boolean> added = Option.fromJavaOptional(fsView.getAllFileGroups(partitionPath).findFirst().map(fg -> {
                fg.getLatestFileSlice().map(fs -> compactionFileIdToLatestFileSlice.put(fg.getFileGroupId(), fs));
                return true;
            }));
            if (added.isPresent()) {
                // Select only one file-group for compaction
                break;
            }
        }
        // Create workload with selected file-slices
        List<Pair<String, FileSlice>> partitionFileSlicePairs = compactionFileIdToLatestFileSlice.entrySet().stream().map(e -> Pair.of(e.getKey().getPartitionPath(), e.getValue())).collect(Collectors.toList());
        HoodieCompactionPlan compactionPlan = CompactionUtils.buildFromFileSlices(partitionFileSlicePairs, Option.empty(), Option.empty());
        List<String> instantTimes = makeIncrementalCommitTimes(9, 1, 10);
        String compactionTime = instantTimes.get(0);
        table.getActiveTimeline().saveToCompactionRequested(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionTime), TimelineMetadataUtils.serializeCompactionPlan(compactionPlan));
        instantTimes = instantTimes.subList(1, instantTimes.size());
        // Keep doing some writes and clean inline. Make sure we have the expected number of files remaining.
        for (String newInstantTime : instantTimes) {
            try {
                client.startCommitWithTime(newInstantTime);
                List<HoodieRecord> records = recordUpsertGenWrappedFunction.apply(newInstantTime, 100);
                List<WriteStatus> statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newInstantTime).collect();
                // Verify there are no errors
                assertNoWriteErrors(statuses);
                metaClient = HoodieTableMetaClient.reload(metaClient);
                table = HoodieSparkTable.create(getConfig(), context, metaClient);
                HoodieTimeline timeline = table.getMetaClient().getCommitsTimeline();
                TableFileSystemView fsView = table.getFileSystemView();
                // Need to ensure the following
                for (String partitionPath : dataGen.getPartitionPaths()) {
                    // compute all the versions of all files, from time 0
                    HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>();
                    for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) {
                        HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get(), HoodieCommitMetadata.class);
                        for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) {
                            if (!fileIdToVersions.containsKey(wstat.getFileId())) {
                                fileIdToVersions.put(wstat.getFileId(), new TreeSet<>());
                            }
                            fileIdToVersions.get(wstat.getFileId()).add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName()));
                        }
                    }
                    List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList());
                    for (HoodieFileGroup fileGroup : fileGroups) {
                        if (compactionFileIdToLatestFileSlice.containsKey(fileGroup.getFileGroupId())) {
                            // Ensure latest file-slice selected for compaction is retained
                            Option<HoodieBaseFile> dataFileForCompactionPresent = Option.fromJavaOptional(fileGroup.getAllBaseFiles().filter(df -> {
                                return compactionFileIdToLatestFileSlice.get(fileGroup.getFileGroupId()).getBaseInstantTime().equals(df.getCommitTime());
                            }).findAny());
                            assertTrue(dataFileForCompactionPresent.isPresent(), "Data File selected for compaction is retained");
                        } else {
                            // file has no more than max versions
                            String fileId = fileGroup.getFileGroupId().getFileId();
                            List<HoodieBaseFile> dataFiles = fileGroup.getAllBaseFiles().collect(Collectors.toList());
                            assertTrue(dataFiles.size() <= maxVersions, "fileId " + fileId + " has more than " + maxVersions + " versions");
                            // Each file has the latest N versions (i.e., cleaning gets rid of older versions)
                            List<String> committedVersions = new ArrayList<>(fileIdToVersions.get(fileId));
                            for (int i = 0; i < dataFiles.size(); i++) {
                                assertEquals((dataFiles.get(i)).getCommitTime(), committedVersions.get(committedVersions.size() - 1 - i), "File " + fileId + " does not have the latest versions on commits " + committedVersions);
                            }
                        }
                    }
                }
            } catch (IOException ioe) {
                throw new RuntimeException(ioe);
            }
        }
    }
}
Also used : Arrays(java.util.Arrays) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) CleanPlanner(org.apache.hudi.table.action.clean.CleanPlanner) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) Awaitility.await(org.awaitility.Awaitility.await) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) Set(java.util.Set) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieIndex(org.apache.hudi.index.HoodieIndex) StandardCharsets(java.nio.charset.StandardCharsets) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieClientTestBase(org.apache.hudi.testutils.HoodieClientTestBase) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) HoodieCleaningPolicy(org.apache.hudi.common.model.HoodieCleaningPolicy) CleanPlanMigrator(org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanMigrator) Assertions.assertNull(org.junit.jupiter.api.Assertions.assertNull) Option(org.apache.hudi.common.util.Option) CleanPlanV1MigrationHandler(org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV1MigrationHandler) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) StringUtils(org.apache.hudi.common.util.StringUtils) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) HoodieTestCommitGenerator.getBaseFilename(org.apache.hudi.HoodieTestCommitGenerator.getBaseFilename) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) ValueSource(org.junit.jupiter.params.provider.ValueSource) ConsistencyGuardConfig(org.apache.hudi.common.fs.ConsistencyGuardConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Files(java.nio.file.Files) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) File(java.io.File) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) HoodieClusteringGroup(org.apache.hudi.avro.model.HoodieClusteringGroup) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) Paths(java.nio.file.Paths) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieCleanPartitionMetadata(org.apache.hudi.avro.model.HoodieCleanPartitionMetadata) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) 
CollectionUtils(org.apache.hudi.common.util.CollectionUtils) HoodieTestTable.makeIncrementalCommitTimes(org.apache.hudi.common.testutils.HoodieTestTable.makeIncrementalCommitTimes) HoodieMetadataTestTable(org.apache.hudi.common.testutils.HoodieMetadataTestTable) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) Path(org.apache.hadoop.fs.Path) MethodSource(org.junit.jupiter.params.provider.MethodSource) IOType(org.apache.hudi.common.model.IOType) Predicate(java.util.function.Predicate) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) Tuple3(scala.Tuple3) HoodieClusteringStrategy(org.apache.hudi.avro.model.HoodieClusteringStrategy) Test(org.junit.jupiter.api.Test) List(java.util.List) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) CleanMetadataMigrator(org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataMigrator) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) DEFAULT_PARTITION_PATHS(org.apache.hudi.common.testutils.HoodieTestUtils.DEFAULT_PARTITION_PATHS) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) FileSlice(org.apache.hudi.common.model.FileSlice) HashMap(java.util.HashMap) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HashSet(java.util.HashSet) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) TestBootstrapIndex(org.apache.hudi.common.bootstrap.TestBootstrapIndex) HoodieActionInstant(org.apache.hudi.avro.model.HoodieActionInstant) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) TimeUnit(java.util.concurrent.TimeUnit) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieTestTable.makeNewCommitTime(org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) SparkHoodieIndexFactory(org.apache.hudi.index.SparkHoodieIndexFactory) HoodieSliceInfo(org.apache.hudi.avro.model.HoodieSliceInfo) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HashMap(java.util.HashMap) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ArrayList(java.util.ArrayList) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) List(java.util.List) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) WriteStatus(org.apache.hudi.client.WriteStatus) Pair(org.apache.hudi.common.util.collection.Pair) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) 
Path(org.apache.hadoop.fs.Path) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup)
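
Compared with the previous example, the version-based policy bounds how many base-file versions survive per file group rather than how many commits remain queryable. A sketch of just that builder difference, under the same assumption about HoodieWriteConfig.newBuilder().withPath(...) as before:

import org.apache.hudi.common.model.HoodieCleaningPolicy;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;

public class VersionRetentionConfigSketch {

    // Hedged sketch mirroring the builder calls in testInsertAndCleanByVersions above.
    static HoodieWriteConfig cleanByVersionsConfig(String basePath) {
        return HoodieWriteConfig.newBuilder()
                .withPath(basePath)
                .withCompactionConfig(HoodieCompactionConfig.newBuilder()
                        .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS)
                        // At most 2 versions of each base file are kept per file group.
                        .retainFileVersions(2)
                        .build())
                .build();
    }
}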

Example 25 with HoodieFileGroup

Use of org.apache.hudi.common.model.HoodieFileGroup in project hudi by apache.

From the class HoodieClientRollbackTestBase, the method twoUpsertCommitDataWithTwoPartitions:

protected void twoUpsertCommitDataWithTwoPartitions(List<FileSlice> firstPartitionCommit2FileSlices, List<FileSlice> secondPartitionCommit2FileSlices, HoodieWriteConfig cfg, boolean commitSecondUpsert) throws IOException {
    // just generate two partitions
    dataGen = new HoodieTestDataGenerator(new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH });
    // 1. prepare data
    HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, new String[] { DEFAULT_FIRST_PARTITION_PATH, DEFAULT_SECOND_PARTITION_PATH }, basePath);
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    /**
     * Write 1 (only inserts)
     */
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInsertsContainsAllPartitions(newCommitTime, 2);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime);
    Assertions.assertNoWriteErrors(statuses.collect());
    client.commit(newCommitTime, statuses);
    /**
     * Write 2 (updates)
     */
    newCommitTime = "002";
    client.startCommitWithTime(newCommitTime);
    records = dataGen.generateUpdates(newCommitTime, records);
    statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime);
    Assertions.assertNoWriteErrors(statuses.collect());
    if (commitSecondUpsert) {
        client.commit(newCommitTime, statuses);
    }
    // 2. assert file group and get the first partition file slice
    HoodieTable table = this.getHoodieTable(metaClient, cfg);
    SyncableFileSystemView fsView = getFileSystemViewWithUnCommittedSlices(table.getMetaClient());
    List<HoodieFileGroup> firstPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_FIRST_PARTITION_PATH).collect(Collectors.toList());
    assertEquals(1, firstPartitionCommit2FileGroups.size());
    firstPartitionCommit2FileSlices.addAll(firstPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
    // 3. assert file group and get the second partition file slice
    List<HoodieFileGroup> secondPartitionCommit2FileGroups = fsView.getAllFileGroups(DEFAULT_SECOND_PARTITION_PATH).collect(Collectors.toList());
    assertEquals(1, secondPartitionCommit2FileGroups.size());
    secondPartitionCommit2FileSlices.addAll(secondPartitionCommit2FileGroups.get(0).getAllFileSlices().collect(Collectors.toList()));
    // 4. assert file slice
    HoodieTableType tableType = this.getTableType();
    if (tableType.equals(HoodieTableType.COPY_ON_WRITE)) {
        assertEquals(2, firstPartitionCommit2FileSlices.size());
        assertEquals(2, secondPartitionCommit2FileSlices.size());
    } else {
        assertEquals(1, firstPartitionCommit2FileSlices.size());
        assertEquals(1, secondPartitionCommit2FileSlices.size());
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) SyncableFileSystemView(org.apache.hudi.common.table.view.SyncableFileSystemView) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) WriteStatus(org.apache.hudi.client.WriteStatus)
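
A pattern that recurs across these examples is drilling down from a file-system view into file groups and then file slices. The sketch below walks that hierarchy using only methods that already appear in the snippets above (getAllFileGroups, getAllFileSlices, getBaseFile, getLogFiles); the class name and the use of System.out for output are illustrative:

import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.common.model.HoodieFileGroup;
import org.apache.hudi.common.table.view.TableFileSystemView;

public class FileGroupInspector {

    // Prints every base file and log file reachable from the file groups of a partition.
    static void describePartition(TableFileSystemView fsView, String partition) {
        List<HoodieFileGroup> fileGroups =
                fsView.getAllFileGroups(partition).collect(Collectors.toList());
        for (HoodieFileGroup group : fileGroups) {
            group.getAllFileSlices().forEach(slice -> {
                if (slice.getBaseFile().isPresent()) {
                    System.out.println(group.getFileGroupId() + " base file: " + slice.getBaseFile().get());
                }
                slice.getLogFiles().forEach(logFile ->
                        System.out.println(group.getFileGroupId() + " log file: " + logFile));
            });
        }
    }
}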

Aggregations

HoodieFileGroup (org.apache.hudi.common.model.HoodieFileGroup): 38 usages
FileSlice (org.apache.hudi.common.model.FileSlice): 29 usages
IOException (java.io.IOException): 27 usages
Map (java.util.Map): 27 usages
List (java.util.List): 26 usages
Collectors (java.util.stream.Collectors): 25 usages
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 24 usages
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 22 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 22 usages
Option (org.apache.hudi.common.util.Option): 22 usages
Path (org.apache.hadoop.fs.Path): 21 usages
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 21 usages
ArrayList (java.util.ArrayList): 20 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 19 usages
Pair (org.apache.hudi.common.util.collection.Pair): 19 usages
LogManager (org.apache.log4j.LogManager): 18 usages
Logger (org.apache.log4j.Logger): 18 usages
Set (java.util.Set): 17 usages
Stream (java.util.stream.Stream): 17 usages
FSUtils (org.apache.hudi.common.fs.FSUtils): 17 usages