Search in sources :

Example 6 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class TestHoodieSparkMergeOnReadTableInsertUpdateDelete method testSimpleInsertsGeneratedIntoLogFiles.

@Test
public void testSimpleInsertsGeneratedIntoLogFiles() throws Exception {
    // insert 100 records
    // Setting IndexType to be InMemory to simulate Global Index nature
    HoodieWriteConfig config = getConfigBuilder(false, HoodieIndex.IndexType.INMEMORY).build();
    Properties properties = new Properties();
    properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString());
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
    try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
        String newCommitTime = "100";
        writeClient.startCommitWithTime(newCommitTime);
        HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
        JavaRDD<HoodieRecord> recordsRDD = jsc().parallelize(records, 1);
        JavaRDD<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime);
        writeClient.commit(newCommitTime, statuses);
        HoodieTable table = HoodieSparkTable.create(config, context(), metaClient);
        table.getHoodieView().sync();
        TableFileSystemView.SliceView tableRTFileSystemView = table.getSliceView();
        long numLogFiles = 0;
        for (String partitionPath : dataGen.getPartitionPaths()) {
            List<FileSlice> allSlices = tableRTFileSystemView.getLatestFileSlices(partitionPath).collect(Collectors.toList());
            assertEquals(0, allSlices.stream().filter(fileSlice -> fileSlice.getBaseFile().isPresent()).count());
            assertTrue(allSlices.stream().anyMatch(fileSlice -> fileSlice.getLogFiles().count() > 0));
            long logFileCount = allSlices.stream().filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count();
            if (logFileCount > 0) {
                // check the log versions start from the base version
                assertTrue(allSlices.stream().map(slice -> slice.getLogFiles().findFirst().get().getLogVersion()).allMatch(version -> version.equals(HoodieLogFile.LOGFILE_BASE_VERSION)));
            }
            numLogFiles += logFileCount;
        }
        assertTrue(numLogFiles > 0);
        // Do a compaction
        String instantTime = writeClient.scheduleCompaction(Option.empty()).get().toString();
        HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = writeClient.compact(instantTime);
        String extension = table.getBaseFileExtension();
        Collection<List<HoodieWriteStat>> stats = compactionMetadata.getCommitMetadata().get().getPartitionToWriteStats().values();
        assertEquals(numLogFiles, stats.stream().flatMap(Collection::stream).filter(state -> state.getPath().contains(extension)).count());
        assertEquals(numLogFiles, stats.stream().mapToLong(Collection::size).sum());
        writeClient.commitCompaction(instantTime, compactionMetadata.getCommitMetadata().get(), Option.empty());
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieMergeOnReadTestUtils(org.apache.hudi.testutils.HoodieMergeOnReadTestUtils) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) Tag(org.junit.jupiter.api.Tag) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) MethodSource(org.junit.jupiter.params.provider.MethodSource) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Collection(java.util.Collection) Arguments(org.junit.jupiter.params.provider.Arguments) Collectors(java.util.stream.Collectors) FileCreateUtils(org.apache.hudi.common.testutils.FileCreateUtils) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) JobConf(org.apache.hadoop.mapred.JobConf) Test(org.junit.jupiter.api.Test) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) List(java.util.List) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) Stream(java.util.stream.Stream) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) SparkClientFunctionalTestHarness(org.apache.hudi.testutils.SparkClientFunctionalTestHarness) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieClientTestUtils(org.apache.hudi.testutils.HoodieClientTestUtils) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Properties(java.util.Properties) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTable(org.apache.hudi.table.HoodieTable) Collection(java.util.Collection) List(java.util.List) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 7 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class TestHoodieSparkMergeOnReadTableRollback method testInsertsGeneratedIntoLogFilesRollback.

@ParameterizedTest
@ValueSource(booleans = { true, false })
void testInsertsGeneratedIntoLogFilesRollback(boolean rollbackUsingMarkers) throws Exception {
    Properties properties = new Properties();
    properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString());
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    // insert 100 records
    // Setting IndexType to be InMemory to simulate Global Index nature
    HoodieWriteConfig config = getConfigBuilder(false, rollbackUsingMarkers, HoodieIndex.IndexType.INMEMORY).build();
    try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
        String newCommitTime = "100";
        writeClient.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
        JavaRDD<HoodieRecord> recordsRDD = jsc().parallelize(records, 1);
        // trigger an action
        List<WriteStatus> writeStatuses = ((JavaRDD<WriteStatus>) writeClient.insert(recordsRDD, newCommitTime)).collect();
        // Ensure that inserts are written to only log files
        assertEquals(0, writeStatuses.stream().filter(writeStatus -> !writeStatus.getStat().getPath().contains("log")).count());
        assertTrue(writeStatuses.stream().anyMatch(writeStatus -> writeStatus.getStat().getPath().contains("log")));
        // rollback a failed commit
        boolean rollback = writeClient.rollback(newCommitTime);
        assertTrue(rollback);
        // insert 100 records
        newCommitTime = "101";
        writeClient.startCommitWithTime(newCommitTime);
        records = dataGen.generateInserts(newCommitTime, 100);
        recordsRDD = jsc().parallelize(records, 1);
        writeClient.insert(recordsRDD, newCommitTime).collect();
        // Sleep for small interval (at least 1 second) to force a new rollback start time.
        Thread.sleep(1000);
        // We will test HUDI-204 here. We will simulate rollback happening twice by copying the commit file to local fs
        // and calling rollback twice
        final String lastCommitTime = newCommitTime;
        // Save the .commit file to local directory.
        // Rollback will be called twice to test the case where rollback failed first time and retried.
        // We got the "BaseCommitTime cannot be null" exception before the fix
        java.nio.file.Path tempFolder = Files.createTempDirectory(this.getClass().getCanonicalName());
        Map<String, String> fileNameMap = new HashMap<>();
        for (HoodieInstant.State state : Arrays.asList(HoodieInstant.State.REQUESTED, HoodieInstant.State.INFLIGHT)) {
            HoodieInstant toCopy = new HoodieInstant(state, HoodieTimeline.DELTA_COMMIT_ACTION, lastCommitTime);
            File file = Files.createTempFile(tempFolder, null, null).toFile();
            metaClient.getFs().copyToLocalFile(new Path(metaClient.getMetaPath(), toCopy.getFileName()), new Path(file.getAbsolutePath()));
            fileNameMap.put(file.getAbsolutePath(), toCopy.getFileName());
        }
        Path markerDir = new Path(Files.createTempDirectory(tempFolder, null).toAbsolutePath().toString());
        if (rollbackUsingMarkers) {
            metaClient.getFs().copyToLocalFile(new Path(metaClient.getMarkerFolderPath(lastCommitTime)), markerDir);
        }
        writeClient.rollback(newCommitTime);
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieTable table = HoodieSparkTable.create(config, context());
        TableFileSystemView.SliceView tableRTFileSystemView = table.getSliceView();
        long numLogFiles = 0;
        for (String partitionPath : dataGen.getPartitionPaths()) {
            assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).noneMatch(fileSlice -> fileSlice.getBaseFile().isPresent()));
            assertTrue(tableRTFileSystemView.getLatestFileSlices(partitionPath).noneMatch(fileSlice -> fileSlice.getLogFiles().count() > 0));
            numLogFiles += tableRTFileSystemView.getLatestFileSlices(partitionPath).filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count();
        }
        assertEquals(0, numLogFiles);
        for (Map.Entry<String, String> entry : fileNameMap.entrySet()) {
            try {
                metaClient.getFs().copyFromLocalFile(new Path(entry.getKey()), new Path(metaClient.getMetaPath(), entry.getValue()));
            } catch (IOException e) {
                throw new HoodieIOException("Error copying state from local disk.", e);
            }
        }
        if (rollbackUsingMarkers) {
            metaClient.getFs().copyFromLocalFile(new Path(markerDir, lastCommitTime), new Path(metaClient.getMarkerFolderPath(lastCommitTime)));
        }
        Thread.sleep(1000);
        // Rollback again to pretend the first rollback failed partially. This should not error out
        writeClient.rollback(newCommitTime);
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieMergeOnReadTestUtils(org.apache.hudi.testutils.HoodieMergeOnReadTestUtils) Arrays(java.util.Arrays) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Assertions.assertNotEquals(org.junit.jupiter.api.Assertions.assertNotEquals) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) Map(java.util.Map) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Path(org.apache.hadoop.fs.Path) Assertions.assertAll(org.junit.jupiter.api.Assertions.assertAll) Tag(org.junit.jupiter.api.Tag) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) TRIP_EXAMPLE_SCHEMA(org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) Collection(java.util.Collection) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) MarkerType(org.apache.hudi.common.table.marker.MarkerType) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) SyncableFileSystemView(org.apache.hudi.common.table.view.SyncableFileSystemView) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Files(java.nio.file.Files) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) File(java.io.File) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) SparkClientFunctionalTestHarness(org.apache.hudi.testutils.SparkClientFunctionalTestHarness) HoodieIOException(org.apache.hudi.exception.HoodieIOException) Pair(org.apache.hudi.common.util.collection.Pair) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Properties(java.util.Properties) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) File(java.io.File) Map(java.util.Map) HashMap(java.util.HashMap) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 8 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class TestHoodieSparkMergeOnReadTableRollback method updateRecords.

private List<HoodieRecord> updateRecords(SparkRDDWriteClient client, HoodieTestDataGenerator dataGen, String commitTime, List<HoodieRecord> records, HoodieTableMetaClient metaClient, HoodieWriteConfig cfg, boolean assertLogFiles) throws IOException {
    client.startCommitWithTime(commitTime);
    records = dataGen.generateUpdates(commitTime, records);
    JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
    List<WriteStatus> statuses = client.upsert(writeRecords, commitTime).collect();
    assertNoWriteErrors(statuses);
    if (assertLogFiles) {
        HoodieTable table = HoodieSparkTable.create(cfg, context(), metaClient);
        table.getHoodieView().sync();
        TableFileSystemView.SliceView tableRTFileSystemView = table.getSliceView();
        long numLogFiles = 0;
        for (String partitionPath : dataGen.getPartitionPaths()) {
            List<FileSlice> allSlices = tableRTFileSystemView.getLatestFileSlices(partitionPath).collect(Collectors.toList());
            assertEquals(1, allSlices.stream().filter(fileSlice -> fileSlice.getBaseFile().isPresent()).count());
            assertTrue(allSlices.stream().anyMatch(fileSlice -> fileSlice.getLogFiles().count() > 0));
            numLogFiles += allSlices.stream().filter(fileSlice -> fileSlice.getLogFiles().count() > 0).count();
        }
        assertTrue(numLogFiles > 0);
    }
    return records;
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieMergeOnReadTestUtils(org.apache.hudi.testutils.HoodieMergeOnReadTestUtils) Arrays(java.util.Arrays) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Assertions.assertNotEquals(org.junit.jupiter.api.Assertions.assertNotEquals) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) Map(java.util.Map) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Path(org.apache.hadoop.fs.Path) Assertions.assertAll(org.junit.jupiter.api.Assertions.assertAll) Tag(org.junit.jupiter.api.Tag) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) TRIP_EXAMPLE_SCHEMA(org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) Collection(java.util.Collection) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) MarkerType(org.apache.hudi.common.table.marker.MarkerType) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) SyncableFileSystemView(org.apache.hudi.common.table.view.SyncableFileSystemView) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Files(java.nio.file.Files) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) File(java.io.File) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) SparkClientFunctionalTestHarness(org.apache.hudi.testutils.SparkClientFunctionalTestHarness) HoodieIOException(org.apache.hudi.exception.HoodieIOException) Pair(org.apache.hudi.common.util.collection.Pair) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTable(org.apache.hudi.table.HoodieTable) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) WriteStatus(org.apache.hudi.client.WriteStatus)

Example 9 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class CompactionTestBase method executeCompactionWithReplacedFiles.

protected void executeCompactionWithReplacedFiles(String compactionInstantTime, SparkRDDWriteClient client, HoodieTable table, HoodieWriteConfig cfg, String[] partitions, Set<HoodieFileGroupId> replacedFileIds) throws IOException {
    client.compact(compactionInstantTime);
    List<FileSlice> fileSliceList = getCurrentLatestFileSlices(table);
    assertTrue(fileSliceList.stream().findAny().isPresent(), "Ensure latest file-slices are not empty");
    assertFalse(fileSliceList.stream().anyMatch(fs -> replacedFileIds.contains(fs.getFileGroupId())), "Compacted files should not show up in latest slices");
    // verify that there is a commit
    table = getHoodieTable(HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).setLoadActiveTimelineOnLoad(true).build(), cfg);
    HoodieTimeline timeline = table.getMetaClient().getCommitTimeline().filterCompletedInstants();
    // verify compaction commit is visible in timeline
    assertTrue(timeline.filterCompletedInstants().getInstants().filter(instant -> compactionInstantTime.equals(instant.getTimestamp())).findFirst().isPresent());
    for (String partition : partitions) {
        table.getSliceView().getLatestFileSlicesBeforeOrOn(partition, compactionInstantTime, true).forEach(fs -> {
            // verify that all log files are merged
            assertEquals(0, fs.getLogFiles().count());
            assertTrue(fs.getBaseFile().isPresent());
        });
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieClientTestBase(org.apache.hudi.testutils.HoodieClientTestBase) Arrays(java.util.Arrays) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieReadClient(org.apache.hudi.client.HoodieReadClient) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) FileSystemViewStorageType(org.apache.hudi.common.table.view.FileSystemViewStorageType) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) WriteMarkersFactory(org.apache.hudi.table.marker.WriteMarkersFactory) TRIP_EXAMPLE_SCHEMA(org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieClientTestUtils(org.apache.hudi.testutils.HoodieClientTestUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline)

Example 10 with FileSlice

use of org.apache.hudi.common.model.FileSlice in project hudi by apache.

the class TestHoodieCompactor method testWriteStatusContentsAfterCompaction.

@Test
public void testWriteStatusContentsAfterCompaction() throws Exception {
    // insert 100 records
    HoodieWriteConfig config = getConfigBuilder().withCompactionConfig(HoodieCompactionConfig.newBuilder().withMaxNumDeltaCommitsBeforeCompaction(1).build()).build();
    try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
        String newCommitTime = "100";
        writeClient.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
        JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
        writeClient.insert(recordsRDD, newCommitTime).collect();
        // Update all the 100 records
        HoodieTable table = HoodieSparkTable.create(config, context);
        newCommitTime = "101";
        List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
        JavaRDD<HoodieRecord> updatedRecordsRDD = jsc.parallelize(updatedRecords, 1);
        HoodieIndex index = new HoodieBloomIndex(config, SparkHoodieBloomIndexHelper.getInstance());
        JavaRDD<HoodieRecord> updatedTaggedRecordsRDD = tagLocation(index, updatedRecordsRDD, table);
        writeClient.startCommitWithTime(newCommitTime);
        writeClient.upsertPreppedRecords(updatedTaggedRecordsRDD, newCommitTime).collect();
        metaClient.reloadActiveTimeline();
        // Verify that all data file has one log file
        table = HoodieSparkTable.create(config, context);
        for (String partitionPath : dataGen.getPartitionPaths()) {
            List<FileSlice> groupedLogFiles = table.getSliceView().getLatestFileSlices(partitionPath).collect(Collectors.toList());
            for (FileSlice fileSlice : groupedLogFiles) {
                assertEquals(1, fileSlice.getLogFiles().count(), "There should be 1 log file written for every data file");
            }
        }
        // Do a compaction
        table = HoodieSparkTable.create(config, context);
        String compactionInstantTime = "102";
        table.scheduleCompaction(context, compactionInstantTime, Option.empty());
        table.getMetaClient().reloadActiveTimeline();
        HoodieData<WriteStatus> result = (HoodieData<WriteStatus>) table.compact(context, compactionInstantTime).getWriteStatuses();
        // Verify that all partition paths are present in the WriteStatus result
        for (String partitionPath : dataGen.getPartitionPaths()) {
            List<WriteStatus> writeStatuses = result.collectAsList();
            assertTrue(writeStatuses.stream().filter(writeStatus -> writeStatus.getStat().getPartitionPath().contentEquals(partitionPath)).count() > 0);
        }
    }
}
Also used : HoodieData(org.apache.hudi.common.data.HoodieData) HoodieTable(org.apache.hudi.table.HoodieTable) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Option(org.apache.hudi.common.util.Option) SparkHoodieBloomIndexHelper(org.apache.hudi.index.bloom.SparkHoodieBloomIndexHelper) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HoodieBloomIndex(org.apache.hudi.index.bloom.HoodieBloomIndex) HoodieClientTestHarness(org.apache.hudi.testutils.HoodieClientTestHarness) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) HoodieMemoryConfig(org.apache.hudi.config.HoodieMemoryConfig) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieNotSupportedException(org.apache.hudi.exception.HoodieNotSupportedException) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) Test(org.junit.jupiter.api.Test) WriteStatus(org.apache.hudi.client.WriteStatus) AfterEach(org.junit.jupiter.api.AfterEach) List(java.util.List) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) FSUtils(org.apache.hudi.common.fs.FSUtils) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieBloomIndex(org.apache.hudi.index.bloom.HoodieBloomIndex) HoodieTable(org.apache.hudi.table.HoodieTable) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test)

Aggregations

FileSlice (org.apache.hudi.common.model.FileSlice)87 List (java.util.List)51 ArrayList (java.util.ArrayList)45 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)45 Map (java.util.Map)44 Collectors (java.util.stream.Collectors)43 IOException (java.io.IOException)39 Path (org.apache.hadoop.fs.Path)39 HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile)39 HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)38 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)38 Option (org.apache.hudi.common.util.Option)37 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)36 Pair (org.apache.hudi.common.util.collection.Pair)35 HashMap (java.util.HashMap)32 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)32 FSUtils (org.apache.hudi.common.fs.FSUtils)29 Stream (java.util.stream.Stream)28 Test (org.junit.jupiter.api.Test)27 HoodieFileGroup (org.apache.hudi.common.model.HoodieFileGroup)26