Example 61 with HoodieLogFile

Use of org.apache.hudi.common.model.HoodieLogFile in project hudi by apache.

The class TestHoodieLogFormat, method testBasicAppendAndTraverseInReverse.

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testBasicAppendAndTraverseInReverse(boolean readBlocksLazily) throws IOException, URISyntaxException, InterruptedException {
    Writer writer = HoodieLogFormat.newWriterBuilder()
        .onParentPath(partitionPath)
        .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
        .withFileId("test-fileid1")
        .overBaseCommit("100")
        .withFs(fs)
        .build();
    Schema schema = getSimpleSchema();
    List<IndexedRecord> records1 = SchemaTestUtil.generateTestRecords(0, 100);
    List<IndexedRecord> copyOfRecords1 = records1.stream().map(record -> HoodieAvroUtils.rewriteRecord((GenericRecord) record, schema)).collect(Collectors.toList());
    Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
    HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records1, header);
    writer.appendBlock(dataBlock);
    writer.close();
    writer = HoodieLogFormat.newWriterBuilder()
        .onParentPath(partitionPath)
        .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
        .withFileId("test-fileid1")
        .overBaseCommit("100")
        .withFs(fs)
        .build();
    List<IndexedRecord> records2 = SchemaTestUtil.generateTestRecords(0, 100);
    dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2, header);
    writer.appendBlock(dataBlock);
    writer.close();
    // Close and open again and append 100 more records
    writer = HoodieLogFormat.newWriterBuilder()
        .onParentPath(partitionPath)
        .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
        .withFileId("test-fileid1")
        .overBaseCommit("100")
        .withFs(fs)
        .build();
    List<IndexedRecord> records3 = SchemaTestUtil.generateTestRecords(0, 100);
    dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records3, header);
    writer.appendBlock(dataBlock);
    writer.close();
    FileCreateUtils.createDeltaCommit(basePath, "100", fs);
    HoodieLogFileReader reader = new HoodieLogFileReader(fs,
        new HoodieLogFile(writer.getLogFile().getPath(), fs.getFileStatus(writer.getLogFile().getPath()).getLen()),
        SchemaTestUtil.getSimpleSchema(), bufferSize, readBlocksLazily, true);
    assertTrue(reader.hasPrev(), "Third block should be available");
    reader.moveToPrev();
    assertTrue(reader.hasPrev(), "Second block should be available");
    reader.moveToPrev();
    // After moving twice, this last reader.prev() should read the First block written
    assertTrue(reader.hasPrev(), "First block should be available");
    HoodieLogBlock prevBlock = reader.prev();
    HoodieDataBlock dataBlockRead = (HoodieDataBlock) prevBlock;
    List<IndexedRecord> recordsRead = getRecords(dataBlockRead);
    assertEquals(copyOfRecords1.size(), recordsRead.size(), "Read records size should be equal to the written records size");
    assertEquals(copyOfRecords1, recordsRead, "Both records lists should be the same. (ordering guaranteed)");
    assertFalse(reader.hasPrev());
    reader.close();
}
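Log file names produced by the writer above encode the file id, base commit, version, and write token. A minimal standalone sketch of reading those attributes back via HoodieLogFile, assuming the standard ".<fileId>_<baseCommit><extension>.<version>_<writeToken>" layout and the usual accessors (the path below is illustrative):

import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieLogFile;

public class LogFileNameRoundTrip {
    public static void main(String[] args) {
        // Illustrative path following Hudi's log-file naming convention
        Path logPath = new Path("/tmp/2016/05/01/.test-fileid1_100.log.1_1-0-1");
        HoodieLogFile logFile = new HoodieLogFile(logPath);
        System.out.println(logFile.getFileId());         // test-fileid1
        System.out.println(logFile.getBaseCommitTime()); // 100
        System.out.println(logFile.getLogVersion());     // 1
    }
}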

Example 62 with HoodieLogFile

Use of org.apache.hudi.common.model.HoodieLogFile in project hudi by apache.

The class TestHoodieTableFileSystemView, method testPendingCompactionWithDuplicateFileIdsAcrossPartitions.

@Test
public void testPendingCompactionWithDuplicateFileIdsAcrossPartitions() throws Exception {
    // Put some files in the partition
    String partitionPath1 = "2016/05/01";
    String partitionPath2 = "2016/05/02";
    String partitionPath3 = "2016/05/03";
    String fullPartitionPath1 = basePath + "/" + partitionPath1 + "/";
    new File(fullPartitionPath1).mkdirs();
    String fullPartitionPath2 = basePath + "/" + partitionPath2 + "/";
    new File(fullPartitionPath2).mkdirs();
    String fullPartitionPath3 = basePath + "/" + partitionPath3 + "/";
    new File(fullPartitionPath3).mkdirs();
    String instantTime1 = "1";
    String deltaInstantTime1 = "2";
    String deltaInstantTime2 = "3";
    String fileId = UUID.randomUUID().toString();
    String dataFileName = FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId);
    new File(fullPartitionPath1 + dataFileName).createNewFile();
    String fileName1 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 0, TEST_WRITE_TOKEN);
    new File(fullPartitionPath1 + fileName1).createNewFile();
    new File(fullPartitionPath2 + FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId)).createNewFile();
    new File(fullPartitionPath2 + fileName1).createNewFile();
    new File(fullPartitionPath3 + FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId)).createNewFile();
    new File(fullPartitionPath3 + fileName1).createNewFile();
    HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline();
    HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, instantTime1);
    HoodieInstant deltaInstant2 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime1);
    HoodieInstant deltaInstant3 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime2);
    saveAsComplete(commitTimeline, instant1, Option.empty());
    saveAsComplete(commitTimeline, deltaInstant2, Option.empty());
    saveAsComplete(commitTimeline, deltaInstant3, Option.empty());
    // Now we list all partitions
    FileStatus[] statuses = metaClient.getFs().listStatus(
        new Path[] { new Path(fullPartitionPath1), new Path(fullPartitionPath2), new Path(fullPartitionPath3) });
    assertEquals(6, statuses.length);
    refreshFsView();
    // Load all partitions into the file-system view
    Arrays.asList(partitionPath1, partitionPath2, partitionPath3).forEach(p -> fsView.getAllFileGroups(p).count());
    List<HoodieFileGroup> groups = Stream.of(partitionPath1, partitionPath2, partitionPath3).flatMap(p -> fsView.getAllFileGroups(p)).collect(Collectors.toList());
    assertEquals(3, groups.size(), "Expected number of file-groups");
    assertEquals(3, groups.stream().map(HoodieFileGroup::getPartitionPath).collect(Collectors.toSet()).size(), "Partitions must be different for file-groups");
    Set<String> fileIds = groups.stream().map(HoodieFileGroup::getFileGroupId).map(HoodieFileGroupId::getFileId).collect(Collectors.toSet());
    assertEquals(1, fileIds.size(), "File Id must be same");
    assertTrue(fileIds.contains(fileId), "Expected FileId");
    // Setup Pending compaction for all of these fileIds.
    List<Pair<String, FileSlice>> partitionFileSlicesPairs = new ArrayList<>();
    List<FileSlice> fileSlices = rtView.getLatestFileSlices(partitionPath1).collect(Collectors.toList());
    partitionFileSlicesPairs.add(Pair.of(partitionPath1, fileSlices.get(0)));
    fileSlices = rtView.getLatestFileSlices(partitionPath2).collect(Collectors.toList());
    partitionFileSlicesPairs.add(Pair.of(partitionPath2, fileSlices.get(0)));
    fileSlices = rtView.getLatestFileSlices(partitionPath3).collect(Collectors.toList());
    partitionFileSlicesPairs.add(Pair.of(partitionPath3, fileSlices.get(0)));
    String compactionRequestedTime = "4";
    String compactDataFileName = FSUtils.makeDataFileName(compactionRequestedTime, TEST_WRITE_TOKEN, fileId);
    HoodieCompactionPlan compactionPlan = CompactionUtils.buildFromFileSlices(partitionFileSlicesPairs, Option.empty(), Option.empty());
    // Create a Data-file for some of the partitions but this should be skipped by view
    new File(basePath + "/" + partitionPath1 + "/" + compactDataFileName).createNewFile();
    new File(basePath + "/" + partitionPath2 + "/" + compactDataFileName).createNewFile();
    HoodieInstant compactionInstant = new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionRequestedTime);
    HoodieInstant requested = HoodieTimeline.getCompactionRequestedInstant(compactionInstant.getTimestamp());
    metaClient.getActiveTimeline().saveToCompactionRequested(requested, TimelineMetadataUtils.serializeCompactionPlan(compactionPlan));
    metaClient.getActiveTimeline().transitionCompactionRequestedToInflight(requested);
    // Fake delta-ingestion after compaction-requested
    String deltaInstantTime4 = "5";
    String deltaInstantTime5 = "6";
    String fileName3 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, compactionRequestedTime, 0, TEST_WRITE_TOKEN);
    String fileName4 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, compactionRequestedTime, 1, TEST_WRITE_TOKEN);
    new File(basePath + "/" + partitionPath1 + "/" + fileName3).createNewFile();
    new File(basePath + "/" + partitionPath1 + "/" + fileName4).createNewFile();
    new File(basePath + "/" + partitionPath2 + "/" + fileName3).createNewFile();
    new File(basePath + "/" + partitionPath2 + "/" + fileName4).createNewFile();
    new File(basePath + "/" + partitionPath3 + "/" + fileName3).createNewFile();
    new File(basePath + "/" + partitionPath3 + "/" + fileName4).createNewFile();
    HoodieInstant deltaInstant4 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime4);
    HoodieInstant deltaInstant5 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime5);
    saveAsComplete(commitTimeline, deltaInstant4, Option.empty());
    saveAsComplete(commitTimeline, deltaInstant5, Option.empty());
    refreshFsView();
    // Test Data Files
    List<HoodieBaseFile> dataFiles = roView.getAllBaseFiles(partitionPath1).collect(Collectors.toList());
    assertEquals(1, dataFiles.size(), "One data-file is expected as there is only one file-group");
    assertEquals("1", dataFiles.get(0).getCommitTime(), "Expect only valid commit");
    dataFiles = roView.getAllBaseFiles(partitionPath2).collect(Collectors.toList());
    assertEquals(1, dataFiles.size(), "One data-file is expected as there is only one file-group");
    assertEquals("1", dataFiles.get(0).getCommitTime(), "Expect only valid commit");
    // Merge API Tests
    Arrays.asList(partitionPath1, partitionPath2, partitionPath3).forEach(partitionPath -> {
        List<FileSlice> fileSliceList = rtView.getLatestMergedFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5).collect(Collectors.toList());
        assertEquals(1, fileSliceList.size(), "Expect file-slice to be merged");
        FileSlice fileSlice = fileSliceList.get(0);
        assertEquals(fileId, fileSlice.getFileId());
        assertEquals(dataFileName, fileSlice.getBaseFile().get().getFileName(), "Data file must be present");
        assertEquals(instantTime1, fileSlice.getBaseInstantTime(), "Base Instant of penultimate file-slice must be base instant");
        List<HoodieLogFile> logFiles = fileSlice.getLogFiles().collect(Collectors.toList());
        assertEquals(3, logFiles.size(), "Log files must include those after compaction request");
        assertEquals(fileName4, logFiles.get(0).getFileName(), "Log File Order check");
        assertEquals(fileName3, logFiles.get(1).getFileName(), "Log File Order check");
        assertEquals(fileName1, logFiles.get(2).getFileName(), "Log File Order check");
        fileSliceList = rtView.getLatestFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5, true).collect(Collectors.toList());
        assertEquals(1, fileSliceList.size(), "Expect only one file-id");
        fileSlice = fileSliceList.get(0);
        assertEquals(fileId, fileSlice.getFileId());
        assertFalse(fileSlice.getBaseFile().isPresent(), "No data-file expected in latest file-slice");
        assertEquals(compactionRequestedTime, fileSlice.getBaseInstantTime(), "Compaction requested instant must be base instant");
        logFiles = fileSlice.getLogFiles().collect(Collectors.toList());
        assertEquals(2, logFiles.size(), "Log files must include only those after compaction request");
        assertEquals(fileName4, logFiles.get(0).getFileName(), "Log File Order check");
        assertEquals(fileName3, logFiles.get(1).getFileName(), "Log File Order check");
        // Check getLatestFileSlicesBeforeOrOn excluding fileIds in pending compaction
        fileSliceList = rtView.getLatestFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5, false).collect(Collectors.toList());
        assertEquals(0, fileSliceList.size(), "Expect empty list as file-id is in pending compaction");
    });
    assertEquals(3, fsView.getPendingCompactionOperations().count());
    Set<String> partitionsInCompaction = fsView.getPendingCompactionOperations().map(Pair::getValue).map(CompactionOperation::getPartitionPath).collect(Collectors.toSet());
    assertEquals(3, partitionsInCompaction.size());
    assertTrue(partitionsInCompaction.contains(partitionPath1));
    assertTrue(partitionsInCompaction.contains(partitionPath2));
    assertTrue(partitionsInCompaction.contains(partitionPath3));
    Set<String> fileIdsInCompaction = fsView.getPendingCompactionOperations().map(Pair::getValue).map(CompactionOperation::getFileId).collect(Collectors.toSet());
    assertEquals(1, fileIdsInCompaction.size());
    assertTrue(fileIdsInCompaction.contains(fileId));
}
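The "Log File Order check" assertions above depend on FileSlice.getLogFiles() returning log files newest-first. A small standalone sketch of that ordering, assuming HoodieLogFile's String constructor and reverse comparator (file names are illustrative but follow the same convention the test builds with FSUtils.makeLogFileName):

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hudi.common.model.HoodieLogFile;

public class LogFileOrderingSketch {
    public static void main(String[] args) {
        List<HoodieLogFile> logFiles = Arrays.asList(
            new HoodieLogFile("/tmp/p/.fileid_1.log.0_1-0-1"),  // base instant 1, version 0
            new HoodieLogFile("/tmp/p/.fileid_2.log.0_1-0-1"),  // base instant 2, version 0
            new HoodieLogFile("/tmp/p/.fileid_2.log.1_1-0-1")); // base instant 2, version 1
        // Newest first: later base instant wins, then higher log version
        List<String> newestFirst = logFiles.stream()
            .sorted(HoodieLogFile.getReverseLogFileComparator())
            .map(HoodieLogFile::getFileName)
            .collect(Collectors.toList());
        newestFirst.forEach(System.out::println);
    }
}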

Example 63 with HoodieLogFile

Use of org.apache.hudi.common.model.HoodieLogFile in project hudi by apache.

The class TestHoodieTableFileSystemView, method testViewForFileSlicesWithAsyncCompaction.

/**
 * Helper method to test Views in the presence of concurrent compaction.
 *
 * @param skipCreatingDataFile if set, first File Slice will not have data-file set. This would simulate inserts going
 *        directly to log files
 * @param isCompactionInFlight if set, compaction was inflight (running) when view was tested first time, otherwise
 *        compaction was in requested state
 * @param expTotalFileSlices Total number of file-slices across file-groups in the partition path
 * @param expTotalDataFiles Total number of data-files across file-groups in the partition path
 * @param includeInvalidAndInflight Whether view includes inflight and invalid file-groups.
 * @param testBootstrap if set, enable bootstrap and verify the bootstrap base-file mappings
 * @throws Exception if test setup or assertions fail
 */
protected void testViewForFileSlicesWithAsyncCompaction(boolean skipCreatingDataFile, boolean isCompactionInFlight,
        int expTotalFileSlices, int expTotalDataFiles, boolean includeInvalidAndInflight, boolean testBootstrap)
        throws Exception {
    if (testBootstrap) {
        metaClient = HoodieTestUtils.init(tempDir.toAbsolutePath().toString(), getTableType(), BOOTSTRAP_SOURCE_PATH, testBootstrap);
    }
    String partitionPath = "2016/05/01";
    new File(basePath + "/" + partitionPath).mkdirs();
    String fileId = UUID.randomUUID().toString();
    String srcName = "part_0000" + metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
    HoodieFileStatus srcFileStatus = HoodieFileStatus.newBuilder()
        .setPath(HoodiePath.newBuilder().setUri(BOOTSTRAP_SOURCE_PATH + partitionPath + "/" + srcName).build())
        .setLength(256 * 1024 * 1024L)
        .setAccessTime(new Date().getTime())
        .setModificationTime(new Date().getTime() + 99999)
        .setBlockReplication(2)
        .setOwner("hudi")
        .setGroup("hudi")
        .setBlockSize(128 * 1024 * 1024L)
        .setPermission(HoodieFSPermission.newBuilder()
            .setUserAction(FsAction.ALL.name())
            .setGroupAction(FsAction.READ.name())
            .setOtherAction(FsAction.NONE.name())
            .setStickyBit(true)
            .build())
        .build();
    // if skipCreatingDataFile, then instantTime1 below acts like delta-commit, otherwise it is base-commit
    String instantTime1 = testBootstrap && !skipCreatingDataFile ? HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS : "1";
    String deltaInstantTime1 = "2";
    String deltaInstantTime2 = "3";
    String dataFileName = null;
    if (!skipCreatingDataFile) {
        dataFileName = FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId);
        new File(basePath + "/" + partitionPath + "/" + dataFileName).createNewFile();
    }
    String fileName1 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 0, TEST_WRITE_TOKEN);
    String fileName2 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 1, TEST_WRITE_TOKEN);
    new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile();
    new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile();
    HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline();
    HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, instantTime1);
    HoodieInstant deltaInstant2 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime1);
    HoodieInstant deltaInstant3 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime2);
    if (testBootstrap && !skipCreatingDataFile) {
        try (IndexWriter writer = new HFileBootstrapIndex(metaClient).createWriter(BOOTSTRAP_SOURCE_PATH)) {
            writer.begin();
            BootstrapFileMapping mapping = new BootstrapFileMapping(BOOTSTRAP_SOURCE_PATH, partitionPath, partitionPath, srcFileStatus, fileId);
            List<BootstrapFileMapping> b = new ArrayList<>();
            b.add(mapping);
            writer.appendNextPartition(partitionPath, b);
            writer.finish();
        }
    }
    saveAsComplete(commitTimeline, instant1, Option.empty());
    saveAsComplete(commitTimeline, deltaInstant2, Option.empty());
    saveAsComplete(commitTimeline, deltaInstant3, Option.empty());
    refreshFsView();
    List<FileSlice> fileSlices = rtView.getLatestFileSlices(partitionPath).collect(Collectors.toList());
    assertEquals(1, fileSlices.size());
    FileSlice fileSlice = fileSlices.get(0);
    assertEquals(instantTime1, fileSlice.getBaseInstantTime());
    if (!skipCreatingDataFile) {
        assertTrue(fileSlice.getBaseFile().isPresent());
        checkExternalFile(srcFileStatus, fileSlice.getBaseFile().get().getBootstrapBaseFile(), testBootstrap);
    }
    String compactionRequestedTime = "4";
    String compactDataFileName = FSUtils.makeDataFileName(compactionRequestedTime, TEST_WRITE_TOKEN, fileId);
    List<Pair<String, FileSlice>> partitionFileSlicesPairs = new ArrayList<>();
    partitionFileSlicesPairs.add(Pair.of(partitionPath, fileSlices.get(0)));
    HoodieCompactionPlan compactionPlan = CompactionUtils.buildFromFileSlices(partitionFileSlicesPairs, Option.empty(), Option.empty());
    HoodieInstant compactionInstant;
    if (isCompactionInFlight) {
        // Create a Data-file but this should be skipped by view
        new File(basePath + "/" + partitionPath + "/" + compactDataFileName).createNewFile();
        compactionInstant = new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionRequestedTime);
        HoodieInstant requested = HoodieTimeline.getCompactionRequestedInstant(compactionInstant.getTimestamp());
        commitTimeline.saveToCompactionRequested(requested, TimelineMetadataUtils.serializeCompactionPlan(compactionPlan));
        commitTimeline.transitionCompactionRequestedToInflight(requested);
    } else {
        compactionInstant = new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionRequestedTime);
        commitTimeline.saveToCompactionRequested(compactionInstant, TimelineMetadataUtils.serializeCompactionPlan(compactionPlan));
    }
    // View immediately after scheduling compaction
    refreshFsView();
    List<FileSlice> slices = rtView.getLatestFileSlices(partitionPath).collect(Collectors.toList());
    assertEquals(1, slices.size(), "Expected latest file-slices");
    assertEquals(compactionRequestedTime, slices.get(0).getBaseInstantTime(), "Base-Instant must be compaction Instant");
    assertFalse(slices.get(0).getBaseFile().isPresent(), "Latest File Slice must not have data-file");
    assertEquals(0, slices.get(0).getLogFiles().count(), "Latest File Slice must not have any log-files");
    // Fake delta-ingestion after compaction-requested
    String deltaInstantTime4 = "5";
    String deltaInstantTime5 = "6";
    List<String> allInstantTimes = Arrays.asList(instantTime1, deltaInstantTime1, deltaInstantTime2, compactionRequestedTime, deltaInstantTime4, deltaInstantTime5);
    String fileName3 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, compactionRequestedTime, 0, TEST_WRITE_TOKEN);
    String fileName4 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, compactionRequestedTime, 1, TEST_WRITE_TOKEN);
    new File(basePath + "/" + partitionPath + "/" + fileName3).createNewFile();
    new File(basePath + "/" + partitionPath + "/" + fileName4).createNewFile();
    HoodieInstant deltaInstant4 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime4);
    HoodieInstant deltaInstant5 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime5);
    saveAsComplete(commitTimeline, deltaInstant4, Option.empty());
    saveAsComplete(commitTimeline, deltaInstant5, Option.empty());
    refreshFsView();
    List<HoodieBaseFile> dataFiles = roView.getAllBaseFiles(partitionPath).collect(Collectors.toList());
    if (skipCreatingDataFile) {
        assertTrue(dataFiles.isEmpty(), "No data file expected");
    } else {
        assertEquals(1, dataFiles.size(), "One data-file is expected as there is only one file-group");
        assertEquals(dataFileName, dataFiles.get(0).getFileName(), "Expect only valid data-file");
    }
    // Merge API Tests
    List<FileSlice> fileSliceList = rtView.getLatestMergedFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5).collect(Collectors.toList());
    assertEquals(1, fileSliceList.size(), "Expect file-slice to be merged");
    fileSlice = fileSliceList.get(0);
    assertEquals(fileId, fileSlice.getFileId());
    if (!skipCreatingDataFile) {
        assertEquals(dataFileName, fileSlice.getBaseFile().get().getFileName(), "Data file must be present");
        checkExternalFile(srcFileStatus, fileSlice.getBaseFile().get().getBootstrapBaseFile(), testBootstrap);
    } else {
        assertFalse(fileSlice.getBaseFile().isPresent(), "No data-file expected as it was not created");
    }
    assertEquals(instantTime1, fileSlice.getBaseInstantTime(), "Base Instant of penultimate file-slice must be base instant");
    List<HoodieLogFile> logFiles = fileSlice.getLogFiles().collect(Collectors.toList());
    assertEquals(4, logFiles.size(), "Log files must include those after compaction request");
    assertEquals(fileName4, logFiles.get(0).getFileName(), "Log File Order check");
    assertEquals(fileName3, logFiles.get(1).getFileName(), "Log File Order check");
    assertEquals(fileName2, logFiles.get(2).getFileName(), "Log File Order check");
    assertEquals(fileName1, logFiles.get(3).getFileName(), "Log File Order check");
    fileSliceList = rtView.getLatestFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5, true).collect(Collectors.toList());
    assertEquals(1, fileSliceList.size(), "Expect only one file-id");
    fileSlice = fileSliceList.get(0);
    assertEquals(fileId, fileSlice.getFileId());
    assertFalse(fileSlice.getBaseFile().isPresent(), "No data-file expected in latest file-slice");
    assertEquals(compactionRequestedTime, fileSlice.getBaseInstantTime(), "Compaction requested instant must be base instant");
    logFiles = fileSlice.getLogFiles().collect(Collectors.toList());
    assertEquals(2, logFiles.size(), "Log files must include only those after compaction request");
    assertEquals(fileName4, logFiles.get(0).getFileName(), "Log File Order check");
    assertEquals(fileName3, logFiles.get(1).getFileName(), "Log File Order check");
    // Data Files API tests
    dataFiles = roView.getLatestBaseFiles().collect(Collectors.toList());
    if (skipCreatingDataFile) {
        assertEquals(0, dataFiles.size(), "Expect no data file to be returned");
    } else {
        assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
        dataFiles.forEach(df -> assertEquals(df.getCommitTime(), instantTime1, "Expect data-file for instant 1 be returned"));
        checkExternalFile(srcFileStatus, dataFiles.get(0).getBootstrapBaseFile(), testBootstrap);
    }
    dataFiles = roView.getLatestBaseFiles(partitionPath).collect(Collectors.toList());
    if (skipCreatingDataFile) {
        assertEquals(0, dataFiles.size(), "Expect no data file to be returned");
    } else {
        assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
        dataFiles.forEach(df -> assertEquals(df.getCommitTime(), instantTime1, "Expect data-file for instant 1 be returned"));
        checkExternalFile(srcFileStatus, dataFiles.get(0).getBootstrapBaseFile(), testBootstrap);
    }
    dataFiles = roView.getLatestBaseFilesBeforeOrOn(partitionPath, deltaInstantTime5).collect(Collectors.toList());
    if (skipCreatingDataFile) {
        assertEquals(0, dataFiles.size(), "Expect no data file to be returned");
    } else {
        assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
        dataFiles.forEach(df -> assertEquals(df.getCommitTime(), instantTime1, "Expect data-file for instant 1 be returned"));
        checkExternalFile(srcFileStatus, dataFiles.get(0).getBootstrapBaseFile(), testBootstrap);
    }
    dataFiles = roView.getLatestBaseFilesInRange(allInstantTimes).collect(Collectors.toList());
    if (skipCreatingDataFile) {
        assertEquals(0, dataFiles.size(), "Expect no data file to be returned");
    } else {
        assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
        dataFiles.forEach(df -> assertEquals(df.getCommitTime(), instantTime1, "Expect data-file for instant 1 be returned"));
        checkExternalFile(srcFileStatus, dataFiles.get(0).getBootstrapBaseFile(), testBootstrap);
    }
    // Inflight/Orphan file-groups need to be in the view
    // There is a data-file with this inflight file-id
    final String inflightFileId1 = UUID.randomUUID().toString();
    // There is a log-file with this inflight file-id
    final String inflightFileId2 = UUID.randomUUID().toString();
    // There is an orphan data file with this file-id
    final String orphanFileId1 = UUID.randomUUID().toString();
    // There is an orphan log data file with this file-id
    final String orphanFileId2 = UUID.randomUUID().toString();
    final String invalidInstantId = "INVALIDTIME";
    String inflightDeltaInstantTime = "7";
    String orphanDataFileName = FSUtils.makeDataFileName(invalidInstantId, TEST_WRITE_TOKEN, orphanFileId1);
    new File(basePath + "/" + partitionPath + "/" + orphanDataFileName).createNewFile();
    String orphanLogFileName = FSUtils.makeLogFileName(orphanFileId2, HoodieLogFile.DELTA_EXTENSION, invalidInstantId, 0, TEST_WRITE_TOKEN);
    new File(basePath + "/" + partitionPath + "/" + orphanLogFileName).createNewFile();
    String inflightDataFileName = FSUtils.makeDataFileName(inflightDeltaInstantTime, TEST_WRITE_TOKEN, inflightFileId1);
    new File(basePath + "/" + partitionPath + "/" + inflightDataFileName).createNewFile();
    String inflightLogFileName = FSUtils.makeLogFileName(inflightFileId2, HoodieLogFile.DELTA_EXTENSION, inflightDeltaInstantTime, 0, TEST_WRITE_TOKEN);
    new File(basePath + "/" + partitionPath + "/" + inflightLogFileName).createNewFile();
    // Mark instant as inflight
    commitTimeline.createNewInstant(new HoodieInstant(State.REQUESTED, HoodieTimeline.DELTA_COMMIT_ACTION, inflightDeltaInstantTime));
    commitTimeline.transitionRequestedToInflight(new HoodieInstant(State.REQUESTED, HoodieTimeline.DELTA_COMMIT_ACTION, inflightDeltaInstantTime), Option.empty());
    refreshFsView();
    List<FileSlice> allRawFileSlices = getAllRawFileSlices(partitionPath).collect(Collectors.toList());
    dataFiles = allRawFileSlices.stream().flatMap(slice -> {
        if (slice.getBaseFile().isPresent()) {
            return Stream.of(slice.getBaseFile().get());
        }
        return Stream.empty();
    }).collect(Collectors.toList());
    if (includeInvalidAndInflight) {
        assertEquals(2 + (isCompactionInFlight ? 1 : 0) + (skipCreatingDataFile ? 0 : 1), dataFiles.size(), "Inflight/Orphan data-file is also expected");
        Set<String> fileNames = dataFiles.stream().map(HoodieBaseFile::getFileName).collect(Collectors.toSet());
        assertTrue(fileNames.contains(orphanDataFileName), "Expect orphan data-file to be present");
        assertTrue(fileNames.contains(inflightDataFileName), "Expect inflight data-file to be present");
        if (!skipCreatingDataFile) {
            assertTrue(fileNames.contains(dataFileName), "Expect old committed data-file");
        }
        if (isCompactionInFlight) {
            assertTrue(fileNames.contains(compactDataFileName), "Expect inflight compacted data file to be present");
        }
        fileSliceList = getLatestRawFileSlices(partitionPath).collect(Collectors.toList());
        assertEquals(includeInvalidAndInflight ? 5 : 1, fileSliceList.size(), "Expect both inflight and orphan file-slice to be included");
        Map<String, FileSlice> fileSliceMap = fileSliceList.stream().collect(Collectors.toMap(FileSlice::getFileId, r -> r));
        FileSlice orphanFileSliceWithDataFile = fileSliceMap.get(orphanFileId1);
        FileSlice orphanFileSliceWithLogFile = fileSliceMap.get(orphanFileId2);
        FileSlice inflightFileSliceWithDataFile = fileSliceMap.get(inflightFileId1);
        FileSlice inflightFileSliceWithLogFile = fileSliceMap.get(inflightFileId2);
        assertEquals(invalidInstantId, orphanFileSliceWithDataFile.getBaseInstantTime(), "Orphan File Slice with data-file check base-commit");
        assertEquals(orphanDataFileName, orphanFileSliceWithDataFile.getBaseFile().get().getFileName(), "Orphan File Slice with data-file check data-file");
        assertEquals(0, orphanFileSliceWithDataFile.getLogFiles().count(), "Orphan File Slice with data-file must have no log-files");
        assertEquals(inflightDeltaInstantTime, inflightFileSliceWithDataFile.getBaseInstantTime(), "Inflight File Slice with data-file check base-commit");
        assertEquals(inflightDataFileName, inflightFileSliceWithDataFile.getBaseFile().get().getFileName(), "Inflight File Slice with data-file check data-file");
        assertEquals(0, inflightFileSliceWithDataFile.getLogFiles().count(), "Inflight File Slice with data-file must have no log-files");
        assertEquals(invalidInstantId, orphanFileSliceWithLogFile.getBaseInstantTime(), "Orphan File Slice with log-file check base-commit");
        assertFalse(orphanFileSliceWithLogFile.getBaseFile().isPresent(), "Orphan File Slice with log-file check data-file");
        logFiles = orphanFileSliceWithLogFile.getLogFiles().collect(Collectors.toList());
        assertEquals(1, logFiles.size(), "Orphan File Slice with log-file check data-file");
        assertEquals(orphanLogFileName, logFiles.get(0).getFileName(), "Orphan File Slice with log-file check data-file");
        assertEquals(inflightDeltaInstantTime, inflightFileSliceWithLogFile.getBaseInstantTime(), "Inflight File Slice with log-file check base-commit");
        assertFalse(inflightFileSliceWithLogFile.getBaseFile().isPresent(), "Inflight File Slice with log-file check data-file");
        logFiles = inflightFileSliceWithLogFile.getLogFiles().collect(Collectors.toList());
        assertEquals(1, logFiles.size(), "Inflight File Slice with log-file check data-file");
        assertEquals(inflightLogFileName, logFiles.get(0).getFileName(), "Inflight File Slice with log-file check data-file");
    }
    compactionInstant = new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionRequestedTime);
    // Now simulate Compaction completing - Check the view
    if (!isCompactionInFlight) {
        // For inflight compaction, we already create a data-file to test concurrent inflight case.
        // If we skipped creating data file corresponding to compaction commit, create it now
        new File(basePath + "/" + partitionPath + "/" + compactDataFileName).createNewFile();
        commitTimeline.createNewInstant(compactionInstant);
    }
    commitTimeline.saveAsComplete(compactionInstant, Option.empty());
    refreshFsView();
    // populate the cache
    roView.getAllBaseFiles(partitionPath);
    fileSliceList = rtView.getLatestFileSlices(partitionPath).collect(Collectors.toList());
    LOG.info("FILESLICE LIST=" + fileSliceList);
    dataFiles = fileSliceList.stream().map(FileSlice::getBaseFile).filter(Option::isPresent).map(Option::get).collect(Collectors.toList());
    assertEquals(1, dataFiles.size(), "Expect only one data-file in latest view as there is only one file-group");
    assertEquals(compactDataFileName, dataFiles.get(0).getFileName(), "Data Filename must match");
    assertEquals(1, fileSliceList.size(), "Only one latest file-slice in the partition");
    assertFalse(dataFiles.get(0).getBootstrapBaseFile().isPresent(), "No external data file must be present");
    fileSlice = fileSliceList.get(0);
    assertEquals(fileId, fileSlice.getFileId(), "Check file-Id is set correctly");
    assertEquals(compactDataFileName, fileSlice.getBaseFile().get().getFileName(), "Check data-filename is set correctly");
    assertEquals(compactionRequestedTime, fileSlice.getBaseInstantTime(), "Ensure base-instant is now compaction request instant");
    logFiles = fileSlice.getLogFiles().collect(Collectors.toList());
    assertEquals(2, logFiles.size(), "Only log-files after compaction request shows up");
    assertEquals(fileName4, logFiles.get(0).getFileName(), "Log File Order check");
    assertEquals(fileName3, logFiles.get(1).getFileName(), "Log File Order check");
    // Data Files API tests
    dataFiles = roView.getLatestBaseFiles().collect(Collectors.toList());
    assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
    assertFalse(dataFiles.get(0).getBootstrapBaseFile().isPresent(), "No external data file must be present");
    dataFiles.forEach(df -> {
        assertEquals(df.getCommitTime(), compactionRequestedTime, "Expect data-file created by compaction be returned");
        assertFalse(df.getBootstrapBaseFile().isPresent(), "No external data file must be present");
    });
    dataFiles = roView.getLatestBaseFiles(partitionPath).collect(Collectors.toList());
    assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
    dataFiles.forEach(df -> {
        assertEquals(df.getCommitTime(), compactionRequestedTime, "Expect data-file created by compaction be returned");
        assertFalse(df.getBootstrapBaseFile().isPresent(), "No external data file must be present");
    });
    dataFiles = roView.getLatestBaseFilesBeforeOrOn(partitionPath, deltaInstantTime5).collect(Collectors.toList());
    assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
    dataFiles.forEach(df -> {
        assertEquals(df.getCommitTime(), compactionRequestedTime, "Expect data-file created by compaction be returned");
        assertFalse(df.getBootstrapBaseFile().isPresent(), "No external data file must be present");
    });
    dataFiles = roView.getLatestBaseFilesInRange(allInstantTimes).collect(Collectors.toList());
    assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
    dataFiles.forEach(df -> {
        assertEquals(df.getCommitTime(), compactionRequestedTime, "Expect data-file created by compaction be returned");
        assertFalse(df.getBootstrapBaseFile().isPresent(), "No external data file must be present");
    });
    assertEquals(expTotalFileSlices, rtView.getAllFileSlices(partitionPath).count(), "Total number of file-slices in partitions matches expected");
    assertEquals(expTotalDataFiles, roView.getAllBaseFiles(partitionPath).count(), "Total number of data-files in partitions matches expected");
    // file-groups include inflight/invalid file-ids
    assertEquals(5, fsView.getAllFileGroups(partitionPath).count(), "Total number of file-groups in partitions matches expected");
}
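The schedule-compaction steps in the middle of this test use only public timeline APIs and can be read in isolation. A condensed sketch built from the same calls (the helper name and parameters are illustrative):

import java.io.IOException;
import java.util.List;
import org.apache.hudi.avro.model.HoodieCompactionPlan;
import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieInstant;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.timeline.TimelineMetadataUtils;
import org.apache.hudi.common.util.CompactionUtils;
import org.apache.hudi.common.util.Option;
import org.apache.hudi.common.util.collection.Pair;

class ScheduleCompactionSketch {
    // Build a compaction plan over the chosen file slices, save it as a
    // REQUESTED instant, then move it to INFLIGHT, as the test does inline.
    static void scheduleAndStartCompaction(HoodieTableMetaClient metaClient,
            List<Pair<String, FileSlice>> partitionFileSlicesPairs,
            String compactionRequestedTime) throws IOException {
        HoodieCompactionPlan plan =
            CompactionUtils.buildFromFileSlices(partitionFileSlicesPairs, Option.empty(), Option.empty());
        HoodieInstant requested = HoodieTimeline.getCompactionRequestedInstant(compactionRequestedTime);
        metaClient.getActiveTimeline()
            .saveToCompactionRequested(requested, TimelineMetadataUtils.serializeCompactionPlan(plan));
        metaClient.getActiveTimeline().transitionCompactionRequestedToInflight(requested);
    }
}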

Example 64 with HoodieLogFile

Use of org.apache.hudi.common.model.HoodieLogFile in project hudi by apache.

The class TestHoodieTableFileSystemView, method testStreamLatestVersionInPartition.

private void testStreamLatestVersionInPartition(boolean isLatestFileSliceOnly, String fullPartitionPath,
        String commitTime1, String commitTime2, String commitTime3, String commitTime4,
        String fileId1, String fileId2, String fileId3, String fileId4) throws IOException {
    // Now we list the entire partition
    FileStatus[] statuses = metaClient.getFs().listStatus(new Path(fullPartitionPath));
    assertEquals(11, statuses.length);
    refreshFsView();
    // Check files as of latest commit.
    List<FileSlice> allSlices = rtView.getAllFileSlices("2016/05/01").collect(Collectors.toList());
    assertEquals(isLatestFileSliceOnly ? 4 : 8, allSlices.size());
    Map<String, Long> fileSliceMap = allSlices.stream().collect(Collectors.groupingBy(FileSlice::getFileId, Collectors.counting()));
    assertEquals(isLatestFileSliceOnly ? 1 : 2, fileSliceMap.get(fileId1).longValue());
    assertEquals(isLatestFileSliceOnly ? 1 : 3, fileSliceMap.get(fileId2).longValue());
    assertEquals(isLatestFileSliceOnly ? 1 : 2, fileSliceMap.get(fileId3).longValue());
    assertEquals(1, fileSliceMap.get(fileId4).longValue());
    List<HoodieBaseFile> dataFileList = roView.getLatestBaseFilesBeforeOrOn("2016/05/01", commitTime4).collect(Collectors.toList());
    assertEquals(3, dataFileList.size());
    Set<String> filenames = new HashSet<>();
    for (HoodieBaseFile status : dataFileList) {
        filenames.add(status.getFileName());
    }
    assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId1)));
    assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)));
    assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime4, TEST_WRITE_TOKEN, fileId3)));
    filenames = new HashSet<>();
    List<HoodieLogFile> logFilesList = rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime4, true)
        .map(FileSlice::getLogFiles).flatMap(logFileList -> logFileList).collect(Collectors.toList());
    assertEquals(4, logFilesList.size());
    for (HoodieLogFile logFile : logFilesList) {
        filenames.add(logFile.getFileName());
    }
    assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)));
    assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId1, HoodieLogFile.DELTA_EXTENSION, commitTime4, 1, TEST_WRITE_TOKEN)));
    assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN)));
    assertTrue(filenames.contains(FSUtils.makeLogFileName(fileId4, HoodieLogFile.DELTA_EXTENSION, commitTime4, 0, TEST_WRITE_TOKEN)));
    // Reset the max commit time
    List<HoodieBaseFile> dataFiles = roView.getLatestBaseFilesBeforeOrOn("2016/05/01", commitTime3).collect(Collectors.toList());
    filenames = new HashSet<>();
    for (HoodieBaseFile status : dataFiles) {
        filenames.add(status.getFileName());
    }
    if (!isLatestFileSliceOnly) {
        assertEquals(3, dataFiles.size());
        assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime1, TEST_WRITE_TOKEN, fileId1)));
        assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)));
        assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId3)));
    } else {
        assertEquals(1, dataFiles.size());
        assertTrue(filenames.contains(FSUtils.makeDataFileName(commitTime3, TEST_WRITE_TOKEN, fileId2)));
    }
    logFilesList = rtView.getLatestFileSlicesBeforeOrOn("2016/05/01", commitTime3, true)
        .map(FileSlice::getLogFiles).flatMap(logFileList -> logFileList).collect(Collectors.toList());
    assertEquals(1, logFilesList.size());
    assertEquals(FSUtils.makeLogFileName(fileId2, HoodieLogFile.DELTA_EXTENSION, commitTime3, 0, TEST_WRITE_TOKEN), logFilesList.get(0).getFileName());
}

Example 65 with HoodieLogFile

use of org.apache.hudi.common.model.HoodieLogFile in project hudi by apache.

the class HoodieBackedTableMetadata method getLogRecordScanner.

public Pair<HoodieMetadataMergedLogRecordReader, Long> getLogRecordScanner(List<HoodieLogFile> logFiles, String partitionName) {
    HoodieTimer timer = new HoodieTimer().startTimer();
    List<String> sortedLogFilePaths = logFiles.stream()
        .sorted(HoodieLogFile.getLogFileComparator())
        .map(o -> o.getPath().toString())
        .collect(Collectors.toList());
    // Only log files that have a corresponding completed instant on the dataset should be read,
    // because the metadata table is updated before the dataset instants are committed.
    Set<String> validInstantTimestamps = getValidInstantTimestamps();
    Option<HoodieInstant> latestMetadataInstant = metadataMetaClient.getActiveTimeline().filterCompletedInstants().lastInstant();
    String latestMetadataInstantTime = latestMetadataInstant.map(HoodieInstant::getTimestamp).orElse(SOLO_COMMIT_TIMESTAMP);
    // Load the schema
    Schema schema = HoodieAvroUtils.addMetadataFields(HoodieMetadataRecord.getClassSchema());
    HoodieCommonConfig commonConfig = HoodieCommonConfig.newBuilder().fromProperties(metadataConfig.getProps()).build();
    HoodieMetadataMergedLogRecordReader logRecordScanner = HoodieMetadataMergedLogRecordReader.newBuilder()
        .withFileSystem(metadataMetaClient.getFs())
        .withBasePath(metadataBasePath)
        .withLogFilePaths(sortedLogFilePaths)
        .withReaderSchema(schema)
        .withLatestInstantTime(latestMetadataInstantTime)
        .withMaxMemorySizeInBytes(MAX_MEMORY_SIZE_IN_BYTES)
        .withBufferSize(BUFFER_SIZE)
        .withSpillableMapBasePath(spillableMapDirectory)
        .withDiskMapType(commonConfig.getSpillableDiskMapType())
        .withBitCaskDiskMapCompressionEnabled(commonConfig.isBitCaskDiskMapCompressionEnabled())
        .withLogBlockTimestamps(validInstantTimestamps)
        .enableFullScan(metadataConfig.enableFullScan())
        .withPartition(partitionName)
        .build();
    Long logScannerOpenMs = timer.endTimer();
    LOG.info(String.format("Opened %d metadata log files (dataset instant=%s, metadata instant=%s) in %d ms", sortedLogFilePaths.size(), getLatestDataInstantTime(), latestMetadataInstantTime, logScannerOpenMs));
    return Pair.of(logRecordScanner, logScannerOpenMs);
}
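
The sort at the top of getLogRecordScanner matters: the merged log record reader replays blocks in write order. A minimal, self-contained sketch of HoodieLogFile.getLogFileComparator(), assuming hypothetical paths that follow the ".<fileId>_<commitTime>.log.<version>_<writeToken>" pattern; ordering is by base commit time, then log version:

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieLogFile;

public class LogFileOrderingSketch {
    public static void main(String[] args) {
        // Hypothetical delta log files for one file group, listed out of order.
        List<HoodieLogFile> logFiles = Arrays.asList(
            new HoodieLogFile(new Path("/tmp/.fileid1_100.log.2_1-0-1")),
            new HoodieLogFile(new Path("/tmp/.fileid1_100.log.1_1-0-1")));
        // The comparator orders by base commit time and log version,
        // yielding the replay order the scanner expects.
        List<String> sorted = logFiles.stream()
            .sorted(HoodieLogFile.getLogFileComparator())
            .map(f -> f.getPath().toString())
            .collect(Collectors.toList());
        sorted.forEach(System.out::println);
    }
}
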
Also used : HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieTimer(org.apache.hudi.common.util.HoodieTimer) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) LinkedList(java.util.LinkedList) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) TableNotFoundException(org.apache.hudi.exception.TableNotFoundException) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Set(java.util.Set) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieMetadataRecord(org.apache.hudi.avro.model.HoodieMetadataRecord) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) HoodieCommonConfig(org.apache.hudi.common.config.HoodieCommonConfig) SpillableMapUtils(org.apache.hudi.common.util.SpillableMapUtils) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieCommonConfig(org.apache.hudi.common.config.HoodieCommonConfig) Schema(org.apache.avro.Schema) HoodieTimer(org.apache.hudi.common.util.HoodieTimer)

Aggregations

HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)77
IOException (java.io.IOException)48
List (java.util.List)46
Path (org.apache.hadoop.fs.Path)45
Map (java.util.Map)42
Collectors (java.util.stream.Collectors)42
ArrayList (java.util.ArrayList)38
Option (org.apache.hudi.common.util.Option)37
FileSlice (org.apache.hudi.common.model.FileSlice)34
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)29
FileStatus (org.apache.hadoop.fs.FileStatus)28
HashMap (java.util.HashMap)26
FSUtils (org.apache.hudi.common.fs.FSUtils)26
Pair (org.apache.hudi.common.util.collection.Pair)25
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)24
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)23
Set (java.util.Set)22
LogManager (org.apache.log4j.LogManager)22
Logger (org.apache.log4j.Logger)22
HoodieLogFormat (org.apache.hudi.common.table.log.HoodieLogFormat)21