Search in sources :

Example 16 with HoodieMergedLogRecordScanner

use of org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner in project hudi by apache.

the class TestHoodieLogFormat method testAvroLogRecordReaderMergingMultipleLogFiles.

/*
   * During a Spark stage failure, when the stage is retried, tasks that were part of the previous attempt
   * of the stage may continue to run. As a result, two different tasks could be performing the same operation.
   * When trying to update the log file, only one of the tasks would succeed (the one holding the lease on the log file).
   *
   * In order to make progress in this scenario, the second task attempting to update the log file rolls over to
   * a new version of the log file. As a result, we might end up with two log files with the same set of data records
   * present in both of them.
   *
   * The following unit tests mimic this scenario to ensure that the reader can handle merging multiple log files with
   * duplicate data.
   *
   */
private void testAvroLogRecordReaderMergingMultipleLogFiles(int numRecordsInLog1, int numRecordsInLog2, ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled, boolean readBlocksLazily) {
    try {
        // Each log file below gets one data block, written with the same instant time (same batch)
        Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
        List<IndexedRecord> records = SchemaTestUtil.generateHoodieTestRecords(0, 101);
        List<IndexedRecord> records2 = new ArrayList<>(records);
        // Write1 with numRecordsInLog1 records written to log.1
        Writer writer = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).build();
        Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
        header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
        header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
        HoodieDataBlock dataBlock = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records.subList(0, numRecordsInLog1), header);
        writer.appendBlock(dataBlock);
        // Capture the current log file size; used below as the size threshold that forces the second writer onto a new log file
        long size = writer.getCurrentSize();
        writer.close();
        // write2 with numRecordsInLog2 records written to log.2
        Writer writer2 = HoodieLogFormat.newWriterBuilder().onParentPath(partitionPath).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-fileid1").overBaseCommit("100").withFs(fs).withSizeThreshold(size - 1).build();
        Map<HoodieLogBlock.HeaderMetadataType, String> header2 = new HashMap<>();
        header2.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, "100");
        header2.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
        HoodieDataBlock dataBlock2 = getDataBlock(DEFAULT_DATA_BLOCK_TYPE, records2.subList(0, numRecordsInLog2), header2);
        writer2.appendBlock(dataBlock2);
        writer2.close();
        FileCreateUtils.createDeltaCommit(basePath, "100", fs);
        // From the two log files generated, read the records
        List<String> allLogFiles = FSUtils.getAllLogFiles(fs, partitionPath, "test-fileid1", HoodieLogFile.DELTA_EXTENSION, "100").map(s -> s.getPath().toString()).collect(Collectors.toList());
        HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
            .withFileSystem(fs)
            .withBasePath(basePath)
            .withLogFilePaths(allLogFiles)
            .withReaderSchema(schema)
            .withLatestInstantTime("100")
            .withMaxMemorySizeInBytes(10240L)
            .withReadBlocksLazily(readBlocksLazily)
            .withReverseReader(false)
            .withBufferSize(bufferSize)
            .withSpillableMapBasePath(BASE_OUTPUT_PATH)
            .withDiskMapType(diskMapType)
            .withBitCaskDiskMapCompressionEnabled(isCompressionEnabled)
            .build();
        assertEquals(Math.max(numRecordsInLog1, numRecordsInLog2), scanner.getNumMergedRecordsInLog(), "Should read the larger of the two record counts after merging duplicates");
    } catch (Exception e) {
        e.printStackTrace();
        // Fail the test explicitly rather than swallowing the exception
        // (requires a static import of org.junit.jupiter.api.Assertions.fail)
        fail("Reading the merged log records failed: " + e.getMessage());
    }
}
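
The assertion above only checks the merged record count. As a hedged sketch (not part of the Hudi test), the snippet below shows how the merged output could also be inspected key by key if it were added inside the try block above; it relies only on the scanner's Iterable contract and HoodieRecord.getRecordKey(), both of which appear in later examples on this page.

    // Sketch: collect the distinct record keys produced by the merged scan.
    // Records duplicated across log.1 and log.2 collapse into a single merged entry.
    Set<String> mergedKeys = new HashSet<>();
    for (HoodieRecord<?> mergedRecord : scanner) {
        mergedKeys.add(mergedRecord.getRecordKey());
    }
    // With the same batch written to both files, the number of distinct keys matches
    // the larger of the two sub-lists.
    assertEquals(Math.max(numRecordsInLog1, numRecordsInLog2), mergedKeys.size());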
Also used : BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieHFileDataBlock(org.apache.hudi.common.table.log.block.HoodieHFileDataBlock) FileSystem(org.apache.hadoop.fs.FileSystem) URISyntaxException(java.net.URISyntaxException) Assertions.assertNotEquals(org.junit.jupiter.api.Assertions.assertNotEquals) HoodieCommandBlock(org.apache.hudi.common.table.log.block.HoodieCommandBlock) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) FileStatus(org.apache.hadoop.fs.FileStatus) AfterAll(org.junit.jupiter.api.AfterAll) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) BeforeAll(org.junit.jupiter.api.BeforeAll) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) SchemaTestUtil(org.apache.hudi.common.testutils.SchemaTestUtil) Path(org.apache.hadoop.fs.Path) HoodieParquetDataBlock(org.apache.hudi.common.table.log.block.HoodieParquetDataBlock) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) MethodSource(org.junit.jupiter.params.provider.MethodSource) Schema(org.apache.avro.Schema) Collection(java.util.Collection) Compression(org.apache.hadoop.hbase.io.compress.Compression) Set(java.util.Set) HoodieArchivedLogFile(org.apache.hudi.common.model.HoodieArchivedLogFile) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieCommonTestHarness(org.apache.hudi.common.testutils.HoodieCommonTestHarness) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) UncheckedIOException(java.io.UncheckedIOException) MiniClusterUtil(org.apache.hudi.common.testutils.minicluster.MiniClusterUtil) List(java.util.List) Stream(java.util.stream.Stream) HadoopMapRedUtils(org.apache.hudi.common.testutils.HadoopMapRedUtils) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) Reader(org.apache.hudi.common.table.log.HoodieLogFormat.Reader) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) Option(org.apache.hudi.common.util.Option) EnumSource(org.junit.jupiter.params.provider.EnumSource) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CorruptedLogFileException(org.apache.hudi.exception.CorruptedLogFileException) HashSet(java.util.HashSet) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) SchemaTestUtil.getSimpleSchema(org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) ExternalSpillableMap(org.apache.hudi.common.util.collection.ExternalSpillableMap) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) Arguments.arguments(org.junit.jupiter.params.provider.Arguments.arguments) IndexedRecord(org.apache.avro.generic.IndexedRecord) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) 
HoodieLogBlockType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType) AppendResult(org.apache.hudi.common.table.log.AppendResult) IOException(java.io.IOException) HoodieLogFileReader(org.apache.hudi.common.table.log.HoodieLogFileReader) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) FileCreateUtils(org.apache.hudi.common.testutils.FileCreateUtils) BenchmarkCounter(org.apache.parquet.hadoop.util.counters.BenchmarkCounter) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) HoodieDeleteBlock(org.apache.hudi.common.table.log.block.HoodieDeleteBlock) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) IndexedRecord(org.apache.avro.generic.IndexedRecord) HashMap(java.util.HashMap) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) SchemaTestUtil.getSimpleSchema(org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) ArrayList(java.util.ArrayList) URISyntaxException(java.net.URISyntaxException) UncheckedIOException(java.io.UncheckedIOException) CorruptedLogFileException(org.apache.hudi.exception.CorruptedLogFileException) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer)

Example 17 with HoodieMergedLogRecordScanner

use of org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner in project hudi by apache.

the class HoodieLogFileCommand method showLogFileRecords.

@CliCommand(value = "show logfile records", help = "Read records from log files")
public String showLogFileRecords(
        @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit,
        @CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified paths for the log files") final String logFilePathPattern,
        @CliOption(key = "mergeRecords", help = "If the records in the log files should be merged", unspecifiedDefaultValue = "false") final Boolean shouldMerge) throws IOException {
    System.out.println("===============> Showing only " + limit + " records <===============");
    HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
    FileSystem fs = client.getFs();
    List<String> logFilePaths = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(logFilePathPattern)).stream().map(status -> status.getPath().toString()).sorted(Comparator.reverseOrder()).collect(Collectors.toList());
    // logFilePaths must not be empty
    assert logFilePaths.size() > 0 : "There is no log file";
    // TODO : readerSchema can change across blocks/log files, fix this inside Scanner
    AvroSchemaConverter converter = new AvroSchemaConverter();
    // get schema from last log file
    Schema readerSchema = converter.convert(Objects.requireNonNull(TableSchemaResolver.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1)))));
    List<IndexedRecord> allRecords = new ArrayList<>();
    if (shouldMerge) {
        System.out.println("===========================> MERGING RECORDS <===================");
        HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
            .withFileSystem(fs)
            .withBasePath(client.getBasePath())
            .withLogFilePaths(logFilePaths)
            .withReaderSchema(readerSchema)
            .withLatestInstantTime(client.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp())
            .withReadBlocksLazily(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLE.defaultValue()))
            .withReverseReader(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue()))
            .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue())
            .withMaxMemorySizeInBytes(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)
            .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue())
            .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
            .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
            .build();
        for (HoodieRecord<? extends HoodieRecordPayload> hoodieRecord : scanner) {
            Option<IndexedRecord> record = hoodieRecord.getData().getInsertValue(readerSchema);
            if (allRecords.size() < limit) {
                allRecords.add(record.get());
            }
        }
    } else {
        for (String logFile : logFilePaths) {
            Schema writerSchema = new AvroSchemaConverter().convert(Objects.requireNonNull(TableSchemaResolver.readSchemaFromLogFile(client.getFs(), new Path(logFile))));
            HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema);
            // read the avro blocks
            while (reader.hasNext()) {
                HoodieLogBlock n = reader.next();
                if (n instanceof HoodieDataBlock) {
                    HoodieDataBlock blk = (HoodieDataBlock) n;
                    try (ClosableIterator<IndexedRecord> recordItr = blk.getRecordItr()) {
                        recordItr.forEachRemaining(record -> {
                            if (allRecords.size() < limit) {
                                allRecords.add(record);
                            }
                        });
                    }
                }
            }
            reader.close();
            if (allRecords.size() >= limit) {
                break;
            }
        }
    }
    String[][] rows = new String[allRecords.size()][];
    int i = 0;
    for (IndexedRecord record : allRecords) {
        String[] data = new String[1];
        data[0] = record.toString();
        rows[i] = data;
        i++;
    }
    return HoodiePrintHelper.print(new String[] { HoodieTableHeaderFields.HEADER_RECORDS }, rows);
}
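
As a usage illustration (the table path pattern below is hypothetical), the command can be invoked from the hudi-cli shell in the same way the test in the next example does:

    show logfile records --logFilePathPattern /tmp/hudi_table/2015/03/16/* --mergeRecords true --limit 10

With --mergeRecords true the log blocks are first compacted through the HoodieMergedLogRecordScanner built above; otherwise each data block is read and printed as-is.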
Also used : Path(org.apache.hadoop.fs.Path) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) Reader(org.apache.hudi.common.table.log.HoodieLogFormat.Reader) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) CliCommand(org.springframework.shell.core.annotation.CliCommand)

Example 18 with HoodieMergedLogRecordScanner

use of org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner in project hudi by apache.

the class TestHoodieLogFileCommand method testShowLogFileRecordsWithMerge.

/**
 * Test case for 'show logfile records' with merge.
 */
@Test
public void testShowLogFileRecordsWithMerge() throws IOException, InterruptedException, URISyntaxException {
    // create commit instant
    HoodieTestCommitMetadataGenerator.createCommitFile(tablePath, INSTANT_TIME, HoodieCLI.conf);
    // write to path '2015/03/16'.
    Schema schema = HoodieAvroUtils.addMetadataFields(getSimpleSchema());
    partitionPath = tablePath + Path.SEPARATOR + HoodieTestCommitMetadataGenerator.DEFAULT_SECOND_PARTITION_PATH;
    Files.createDirectories(Paths.get(partitionPath));
    HoodieLogFormat.Writer writer = null;
    try {
        // set a small size threshold so the writer rolls over to a new log file.
        writer = HoodieLogFormat.newWriterBuilder().onParentPath(new Path(partitionPath)).withFileExtension(HoodieLogFile.DELTA_EXTENSION).withFileId("test-log-fileid1").overBaseCommit(INSTANT_TIME).withFs(fs).withSizeThreshold(500).build();
        List<IndexedRecord> records1 = SchemaTestUtil.generateHoodieTestRecords(0, 100);
        Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
        header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, INSTANT_TIME);
        header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
        HoodieAvroDataBlock dataBlock = new HoodieAvroDataBlock(records1, header, HoodieRecord.RECORD_KEY_METADATA_FIELD);
        writer.appendBlock(dataBlock);
    } finally {
        if (writer != null) {
            writer.close();
        }
    }
    CommandResult cr = shell().executeCommand("show logfile records --logFilePathPattern " + partitionPath + "/* --mergeRecords true");
    assertTrue(cr.isSuccess());
    // get expected result of 10 records.
    List<String> logFilePaths = Arrays.stream(fs.globStatus(new Path(partitionPath + "/*"))).map(status -> status.getPath().toString()).collect(Collectors.toList());
    HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
        .withFileSystem(fs)
        .withBasePath(tablePath)
        .withLogFilePaths(logFilePaths)
        .withReaderSchema(schema)
        .withLatestInstantTime(INSTANT_TIME)
        .withMaxMemorySizeInBytes(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)
        .withReadBlocksLazily(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLE.defaultValue()))
        .withReverseReader(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue()))
        .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue())
        .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue())
        .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
        .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
        .build();
    Iterator<HoodieRecord<? extends HoodieRecordPayload>> records = scanner.iterator();
    int num = 0;
    int maxSize = 10;
    List<IndexedRecord> indexRecords = new ArrayList<>();
    while (records.hasNext() && num < maxSize) {
        Option<IndexedRecord> hoodieRecord = records.next().getData().getInsertValue(schema);
        indexRecords.add(hoodieRecord.get());
        num++;
    }
    String[][] rows = indexRecords.stream().map(r -> new String[] { r.toString() }).toArray(String[][]::new);
    assertNotNull(rows);
    String expected = HoodiePrintHelper.print(new String[] { HoodieTableHeaderFields.HEADER_RECORDS }, rows);
    expected = removeNonWordAndStripSpace(expected);
    String got = removeNonWordAndStripSpace(cr.getResult().toString());
    assertEquals(expected, got);
}
Also used : Path(org.apache.hadoop.fs.Path) BeforeEach(org.junit.jupiter.api.BeforeEach) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) Arrays(java.util.Arrays) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) FileSystem(org.apache.hadoop.fs.FileSystem) URISyntaxException(java.net.URISyntaxException) HoodieTableHeaderFields(org.apache.hudi.cli.HoodieTableHeaderFields) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Map(java.util.Map) HoodieMemoryConfig(org.apache.hudi.config.HoodieMemoryConfig) SchemaTestUtil.getSimpleSchema(org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) SchemaTestUtil(org.apache.hudi.common.testutils.SchemaTestUtil) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) Tag(org.junit.jupiter.api.Tag) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) CLIFunctionalTestHarness(org.apache.hudi.cli.functional.CLIFunctionalTestHarness) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) IndexedRecord(org.apache.avro.generic.IndexedRecord) TimelineLayoutVersion(org.apache.hudi.common.table.timeline.versioning.TimelineLayoutVersion) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) Iterator(java.util.Iterator) Files(java.nio.file.Files) TableHeader(org.apache.hudi.cli.TableHeader) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) HoodieTestCommitMetadataGenerator(org.apache.hudi.cli.testutils.HoodieTestCommitMetadataGenerator) JsonProcessingException(com.fasterxml.jackson.core.JsonProcessingException) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieCLI(org.apache.hudi.cli.HoodieCLI) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) Test(org.junit.jupiter.api.Test) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) AfterEach(org.junit.jupiter.api.AfterEach) List(java.util.List) Paths(java.nio.file.Paths) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) CommandResult(org.springframework.shell.core.CommandResult) HoodiePrintHelper(org.apache.hudi.cli.HoodiePrintHelper) HoodieCommonConfig(org.apache.hudi.common.config.HoodieCommonConfig) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) FSUtils(org.apache.hudi.common.fs.FSUtils) IndexedRecord(org.apache.avro.generic.IndexedRecord) HashMap(java.util.HashMap) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) SchemaTestUtil.getSimpleSchema(org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) Schema(org.apache.avro.Schema) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) ArrayList(java.util.ArrayList) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) CommandResult(org.springframework.shell.core.CommandResult) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) Test(org.junit.jupiter.api.Test)

Example 19 with HoodieMergedLogRecordScanner

use of org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner in project hudi by apache.

the class MergeOnReadInputFormat method getLogFileIterator.

private ClosableIterator<RowData> getLogFileIterator(MergeOnReadInputSplit split) {
    final Schema tableSchema = new Schema.Parser().parse(tableState.getAvroSchema());
    final Schema requiredSchema = new Schema.Parser().parse(tableState.getRequiredAvroSchema());
    final GenericRecordBuilder recordBuilder = new GenericRecordBuilder(requiredSchema);
    final AvroToRowDataConverters.AvroToRowDataConverter avroToRowDataConverter = AvroToRowDataConverters.createRowConverter(tableState.getRequiredRowType());
    final HoodieMergedLogRecordScanner scanner = FormatUtils.logScanner(split, tableSchema, hadoopConf, conf.getBoolean(FlinkOptions.CHANGELOG_ENABLED));
    final Iterator<String> logRecordsKeyIterator = scanner.getRecords().keySet().iterator();
    final int[] pkOffset = tableState.getPkOffsetsInRequired();
    // flag saying whether the pk semantics have been dropped by user-specified
    // projections. For example, if the pk fields are [a, b] but the user only selects a,
    // then the pk semantics are lost.
    final boolean pkSemanticLost = Arrays.stream(pkOffset).anyMatch(offset -> offset == -1);
    final LogicalType[] pkTypes = pkSemanticLost ? null : tableState.getPkTypes(pkOffset);
    final StringToRowDataConverter converter = pkSemanticLost ? null : new StringToRowDataConverter(pkTypes);
    return new ClosableIterator<RowData>() {

        private RowData currentRecord;

        @Override
        public boolean hasNext() {
            while (logRecordsKeyIterator.hasNext()) {
                String curAvroKey = logRecordsKeyIterator.next();
                Option<IndexedRecord> curAvroRecord = null;
                final HoodieAvroRecord<?> hoodieRecord = (HoodieAvroRecord) scanner.getRecords().get(curAvroKey);
                try {
                    curAvroRecord = hoodieRecord.getData().getInsertValue(tableSchema);
                } catch (IOException e) {
                    throw new HoodieException("Get avro insert value error for key: " + curAvroKey, e);
                }
                if (!curAvroRecord.isPresent()) {
                    // delete record found
                    if (emitDelete && !pkSemanticLost) {
                        GenericRowData delete = new GenericRowData(tableState.getRequiredRowType().getFieldCount());
                        final String recordKey = hoodieRecord.getRecordKey();
                        final String[] pkFields = KeyGenUtils.extractRecordKeys(recordKey);
                        final Object[] converted = converter.convert(pkFields);
                        for (int i = 0; i < pkOffset.length; i++) {
                            delete.setField(pkOffset[i], converted[i]);
                        }
                        delete.setRowKind(RowKind.DELETE);
                        this.currentRecord = delete;
                        return true;
                    }
                // otherwise skip this delete record (deletes are not emitted here, or pk semantics were lost)
                } else {
                    final IndexedRecord avroRecord = curAvroRecord.get();
                    final RowKind rowKind = FormatUtils.getRowKindSafely(avroRecord, tableState.getOperationPos());
                    if (rowKind == RowKind.DELETE && !emitDelete) {
                        // skip the delete record
                        continue;
                    }
                    GenericRecord requiredAvroRecord = buildAvroRecordBySchema(avroRecord, requiredSchema, requiredPos, recordBuilder);
                    currentRecord = (RowData) avroToRowDataConverter.convert(requiredAvroRecord);
                    currentRecord.setRowKind(rowKind);
                    return true;
                }
            }
            return false;
        }

        @Override
        public RowData next() {
            return currentRecord;
        }

        @Override
        public void close() {
            scanner.close();
        }
    };
}
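
A minimal consumption sketch for the iterator returned above (the split variable and the process(...) handler are hypothetical; hasNext/next/close follow the contract defined in the anonymous class):

    // Hypothetical caller inside the same class; 'split' is an existing MergeOnReadInputSplit.
    ClosableIterator<RowData> iterator = getLogFileIterator(split);
    try {
        while (iterator.hasNext()) {
            RowData row = iterator.next();
            // The RowKind set on each row reflects inserts/updates and, when emitDelete is enabled, deletes.
            process(row); // hypothetical downstream handler
        }
    } finally {
        // Closing the iterator closes the underlying HoodieMergedLogRecordScanner.
        iterator.close();
    }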
Also used : IndexedRecord(org.apache.avro.generic.IndexedRecord) FormatUtils.buildAvroRecordBySchema(org.apache.hudi.table.format.FormatUtils.buildAvroRecordBySchema) Schema(org.apache.avro.Schema) LogicalType(org.apache.flink.table.types.logical.LogicalType) StringToRowDataConverter(org.apache.hudi.util.StringToRowDataConverter) HoodieException(org.apache.hudi.exception.HoodieException) AvroToRowDataConverters(org.apache.hudi.util.AvroToRowDataConverters) GenericRowData(org.apache.flink.table.data.GenericRowData) RowData(org.apache.flink.table.data.RowData) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder) GenericRowData(org.apache.flink.table.data.GenericRowData) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) IOException(java.io.IOException) RowKind(org.apache.flink.types.RowKind)

Example 20 with HoodieMergedLogRecordScanner

use of org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner in project hudi by apache.

the class TestData method checkWrittenDataMOR.

/**
 * Checks the MERGE_ON_READ source data are written as expected.
 *
 * <p>Note: Replace it with the Flink reader when it is supported.
 *
 * @param fs            The file system
 * @param latestInstant The latest committed instant of current table
 * @param baseFile      The base file to check; should be a directory
 * @param expected      The expected results mapping, the key should be the partition path
 * @param partitions    The expected number of partitions
 * @param schema        The read schema
 */
public static void checkWrittenDataMOR(FileSystem fs, String latestInstant, File baseFile, Map<String, String> expected, int partitions, Schema schema) {
    assert baseFile.isDirectory() : "Base path should be a directory";
    FileFilter partitionFilter = file -> !file.getName().startsWith(".");
    File[] partitionDirs = baseFile.listFiles(partitionFilter);
    assertNotNull(partitionDirs);
    assertThat(partitionDirs.length, is(partitions));
    for (File partitionDir : partitionDirs) {
        File[] dataFiles = partitionDir.listFiles(file -> file.getName().contains(".log.") && !file.getName().startsWith(".."));
        assertNotNull(dataFiles);
        HoodieMergedLogRecordScanner scanner = getScanner(fs, baseFile.getPath(), Arrays.stream(dataFiles).map(File::getAbsolutePath).sorted(Comparator.naturalOrder()).collect(Collectors.toList()), schema, latestInstant);
        List<String> readBuffer = scanner.getRecords().values().stream().map(hoodieRecord -> {
            try {
                // getInsertValue returns an empty Option when the record is a delete
                GenericRecord record = (GenericRecord) hoodieRecord.getData().getInsertValue(schema, new Properties()).orElse(null);
                return record == null ? (String) null : filterOutVariables(record);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }).filter(Objects::nonNull).sorted(Comparator.naturalOrder()).collect(Collectors.toList());
        assertThat(readBuffer.toString(), is(expected.get(partitionDir.getName())));
    }
}
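
getScanner is a test helper whose body is not shown in this snippet. Purely as a hedged sketch (not the actual Hudi helper), such a method could be assembled from the same HoodieMergedLogRecordScanner builder options used in the earlier examples; the memory, buffer, and spill-path values below are illustrative assumptions:

    private static HoodieMergedLogRecordScanner getScanner(FileSystem fs, String basePath, List<String> logFilePaths, Schema readerSchema, String latestInstant) {
        return HoodieMergedLogRecordScanner.newBuilder()
            .withFileSystem(fs)
            .withBasePath(basePath)
            .withLogFilePaths(logFilePaths)
            .withReaderSchema(readerSchema)
            .withLatestInstantTime(latestInstant)
            // Illustrative values below; real tests would pick table/config-appropriate settings.
            .withMaxMemorySizeInBytes(1024 * 1024L)
            .withReadBlocksLazily(true)
            .withReverseReader(false)
            .withBufferSize(16 * 1024)
            .withSpillableMapBasePath("/tmp/spillable") // hypothetical spill directory
            .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
            .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
            .build();
    }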
Also used : CoreMatchers.is(org.hamcrest.CoreMatchers.is) Arrays(java.util.Arrays) BinaryWriter(org.apache.flink.table.data.writer.BinaryWriter) DataStructureConverters(org.apache.flink.table.data.conversion.DataStructureConverters) FileSystem(org.apache.hadoop.fs.FileSystem) StreamWriteFunctionWrapper(org.apache.hudi.sink.utils.StreamWriteFunctionWrapper) BinaryRowWriter(org.apache.flink.table.data.writer.BinaryRowWriter) InternalSerializers(org.apache.flink.table.runtime.typeutils.InternalSerializers) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) Schema(org.apache.avro.Schema) ParquetReader(org.apache.parquet.hadoop.ParquetReader) Collectors(java.util.stream.Collectors) Objects(java.util.Objects) List(java.util.List) LogicalType(org.apache.flink.table.types.logical.LogicalType) AvroParquetReader(org.apache.parquet.avro.AvroParquetReader) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) Row(org.apache.flink.types.Row) FlinkOptions(org.apache.hudi.configuration.FlinkOptions) TestCase.assertEquals(junit.framework.TestCase.assertEquals) IntStream(java.util.stream.IntStream) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) RowType(org.apache.flink.table.types.logical.RowType) HoodieFlinkEngineContext(org.apache.hudi.client.common.HoodieFlinkEngineContext) ArrayList(java.util.ArrayList) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) RowData(org.apache.flink.table.data.RowData) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) TimestampData(org.apache.flink.table.data.TimestampData) Configuration(org.apache.flink.configuration.Configuration) IOException(java.io.IOException) BinaryRowData(org.apache.flink.table.data.binary.BinaryRowData) HoodieFlinkTable(org.apache.hudi.table.HoodieFlinkTable) File(java.io.File) StringData(org.apache.flink.table.data.StringData) FileFilter(java.io.FileFilter) DataStructureConverter(org.apache.flink.table.data.conversion.DataStructureConverter) RowKind(org.apache.flink.types.RowKind) Strings(org.apache.parquet.Strings) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) Comparator(java.util.Comparator) HoodieCommonConfig(org.apache.hudi.common.config.HoodieCommonConfig) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) OperatorEvent(org.apache.flink.runtime.operators.coordination.OperatorEvent) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Objects(java.util.Objects) IOException(java.io.IOException) FileFilter(java.io.FileFilter) GenericRecord(org.apache.avro.generic.GenericRecord) Properties(java.util.Properties) File(java.io.File)

Aggregations

Schema (org.apache.avro.Schema): 20
HoodieMergedLogRecordScanner (org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner): 20
Path (org.apache.hadoop.fs.Path): 19
IOException (java.io.IOException): 18
ArrayList (java.util.ArrayList): 17
List (java.util.List): 17
IndexedRecord (org.apache.avro.generic.IndexedRecord): 16
FSUtils (org.apache.hudi.common.fs.FSUtils): 16
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 16
Option (org.apache.hudi.common.util.Option): 16
Map (java.util.Map): 15
Collectors (java.util.stream.Collectors): 15
GenericRecord (org.apache.avro.generic.GenericRecord): 15
FileSystem (org.apache.hadoop.fs.FileSystem): 15
HoodieAvroUtils (org.apache.hudi.avro.HoodieAvroUtils): 15
Collections (java.util.Collections): 14
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 14
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 14
HashMap (java.util.HashMap): 13
HoodieTableType (org.apache.hudi.common.model.HoodieTableType): 13