Search in sources :

Example 31 with HoodieDataBlock

use of org.apache.hudi.common.table.log.block.HoodieDataBlock in project hudi by apache.

From the class HoodieLogFileCommand, method showLogFileRecords:

@CliCommand(value = "show logfile records", help = "Read records from log files")
public String showLogFileRecords(
        @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "10") final Integer limit,
        @CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified paths for the log files") final String logFilePathPattern,
        @CliOption(key = "mergeRecords", help = "If the records in the log files should be merged", unspecifiedDefaultValue = "false") final Boolean shouldMerge) throws IOException {
    System.out.println("===============> Showing only " + limit + " records <===============");
    HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
    FileSystem fs = client.getFs();
    List<String> logFilePaths = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(logFilePathPattern)).stream()
            .map(status -> status.getPath().toString())
            .sorted(Comparator.reverseOrder())
            .collect(Collectors.toList());
    // the glob must match at least one log file
    assert logFilePaths.size() > 0 : "There is no log file";
    // TODO : readerSchema can change across blocks/log files, fix this inside Scanner
    AvroSchemaConverter converter = new AvroSchemaConverter();
    // get schema from last log file
    Schema readerSchema = converter.convert(Objects.requireNonNull(TableSchemaResolver.readSchemaFromLogFile(fs, new Path(logFilePaths.get(logFilePaths.size() - 1)))));
    List<IndexedRecord> allRecords = new ArrayList<>();
    if (shouldMerge) {
        System.out.println("===========================> MERGING RECORDS <===================");
        HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
                .withFileSystem(fs)
                .withBasePath(client.getBasePath())
                .withLogFilePaths(logFilePaths)
                .withReaderSchema(readerSchema)
                .withLatestInstantTime(client.getActiveTimeline().getCommitTimeline().lastInstant().get().getTimestamp())
                .withReadBlocksLazily(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_LAZY_BLOCK_READ_ENABLE.defaultValue()))
                .withReverseReader(Boolean.parseBoolean(HoodieCompactionConfig.COMPACTION_REVERSE_LOG_READ_ENABLE.defaultValue()))
                .withBufferSize(HoodieMemoryConfig.MAX_DFS_STREAM_BUFFER_SIZE.defaultValue())
                .withMaxMemorySizeInBytes(HoodieMemoryConfig.DEFAULT_MAX_MEMORY_FOR_SPILLABLE_MAP_IN_BYTES)
                .withSpillableMapBasePath(HoodieMemoryConfig.SPILLABLE_MAP_BASE_PATH.defaultValue())
                .withDiskMapType(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.defaultValue())
                .withBitCaskDiskMapCompressionEnabled(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.defaultValue())
                .build();
        for (HoodieRecord<? extends HoodieRecordPayload> hoodieRecord : scanner) {
            Option<IndexedRecord> record = hoodieRecord.getData().getInsertValue(readerSchema);
            if (allRecords.size() < limit) {
                allRecords.add(record.get());
            }
        }
    } else {
        for (String logFile : logFilePaths) {
            Schema writerSchema = new AvroSchemaConverter().convert(Objects.requireNonNull(TableSchemaResolver.readSchemaFromLogFile(client.getFs(), new Path(logFile))));
            HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(new Path(logFile)), writerSchema);
            // read the avro blocks
            while (reader.hasNext()) {
                HoodieLogBlock n = reader.next();
                if (n instanceof HoodieDataBlock) {
                    HoodieDataBlock blk = (HoodieDataBlock) n;
                    try (ClosableIterator<IndexedRecord> recordItr = blk.getRecordItr()) {
                        recordItr.forEachRemaining(record -> {
                            if (allRecords.size() < limit) {
                                allRecords.add(record);
                            }
                        });
                    }
                }
            }
            reader.close();
            if (allRecords.size() >= limit) {
                break;
            }
        }
    }
    String[][] rows = new String[allRecords.size()][];
    int i = 0;
    for (IndexedRecord record : allRecords) {
        String[] data = new String[1];
        data[0] = record.toString();
        rows[i] = data;
        i++;
    }
    return HoodiePrintHelper.print(new String[] { HoodieTableHeaderFields.HEADER_RECORDS }, rows);
}
Also used : Path(org.apache.hadoop.fs.Path) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) ArrayList(java.util.ArrayList) Reader(org.apache.hudi.common.table.log.HoodieLogFormat.Reader) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) CliCommand(org.springframework.shell.core.annotation.CliCommand)
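
For quick experiments outside the CLI, the unmerged path above reduces to a handful of calls. Below is a minimal sketch, not part of the Hudi source, assuming an existing Hadoop FileSystem fs and a single fully qualified log file path logFilePath (both hypothetical names); the types are the same ones the command above uses.

// Sketch only: read every record from one log file's data blocks.
Schema writerSchema = new AvroSchemaConverter()
        .convert(Objects.requireNonNull(TableSchemaResolver.readSchemaFromLogFile(fs, new Path(logFilePath))));
List<IndexedRecord> records = new ArrayList<>();
try (HoodieLogFormat.Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(new Path(logFilePath)), writerSchema)) {
    while (reader.hasNext()) {
        HoodieLogBlock block = reader.next();
        // only data blocks carry records; command and corrupt blocks are skipped
        if (block instanceof HoodieDataBlock) {
            try (ClosableIterator<IndexedRecord> recordItr = ((HoodieDataBlock) block).getRecordItr()) {
                recordItr.forEachRemaining(records::add);
            }
        }
    }
}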

Example 32 with HoodieDataBlock

use of org.apache.hudi.common.table.log.block.HoodieDataBlock in project hudi by apache.

From the class HoodieLogFileCommand, method showLogFileCommits:

@CliCommand(value = "show logfile metadata", help = "Read commit metadata from log files")
public String showLogFileCommits(
        @CliOption(key = "logFilePathPattern", mandatory = true, help = "Fully qualified path for the log file") final String logFilePathPattern,
        @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
        @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
        @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
        @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException {
    FileSystem fs = HoodieCLI.getTableMetaClient().getFs();
    List<String> logFilePaths = FSUtils.getGlobStatusExcludingMetaFolder(fs, new Path(logFilePathPattern)).stream()
            .map(status -> status.getPath().toString())
            .collect(Collectors.toList());
    Map<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> commitCountAndMetadata = new HashMap<>();
    int numCorruptBlocks = 0;
    int dummyInstantTimeCount = 0;
    for (String logFilePath : logFilePaths) {
        FileStatus[] fsStatus = fs.listStatus(new Path(logFilePath));
        Schema writerSchema = new AvroSchemaConverter().convert(Objects.requireNonNull(TableSchemaResolver.readSchemaFromLogFile(fs, new Path(logFilePath))));
        Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(fsStatus[0].getPath()), writerSchema);
        // read the avro blocks
        while (reader.hasNext()) {
            HoodieLogBlock n = reader.next();
            String instantTime;
            AtomicInteger recordCount = new AtomicInteger(0);
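            // a corrupt block's header may itself be unreadable, so the lookup below is guarded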
            if (n instanceof HoodieCorruptBlock) {
                try {
                    instantTime = n.getLogBlockHeader().get(HeaderMetadataType.INSTANT_TIME);
                    if (instantTime == null) {
                        throw new Exception("Invalid instant time " + instantTime);
                    }
                } catch (Exception e) {
                    numCorruptBlocks++;
                    instantTime = "corrupt_block_" + numCorruptBlocks;
                // could not read metadata for corrupt block
                }
            } else {
                instantTime = n.getLogBlockHeader().get(HeaderMetadataType.INSTANT_TIME);
                if (instantTime == null) {
                    // This can happen when reading archived commit files since they were written without any instant time
                    dummyInstantTimeCount++;
                    instantTime = "dummy_instant_time_" + dummyInstantTimeCount;
                }
                if (n instanceof HoodieDataBlock) {
                    try (ClosableIterator<IndexedRecord> recordItr = ((HoodieDataBlock) n).getRecordItr()) {
                        recordItr.forEachRemaining(r -> recordCount.incrementAndGet());
                    }
                }
            }
            if (commitCountAndMetadata.containsKey(instantTime)) {
                commitCountAndMetadata.get(instantTime).add(new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount.get()));
            } else {
                List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>> list = new ArrayList<>();
                list.add(new Tuple3<>(n.getBlockType(), new Tuple2<>(n.getLogBlockHeader(), n.getLogBlockFooter()), recordCount.get()));
                commitCountAndMetadata.put(instantTime, list);
            }
        }
        reader.close();
    }
    List<Comparable[]> rows = new ArrayList<>();
    ObjectMapper objectMapper = new ObjectMapper();
    for (Map.Entry<String, List<Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer>>> entry : commitCountAndMetadata.entrySet()) {
        String instantTime = entry.getKey();
        for (Tuple3<HoodieLogBlockType, Tuple2<Map<HeaderMetadataType, String>, Map<HeaderMetadataType, String>>, Integer> tuple3 : entry.getValue()) {
            Comparable[] output = new Comparable[5];
            output[0] = instantTime;
            output[1] = tuple3._3();
            output[2] = tuple3._1().toString();
            output[3] = objectMapper.writeValueAsString(tuple3._2()._1());
            output[4] = objectMapper.writeValueAsString(tuple3._2()._2());
            rows.add(output);
        }
    }
    TableHeader header = new TableHeader()
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_INSTANT_TIME)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_RECORD_COUNT)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_BLOCK_TYPE)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_HEADER_METADATA)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_FOOTER_METADATA);
    return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, rows);
}
Also used : HoodieCorruptBlock(org.apache.hudi.common.table.log.block.HoodieCorruptBlock) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) FileSystem(org.apache.hadoop.fs.FileSystem) Reader(org.apache.hudi.common.table.log.HoodieLogFormat.Reader) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) HoodieTableHeaderFields(org.apache.hudi.cli.HoodieTableHeaderFields) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) FileStatus(org.apache.hadoop.fs.FileStatus) CliOption(org.springframework.shell.core.annotation.CliOption) ArrayList(java.util.ArrayList) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) Map(java.util.Map) HoodieMemoryConfig(org.apache.hudi.config.HoodieMemoryConfig) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) IndexedRecord(org.apache.avro.generic.IndexedRecord) CommandMarker(org.springframework.shell.core.CommandMarker) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) CliCommand(org.springframework.shell.core.annotation.CliCommand) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) HoodieLogBlockType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType) TableHeader(org.apache.hudi.cli.TableHeader) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) IOException(java.io.IOException) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) Tuple3(scala.Tuple3) HoodieCLI(org.apache.hudi.cli.HoodieCLI) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) Objects(java.util.Objects) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) Component(org.springframework.stereotype.Component) List(java.util.List) HoodiePrintHelper(org.apache.hudi.cli.HoodiePrintHelper) Comparator(java.util.Comparator) HoodieCommonConfig(org.apache.hudi.common.config.HoodieCommonConfig) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) FSUtils(org.apache.hudi.common.fs.FSUtils)
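
The same block-walking pattern can be distilled for ad hoc inspection of a single log file. Below is a minimal sketch, not from the Hudi source, assuming an existing Hadoop FileSystem fs and a log file path logFilePath (hypothetical names); it reports the instant time and record count per block, just as the command above does.

// Sketch only: print block type, instant time, and record count for each block.
Schema writerSchema = new AvroSchemaConverter()
        .convert(Objects.requireNonNull(TableSchemaResolver.readSchemaFromLogFile(fs, new Path(logFilePath))));
try (Reader reader = HoodieLogFormat.newReader(fs, new HoodieLogFile(new Path(logFilePath)), writerSchema)) {
    while (reader.hasNext()) {
        HoodieLogBlock block = reader.next();
        // archived blocks may carry no instant time header, as noted in the command above
        String instantTime = block.getLogBlockHeader().get(HeaderMetadataType.INSTANT_TIME);
        int recordCount = 0;
        if (block instanceof HoodieDataBlock) {
            // getRecordItr() returns a ClosableIterator that must be closed to release resources
            try (ClosableIterator<IndexedRecord> recordItr = ((HoodieDataBlock) block).getRecordItr()) {
                while (recordItr.hasNext()) {
                    recordItr.next();
                    recordCount++;
                }
            }
        }
        System.out.println(block.getBlockType() + " @ " + instantTime + " : " + recordCount + " records");
    }
}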

Aggregations

HoodieDataBlock (org.apache.hudi.common.table.log.block.HoodieDataBlock) 32
IndexedRecord (org.apache.avro.generic.IndexedRecord) 30
HashMap (java.util.HashMap) 27
HoodieLogBlock (org.apache.hudi.common.table.log.block.HoodieLogBlock) 26
HeaderMetadataType (org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) 26
Writer (org.apache.hudi.common.table.log.HoodieLogFormat.Writer) 25
Schema (org.apache.avro.Schema) 24
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest) 24
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile) 23
Reader (org.apache.hudi.common.table.log.HoodieLogFormat.Reader) 23
HoodieLogFormat (org.apache.hudi.common.table.log.HoodieLogFormat) 22
Path (org.apache.hadoop.fs.Path) 21
ArrayList (java.util.ArrayList) 20
FileStatus (org.apache.hadoop.fs.FileStatus) 20
HoodieLogFileReader (org.apache.hudi.common.table.log.HoodieLogFileReader) 20
GenericRecord (org.apache.avro.generic.GenericRecord) 19
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream) 19
FileSystem (org.apache.hadoop.fs.FileSystem) 19
List (java.util.List) 18
SchemaTestUtil.getSimpleSchema (org.apache.hudi.common.testutils.SchemaTestUtil.getSimpleSchema) 18