Example 21 with HoodieLogFile

Use of org.apache.hudi.common.model.HoodieLogFile in project hudi by apache.

From the class RealtimeSplit, the method readFromInput.

default void readFromInput(DataInput in) throws IOException {
    // Scalar fields first, in the exact order writeToOutput serializes them
    setBasePath(InputSplitUtils.readString(in));
    setMaxCommitTime(InputSplitUtils.readString(in));
    setBelongsToIncrementalQuery(InputSplitUtils.readBoolean(in));
    // Delta log files: a count followed by one (path, size) pair per file
    int totalLogFiles = in.readInt();
    List<HoodieLogFile> deltaLogPaths = new ArrayList<>(totalLogFiles);
    for (int i = 0; i < totalLogFiles; i++) {
        String logFilePath = InputSplitUtils.readString(in);
        long logFileSize = in.readLong();
        deltaLogPaths.add(new HoodieLogFile(new Path(logFilePath), logFileSize));
    }
    setDeltaLogFiles(deltaLogPaths);
    // Virtual-key info is optional, so a presence flag is read first
    boolean hoodieVirtualKeyPresent = InputSplitUtils.readBoolean(in);
    if (hoodieVirtualKeyPresent) {
        String recordKeyField = InputSplitUtils.readString(in);
        String partitionPathField = InputSplitUtils.readString(in);
        // The field indexes are serialized as strings, hence the parseInt round trip
        int recordFieldIndex = Integer.parseInt(InputSplitUtils.readString(in));
        int partitionPathIndex = Integer.parseInt(InputSplitUtils.readString(in));
        setVirtualKeyInfo(Option.of(new HoodieVirtualKeyInfo(recordKeyField, partitionPathField, recordFieldIndex, partitionPathIndex)));
    }
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile)
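
Taken together, the reads above define the wire layout of a serialized RealtimeSplit; the field order below is inferred directly from the method body:

basePath (string)
maxCommitTime (string)
belongsToIncrementalQuery (boolean)
totalLogFiles (int), then per delta log file: path (string) and size (long)
virtualKeyPresent (boolean); if true, four more strings follow: recordKeyField, partitionPathField, recordKeyFieldIndex, partitionPathIndex (the two indexes are parsed back to int)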

Example 22 with HoodieLogFile

Use of org.apache.hudi.common.model.HoodieLogFile in project hudi by apache.

From the class RealtimeSplit, the method writeToOutput.

default void writeToOutput(DataOutput out) throws IOException {
    // Scalar fields first; the order must mirror readFromInput
    InputSplitUtils.writeString(getBasePath(), out);
    InputSplitUtils.writeString(getMaxCommitTime(), out);
    InputSplitUtils.writeBoolean(getBelongsToIncrementalQuery(), out);
    // Delta log files: count, then a (path, size) pair per file
    out.writeInt(getDeltaLogFiles().size());
    for (HoodieLogFile logFile : getDeltaLogFiles()) {
        InputSplitUtils.writeString(logFile.getPath().toString(), out);
        out.writeLong(logFile.getFileSize());
    }
    // Virtual-key info is optional, so write a presence flag first
    Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt = getVirtualKeyInfo();
    if (!virtualKeyInfoOpt.isPresent()) {
        InputSplitUtils.writeBoolean(false, out);
    } else {
        InputSplitUtils.writeBoolean(true, out);
        InputSplitUtils.writeString(virtualKeyInfoOpt.get().getRecordKeyField(), out);
        InputSplitUtils.writeString(virtualKeyInfoOpt.get().getPartitionPathField(), out);
        InputSplitUtils.writeString(String.valueOf(virtualKeyInfoOpt.get().getRecordKeyFieldIndex()), out);
        InputSplitUtils.writeString(String.valueOf(virtualKeyInfoOpt.get().getPartitionPathFieldIndex()), out);
    }
}
Also used : HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile)
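
The write order here mirrors the read order in Example 21 field for field, so a split survives a byte-level round trip. A minimal sketch, assuming split and copy are concrete RealtimeSplit instances supplied by the caller (hypothetical variables; DataOutputStream and DataInputStream satisfy the DataOutput/DataInput parameters, java.io imports assumed):

static void roundTrip(RealtimeSplit split, RealtimeSplit copy) throws IOException {
    // Serialize into an in-memory buffer via the default method above ...
    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    split.writeToOutput(new DataOutputStream(buffer));
    // ... then rehydrate a second instance from the same bytes
    copy.readFromInput(new DataInputStream(new ByteArrayInputStream(buffer.toByteArray())));
}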

Example 23 with HoodieLogFile

Use of org.apache.hudi.common.model.HoodieLogFile in project hudi by apache.

From the class FileSystemViewCommand, the method showLatestFileSlices.

@CliCommand(value = "show fsview latest", help = "Show latest file-system view")
public String showLatestFileSlices(
        @CliOption(key = { "partitionPath" }, help = "A valid partition path", mandatory = true) String partition,
        @CliOption(key = { "baseFileOnly" }, help = "Only display base file view", unspecifiedDefaultValue = "false") boolean baseFileOnly,
        @CliOption(key = { "maxInstant" }, help = "File-Slices upto this instant are displayed", unspecifiedDefaultValue = "") String maxInstant,
        @CliOption(key = { "merge" }, help = "Merge File Slices due to pending compaction", unspecifiedDefaultValue = "true") final boolean merge,
        @CliOption(key = { "includeMax" }, help = "Include Max Instant", unspecifiedDefaultValue = "false") boolean includeMaxInstant,
        @CliOption(key = { "includeInflight" }, help = "Include Inflight Instants", unspecifiedDefaultValue = "false") boolean includeInflight,
        @CliOption(key = { "excludeCompaction" }, help = "Exclude compaction Instants", unspecifiedDefaultValue = "false") boolean excludeCompaction,
        @CliOption(key = { "limit" }, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit,
        @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
        @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
        @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException {
    HoodieTableFileSystemView fsView = buildFileSystemView(partition, maxInstant, baseFileOnly, includeMaxInstant, includeInflight, excludeCompaction);
    List<Comparable[]> rows = new ArrayList<>();
    final Stream<FileSlice> fileSliceStream;
    if (!merge) {
        // Raw latest file slices for the partition, no compaction merging
        fileSliceStream = fsView.getLatestFileSlices(partition);
    } else {
        // When merging slices pending compaction, default maxInstant to the
        // last completed commit/compaction instant if none was supplied
        if (maxInstant.isEmpty()) {
            maxInstant = HoodieCLI.getTableMetaClient().getActiveTimeline().filterCompletedAndCompactionInstants().lastInstant().get().getTimestamp();
        }
        fileSliceStream = fsView.getLatestMergedFileSlicesBeforeOrOn(partition, maxInstant);
    }
    fileSliceStream.forEach(fs -> {
        int idx = 0;
        // 5 columns in the base-file-only view, 13 once the log/delta columns are added
        Comparable[] row = new Comparable[baseFileOnly ? 5 : 13];
        row[idx++] = partition;
        row[idx++] = fs.getFileId();
        row[idx++] = fs.getBaseInstantTime();
        row[idx++] = fs.getBaseFile().isPresent() ? fs.getBaseFile().get().getPath() : "";
        long dataFileSize = fs.getBaseFile().isPresent() ? fs.getBaseFile().get().getFileSize() : -1;
        row[idx++] = dataFileSize;
        if (!baseFileOnly) {
            row[idx++] = fs.getLogFiles().count();
            row[idx++] = fs.getLogFiles().mapToLong(HoodieLogFile::getFileSize).sum();
            long logFilesScheduledForCompactionTotalSize = fs.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime())).mapToLong(HoodieLogFile::getFileSize).sum();
            row[idx++] = logFilesScheduledForCompactionTotalSize;
            long logFilesUnscheduledTotalSize = fs.getLogFiles().filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime())).mapToLong(HoodieLogFile::getFileSize).sum();
            row[idx++] = logFilesUnscheduledTotalSize;
            double logSelectedForCompactionToBaseRatio = dataFileSize > 0 ? logFilesScheduledForCompactionTotalSize / (dataFileSize * 1.0) : -1;
            row[idx++] = logSelectedForCompactionToBaseRatio;
            double logUnscheduledToBaseRatio = dataFileSize > 0 ? logFilesUnscheduledTotalSize / (dataFileSize * 1.0) : -1;
            row[idx++] = logUnscheduledToBaseRatio;
            row[idx++] = fs.getLogFiles().filter(lf -> lf.getBaseCommitTime().equals(fs.getBaseInstantTime())).collect(Collectors.toList()).toString();
            row[idx++] = fs.getLogFiles().filter(lf -> !lf.getBaseCommitTime().equals(fs.getBaseInstantTime())).collect(Collectors.toList()).toString();
        }
        rows.add(row);
    });
    Function<Object, String> converterFunction = entry -> NumericUtils.humanReadableByteCount((Double.parseDouble(entry.toString())));
    Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
    fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_DATA_FILE_SIZE, converterFunction);
    if (!baseFileOnly) {
        fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_TOTAL_DELTA_SIZE, converterFunction);
        fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_DELTA_SIZE_SCHEDULED, converterFunction);
        fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_DELTA_SIZE_UNSCHEDULED, converterFunction);
    }
    TableHeader header = new TableHeader()
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_FILE_ID)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_BASE_INSTANT)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_DATA_FILE)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_DATA_FILE_SIZE);
    if (!baseFileOnly) {
        header = header
                .addTableHeaderField(HoodieTableHeaderFields.HEADER_NUM_DELTA_FILES)
                .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_DELTA_SIZE)
                .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_SIZE_SCHEDULED)
                .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_SIZE_UNSCHEDULED)
                .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_BASE_SCHEDULED)
                .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_BASE_UNSCHEDULED)
                .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_FILES_SCHEDULED)
                .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_FILES_UNSCHEDULED);
    }
    return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTableHeaderFields(org.apache.hudi.cli.HoodieTableHeaderFields) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) FileStatus(org.apache.hadoop.fs.FileStatus) CliOption(org.springframework.shell.core.annotation.CliOption) Function(java.util.function.Function) ArrayList(java.util.ArrayList) BiPredicate(java.util.function.BiPredicate) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) CommandMarker(org.springframework.shell.core.CommandMarker) CliCommand(org.springframework.shell.core.annotation.CliCommand) TableHeader(org.apache.hudi.cli.TableHeader) IOException(java.io.IOException) HoodieDefaultTimeline(org.apache.hudi.common.table.timeline.HoodieDefaultTimeline) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) Serializable(java.io.Serializable) HoodieCLI(org.apache.hudi.cli.HoodieCLI) Component(org.springframework.stereotype.Component) List(java.util.List) Stream(java.util.stream.Stream) HoodiePrintHelper(org.apache.hudi.cli.HoodiePrintHelper) FSUtils(org.apache.hudi.common.fs.FSUtils) NumericUtils(org.apache.hudi.common.util.NumericUtils)
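
From the hudi-cli shell, the command registered above is invoked with the option keys declared in the annotations; the partition path value below is illustrative:

show fsview latest --partitionPath 2021/07/01 --merge true --limit 20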

Example 24 with HoodieLogFile

Use of org.apache.hudi.common.model.HoodieLogFile in project hudi by apache.

From the class ArchivedCommitsCommand, the method showArchivedCommits.

@CliCommand(value = "show archived commit stats", help = "Read commits from archived files and show details")
public String showArchivedCommits(
        @CliOption(key = { "archiveFolderPattern" }, help = "Archive Folder", unspecifiedDefaultValue = "") String folder,
        @CliOption(key = { "limit" }, help = "Limit commits", unspecifiedDefaultValue = "-1") final Integer limit,
        @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
        @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
        @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException {
    System.out.println("===============> Showing only " + limit + " archived commits <===============");
    String basePath = HoodieCLI.getTableMetaClient().getBasePath();
    Path archivePath = new Path(HoodieCLI.getTableMetaClient().getArchivePath() + "/.commits_.archive*");
    if (folder != null && !folder.isEmpty()) {
        archivePath = new Path(basePath + "/.hoodie/" + folder);
    }
    FileStatus[] fsStatuses = FSUtils.getFs(basePath, HoodieCLI.conf).globStatus(archivePath);
    List<Comparable[]> allStats = new ArrayList<>();
    for (FileStatus fs : fsStatuses) {
        // read the archived file
        Reader reader = HoodieLogFormat.newReader(FSUtils.getFs(basePath, HoodieCLI.conf), new HoodieLogFile(fs.getPath()), HoodieArchivedMetaEntry.getClassSchema());
        List<IndexedRecord> readRecords = new ArrayList<>();
        // read the avro blocks
        while (reader.hasNext()) {
            HoodieAvroDataBlock blk = (HoodieAvroDataBlock) reader.next();
            blk.getRecordItr().forEachRemaining(readRecords::add);
        }
        List<Comparable[]> readCommits = readRecords.stream()
                .map(r -> (GenericRecord) r)
                .filter(r -> r.get("actionType").toString().equals(HoodieTimeline.COMMIT_ACTION)
                        || r.get("actionType").toString().equals(HoodieTimeline.DELTA_COMMIT_ACTION))
                .flatMap(r -> {
            HoodieCommitMetadata metadata = (HoodieCommitMetadata) SpecificData.get().deepCopy(HoodieCommitMetadata.SCHEMA$, r.get("hoodieCommitMetadata"));
            final String instantTime = r.get("commitTime").toString();
            final String action = r.get("actionType").toString();
            return metadata.getPartitionToWriteStats().values().stream().flatMap(hoodieWriteStats -> hoodieWriteStats.stream().map(hoodieWriteStat -> {
                List<Comparable> row = new ArrayList<>();
                row.add(action);
                row.add(instantTime);
                row.add(hoodieWriteStat.getPartitionPath());
                row.add(hoodieWriteStat.getFileId());
                row.add(hoodieWriteStat.getPrevCommit());
                row.add(hoodieWriteStat.getNumWrites());
                row.add(hoodieWriteStat.getNumInserts());
                row.add(hoodieWriteStat.getNumDeletes());
                row.add(hoodieWriteStat.getNumUpdateWrites());
                row.add(hoodieWriteStat.getTotalLogFiles());
                row.add(hoodieWriteStat.getTotalLogBlocks());
                row.add(hoodieWriteStat.getTotalCorruptLogBlock());
                row.add(hoodieWriteStat.getTotalRollbackBlocks());
                row.add(hoodieWriteStat.getTotalLogRecords());
                row.add(hoodieWriteStat.getTotalUpdatedRecordsCompacted());
                row.add(hoodieWriteStat.getTotalWriteBytes());
                row.add(hoodieWriteStat.getTotalWriteErrors());
                return row;
            })).map(rowList -> rowList.toArray(new Comparable[0]));
        }).collect(Collectors.toList());
        allStats.addAll(readCommits);
        reader.close();
    }
    TableHeader header = new TableHeader()
            .addTableHeaderField("action")
            .addTableHeaderField("instant")
            .addTableHeaderField("partition")
            .addTableHeaderField("file_id")
            .addTableHeaderField("prev_instant")
            .addTableHeaderField("num_writes")
            .addTableHeaderField("num_inserts")
            .addTableHeaderField("num_deletes")
            .addTableHeaderField("num_update_writes")
            .addTableHeaderField("total_log_files")
            .addTableHeaderField("total_log_blocks")
            .addTableHeaderField("total_corrupt_log_blocks")
            .addTableHeaderField("total_rollback_blocks")
            .addTableHeaderField("total_log_records")
            .addTableHeaderField("total_updated_records_compacted")
            .addTableHeaderField("total_write_bytes")
            .addTableHeaderField("total_write_errors");
    return HoodiePrintHelper.print(header, new HashMap<>(), sortByField, descending, limit, headerOnly, allStats);
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieArchivedMetaEntry(org.apache.hudi.avro.model.HoodieArchivedMetaEntry) Reader(org.apache.hudi.common.table.log.HoodieLogFormat.Reader) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) FileStatus(org.apache.hadoop.fs.FileStatus) CliOption(org.springframework.shell.core.annotation.CliOption) ArrayList(java.util.ArrayList) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) SpecificData(org.apache.avro.specific.SpecificData) CommandMarker(org.springframework.shell.core.CommandMarker) GenericRecord(org.apache.avro.generic.GenericRecord) CliCommand(org.springframework.shell.core.annotation.CliCommand) TableHeader(org.apache.hudi.cli.TableHeader) IOException(java.io.IOException) HoodieCommitMetadata(org.apache.hudi.avro.model.HoodieCommitMetadata) Collectors(java.util.stream.Collectors) HoodieCLI(org.apache.hudi.cli.HoodieCLI) Component(org.springframework.stereotype.Component) List(java.util.List) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodiePrintHelper(org.apache.hudi.cli.HoodiePrintHelper) FSUtils(org.apache.hudi.common.fs.FSUtils)
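
Likewise invoked from the hudi-cli shell; sort fields match the header names built above, and the values are illustrative:

show archived commit stats --limit 10 --sortBy total_write_bytes --desc true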

Example 25 with HoodieLogFile

Use of org.apache.hudi.common.model.HoodieLogFile in project hudi by apache.

From the class AbstractHoodieLogRecordReader, the method scan.

public synchronized void scan(Option<List<String>> keys) {
    currentInstantLogBlocks = new ArrayDeque<>();
    progress = 0.0f;
    totalLogFiles = new AtomicLong(0);
    totalRollbacks = new AtomicLong(0);
    totalCorruptBlocks = new AtomicLong(0);
    totalLogBlocks = new AtomicLong(0);
    totalLogRecords = new AtomicLong(0);
    HoodieLogFormatReader logFormatReaderWrapper = null;
    HoodieTimeline commitsTimeline = this.hoodieTableMetaClient.getCommitsTimeline();
    HoodieTimeline completedInstantsTimeline = commitsTimeline.filterCompletedInstants();
    HoodieTimeline inflightInstantsTimeline = commitsTimeline.filterInflights();
    try {
        // Get the key field based on populate meta fields config
        // and the table type
        final String keyField = getKeyField();
        // Iterate over the paths
        logFormatReaderWrapper = new HoodieLogFormatReader(fs,
                logFilePaths.stream().map(logFile -> new HoodieLogFile(new Path(logFile))).collect(Collectors.toList()),
                readerSchema, readBlocksLazily, reverseReader, bufferSize, !enableFullScan, keyField);
        Set<HoodieLogFile> scannedLogFiles = new HashSet<>();
        while (logFormatReaderWrapper.hasNext()) {
            HoodieLogFile logFile = logFormatReaderWrapper.getLogFile();
            LOG.info("Scanning log file " + logFile);
            scannedLogFiles.add(logFile);
            totalLogFiles.set(scannedLogFiles.size());
            // Use the HoodieLogFileReader to iterate through the blocks in the log file
            HoodieLogBlock logBlock = logFormatReaderWrapper.next();
            final String instantTime = logBlock.getLogBlockHeader().get(INSTANT_TIME);
            totalLogBlocks.incrementAndGet();
            if (logBlock.getBlockType() != CORRUPT_BLOCK && !HoodieTimeline.compareTimestamps(logBlock.getLogBlockHeader().get(INSTANT_TIME), HoodieTimeline.LESSER_THAN_OR_EQUALS, this.latestInstantTime)) {
                // hit a block with instant time greater than should be processed, stop processing further
                break;
            }
            if (logBlock.getBlockType() != CORRUPT_BLOCK && logBlock.getBlockType() != COMMAND_BLOCK) {
                if (!completedInstantsTimeline.containsOrBeforeTimelineStarts(instantTime) || inflightInstantsTimeline.containsInstant(instantTime)) {
                    // hit an uncommitted block possibly from a failed write, move to the next one and skip processing this one
                    continue;
                }
                if (instantRange.isPresent() && !instantRange.get().isInRange(instantTime)) {
                    // filter the log block by instant range
                    continue;
                }
            }
            switch(logBlock.getBlockType()) {
                case HFILE_DATA_BLOCK:
                case AVRO_DATA_BLOCK:
                case PARQUET_DATA_BLOCK:
                    LOG.info("Reading a data block from file " + logFile.getPath() + " at instant " + logBlock.getLogBlockHeader().get(INSTANT_TIME));
                    if (isNewInstantBlock(logBlock) && !readBlocksLazily) {
                        // If this is an avro data block belonging to a different commit/instant,
                        // then merge the last blocks and records into the main result
                        processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys);
                    }
                    // store the current block
                    currentInstantLogBlocks.push(logBlock);
                    break;
                case DELETE_BLOCK:
                    LOG.info("Reading a delete block from file " + logFile.getPath());
                    if (isNewInstantBlock(logBlock) && !readBlocksLazily) {
                        // If this is a delete data block belonging to a different commit/instant,
                        // then merge the last blocks and records into the main result
                        processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys);
                    }
                    // store deletes so can be rolled back
                    currentInstantLogBlocks.push(logBlock);
                    break;
                case COMMAND_BLOCK:
                    // Consider the following scenario
                    // (Time 0, C1, Task T1) -> Running
                    // (Time 1, C1, Task T1) -> Failed (Wrote either a corrupt block or a correct
                    // DataBlock (B1) with commitTime C1
                    // (Time 2, C1, Task T1.2) -> Running (Task T1 was retried and the attempt number is 2)
                    // (Time 3, C1, Task T1.2) -> Finished (Wrote a correct DataBlock B2)
                    // Now a logFile L1 can have 2 correct Datablocks (B1 and B2) which are the same.
                    // Say, commit C1 eventually failed and a rollback is triggered.
                    // Rollback will write only 1 rollback block (R1) since it assumes one block is
                    // written per ingestion batch for a file but in reality we need to rollback (B1 & B2)
                    // The following code ensures the same rollback block (R1) is used to rollback
                    // both B1 & B2
                    LOG.info("Reading a command block from file " + logFile.getPath());
                    // This is a command block - take appropriate action based on the command
                    HoodieCommandBlock commandBlock = (HoodieCommandBlock) logBlock;
                    String targetInstantForCommandBlock = logBlock.getLogBlockHeader().get(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME);
                    // there can be different types of command blocks
                    switch (commandBlock.getType()) {
                        case ROLLBACK_PREVIOUS_BLOCK:
                            // Rollback the last read log block
                            // Get commit time from last record block, compare with targetCommitTime,
                            // rollback only if equal, this is required in scenarios of invalid/extra
                            // rollback blocks written due to failures during the rollback operation itself
                            // and ensures the same rollback block (R1) is used to rollback both B1 & B2 with
                            // same instant_time
                            int numBlocksRolledBack = 0;
                            totalRollbacks.incrementAndGet();
                            while (!currentInstantLogBlocks.isEmpty()) {
                                HoodieLogBlock lastBlock = currentInstantLogBlocks.peek();
                                // handle corrupt blocks separately since they may not have metadata
                                if (lastBlock.getBlockType() == CORRUPT_BLOCK) {
                                    LOG.info("Rolling back the last corrupted log block read in " + logFile.getPath());
                                    currentInstantLogBlocks.pop();
                                    numBlocksRolledBack++;
                                } else if (targetInstantForCommandBlock.contentEquals(lastBlock.getLogBlockHeader().get(INSTANT_TIME))) {
                                    // rollback last data block or delete block
                                    LOG.info("Rolling back the last log block read in " + logFile.getPath());
                                    currentInstantLogBlocks.pop();
                                    numBlocksRolledBack++;
                                } else if (!targetInstantForCommandBlock.contentEquals(currentInstantLogBlocks.peek().getLogBlockHeader().get(INSTANT_TIME))) {
                                    // invalid or extra rollback block
                                    LOG.warn("TargetInstantTime " + targetInstantForCommandBlock + " invalid or extra rollback command block in " + logFile.getPath());
                                    break;
                                } else {
                                    // this should not happen ideally
                                    LOG.warn("Unable to apply rollback command block in " + logFile.getPath());
                                }
                            }
                            LOG.info("Number of applied rollback blocks " + numBlocksRolledBack);
                            break;
                        default:
                            throw new UnsupportedOperationException("Command type not yet supported.");
                    }
                    break;
                case CORRUPT_BLOCK:
                    LOG.info("Found a corrupt block in " + logFile.getPath());
                    totalCorruptBlocks.incrementAndGet();
                    // If there is a corrupt block - we will assume that this was the next data block
                    currentInstantLogBlocks.push(logBlock);
                    break;
                default:
                    throw new UnsupportedOperationException("Block type not supported yet");
            }
        }
        // merge the last read block when all the blocks are done reading
        if (!currentInstantLogBlocks.isEmpty()) {
            LOG.info("Merging the final data blocks");
            processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys);
        }
        // Done
        progress = 1.0f;
    } catch (IOException e) {
        LOG.error("Got IOException when reading log file", e);
        throw new HoodieIOException("IOException when reading log file ", e);
    } catch (Exception e) {
        LOG.error("Got exception when reading log file", e);
        throw new HoodieException("Exception when reading log file ", e);
    } finally {
        try {
            if (null != logFormatReaderWrapper) {
                logFormatReaderWrapper.close();
            }
        } catch (IOException ioe) {
            // Eat exception as we do not want to mask the original exception that can happen
            LOG.error("Unable to close log format reader", ioe);
        }
    }
}
Also used : Arrays(java.util.Arrays) HoodieHFileDataBlock(org.apache.hudi.common.table.log.block.HoodieHFileDataBlock) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) HoodieCommandBlock(org.apache.hudi.common.table.log.block.HoodieCommandBlock) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) Deque(java.util.Deque) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieParquetDataBlock(org.apache.hudi.common.table.log.block.HoodieParquetDataBlock) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) CORRUPT_BLOCK(org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType.CORRUPT_BLOCK) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) COMMAND_BLOCK(org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType.COMMAND_BLOCK) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) INSTANT_TIME(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType.INSTANT_TIME) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) ArrayDeque(java.util.ArrayDeque) HoodieDeleteBlock(org.apache.hudi.common.table.log.block.HoodieDeleteBlock) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) SpillableMapUtils(org.apache.hudi.common.util.SpillableMapUtils) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair)
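
scan() is normally driven through a concrete subclass rather than called directly. A minimal sketch using the HoodieMergedLogRecordScanner builder; the builder method names reflect Hudi sources of roughly this vintage (0.10.x/0.11.x), the spill path is a hypothetical example, and both should be verified against the version you build against:

import java.util.List;
import org.apache.avro.Schema;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner;

static HoodieMergedLogRecordScanner openScanner(FileSystem fs, String basePath,
        List<String> logFilePaths, Schema readerSchema, String latestInstant) {
    return HoodieMergedLogRecordScanner.newBuilder()
            .withFileSystem(fs)
            .withBasePath(basePath)
            .withLogFilePaths(logFilePaths)              // delta log paths, as in scan()
            .withReaderSchema(readerSchema)              // Avro schema used to decode data blocks
            .withLatestInstantTime(latestInstant)        // blocks newer than this instant are skipped
            .withMaxMemorySizeInBytes(1024 * 1024 * 1024L)
            .withReadBlocksLazily(true)
            .withReverseReader(false)
            .withBufferSize(16 * 1024 * 1024)
            .withSpillableMapBasePath("/tmp/hudi-spill") // hypothetical spill directory
            .build();                                    // merged records are then available via getRecords()
}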

Aggregations

HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 77
IOException (java.io.IOException): 48
List (java.util.List): 46
Path (org.apache.hadoop.fs.Path): 45
Map (java.util.Map): 42
Collectors (java.util.stream.Collectors): 42
ArrayList (java.util.ArrayList): 38
Option (org.apache.hudi.common.util.Option): 37
FileSlice (org.apache.hudi.common.model.FileSlice): 34
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 29
FileStatus (org.apache.hadoop.fs.FileStatus): 28
HashMap (java.util.HashMap): 26
FSUtils (org.apache.hudi.common.fs.FSUtils): 26
Pair (org.apache.hudi.common.util.collection.Pair): 25
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 24
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 23
Set (java.util.Set): 22
LogManager (org.apache.log4j.LogManager): 22
Logger (org.apache.log4j.Logger): 22
HoodieLogFormat (org.apache.hudi.common.table.log.HoodieLogFormat): 21