
Example 46 with HoodieTimeline

Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class AbstractHoodieLogRecordReader, method scan().

public synchronized void scan(Option<List<String>> keys) {
    currentInstantLogBlocks = new ArrayDeque<>();
    progress = 0.0f;
    totalLogFiles = new AtomicLong(0);
    totalRollbacks = new AtomicLong(0);
    totalCorruptBlocks = new AtomicLong(0);
    totalLogBlocks = new AtomicLong(0);
    totalLogRecords = new AtomicLong(0);
    HoodieLogFormatReader logFormatReaderWrapper = null;
    HoodieTimeline commitsTimeline = this.hoodieTableMetaClient.getCommitsTimeline();
    HoodieTimeline completedInstantsTimeline = commitsTimeline.filterCompletedInstants();
    HoodieTimeline inflightInstantsTimeline = commitsTimeline.filterInflights();
    try {
        // Get the key field based on populate meta fields config
        // and the table type
        final String keyField = getKeyField();
        // Iterate over the paths
        logFormatReaderWrapper = new HoodieLogFormatReader(fs,
                logFilePaths.stream().map(logFile -> new HoodieLogFile(new Path(logFile))).collect(Collectors.toList()),
                readerSchema, readBlocksLazily, reverseReader, bufferSize, !enableFullScan, keyField);
        Set<HoodieLogFile> scannedLogFiles = new HashSet<>();
        while (logFormatReaderWrapper.hasNext()) {
            HoodieLogFile logFile = logFormatReaderWrapper.getLogFile();
            LOG.info("Scanning log file " + logFile);
            scannedLogFiles.add(logFile);
            totalLogFiles.set(scannedLogFiles.size());
            // Use the HoodieLogFileReader to iterate through the blocks in the log file
            HoodieLogBlock logBlock = logFormatReaderWrapper.next();
            final String instantTime = logBlock.getLogBlockHeader().get(INSTANT_TIME);
            totalLogBlocks.incrementAndGet();
            if (logBlock.getBlockType() != CORRUPT_BLOCK && !HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, this.latestInstantTime)) {
                // hit a block whose instant time is greater than the latest instant to process; stop scanning further
                break;
            }
            if (logBlock.getBlockType() != CORRUPT_BLOCK && logBlock.getBlockType() != COMMAND_BLOCK) {
                if (!completedInstantsTimeline.containsOrBeforeTimelineStarts(instantTime) || inflightInstantsTimeline.containsInstant(instantTime)) {
                    // hit an uncommitted block, possibly from a failed write; skip it and move on to the next one
                    continue;
                }
                if (instantRange.isPresent() && !instantRange.get().isInRange(instantTime)) {
                    // filter the log block by instant range
                    continue;
                }
            }
            switch(logBlock.getBlockType()) {
                case HFILE_DATA_BLOCK:
                case AVRO_DATA_BLOCK:
                case PARQUET_DATA_BLOCK:
                    LOG.info("Reading a data block from file " + logFile.getPath() + " at instant " + logBlock.getLogBlockHeader().get(INSTANT_TIME));
                    if (isNewInstantBlock(logBlock) && !readBlocksLazily) {
                        // If this is a data block belonging to a different commit/instant,
                        // then merge the previously queued blocks and records into the main result
                        processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys);
                    }
                    // store the current block
                    currentInstantLogBlocks.push(logBlock);
                    break;
                case DELETE_BLOCK:
                    LOG.info("Reading a delete block from file " + logFile.getPath());
                    if (isNewInstantBlock(logBlock) && !readBlocksLazily) {
                        // If this is a delete block belonging to a different commit/instant,
                        // then merge the previously queued blocks and records into the main result
                        processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys);
                    }
                    // store deletes so they can be rolled back
                    currentInstantLogBlocks.push(logBlock);
                    break;
                case COMMAND_BLOCK:
                    // Consider the following scenario:
                    // (Time 0, C1, Task T1) -> Running
                    // (Time 1, C1, Task T1) -> Failed (wrote either a corrupt block or a correct
                    // DataBlock (B1) with commitTime C1)
                    // (Time 2, C1, Task T1.2) -> Running (Task T1 was retried; the attempt number is 2)
                    // (Time 3, C1, Task T1.2) -> Finished (wrote a correct DataBlock B2)
                    // Now a log file L1 can have 2 correct DataBlocks (B1 and B2) for the same commit.
                    // Say commit C1 eventually failed and a rollback was triggered.
                    // Rollback writes only 1 rollback block (R1), since it assumes one block is
                    // written per ingestion batch for a file, but in reality both B1 and B2 need to
                    // be rolled back. The following code ensures the same rollback block (R1) is
                    // used to roll back both B1 and B2.
                    LOG.info("Reading a command block from file " + logFile.getPath());
                    // This is a command block - take appropriate action based on the command
                    HoodieCommandBlock commandBlock = (HoodieCommandBlock) logBlock;
                    String targetInstantForCommandBlock = logBlock.getLogBlockHeader().get(HoodieLogBlock.HeaderMetadataType.TARGET_INSTANT_TIME);
                    // there can be different types of command blocks
                    switch(commandBlock.getType()) {
                        case ROLLBACK_PREVIOUS_BLOCK:
                            // Rollback the last read log block
                            // Get commit time from last record block, compare with targetCommitTime,
                            // rollback only if equal, this is required in scenarios of invalid/extra
                            // rollback blocks written due to failures during the rollback operation itself
                            // and ensures the same rollback block (R1) is used to rollback both B1 & B2 with
                            // same instant_time
                            int numBlocksRolledBack = 0;
                            totalRollbacks.incrementAndGet();
                            while (!currentInstantLogBlocks.isEmpty()) {
                                HoodieLogBlock lastBlock = currentInstantLogBlocks.peek();
                                // handle corrupt blocks separately since they may not have metadata
                                if (lastBlock.getBlockType() == CORRUPT_BLOCK) {
                                    LOG.info("Rolling back the last corrupted log block read in " + logFile.getPath());
                                    currentInstantLogBlocks.pop();
                                    numBlocksRolledBack++;
                                } else if (targetInstantForCommandBlock.contentEquals(lastBlock.getLogBlockHeader().get(INSTANT_TIME))) {
                                    // rollback last data block or delete block
                                    LOG.info("Rolling back the last log block read in " + logFile.getPath());
                                    currentInstantLogBlocks.pop();
                                    numBlocksRolledBack++;
                                } else if (!targetInstantForCommandBlock.contentEquals(lastBlock.getLogBlockHeader().get(INSTANT_TIME))) {
                                    // invalid or extra rollback block
                                    LOG.warn("TargetInstantTime " + targetInstantForCommandBlock + " invalid or extra rollback command block in " + logFile.getPath());
                                    break;
                                } else {
                                    // should be unreachable: the branches above cover all cases
                                    LOG.warn("Unable to apply rollback command block in " + logFile.getPath());
                                }
                            }
                            LOG.info("Number of applied rollback blocks " + numBlocksRolledBack);
                            break;
                        default:
                            throw new UnsupportedOperationException("Command type not yet supported.");
                    }
                    break;
                case CORRUPT_BLOCK:
                    LOG.info("Found a corrupt block in " + logFile.getPath());
                    totalCorruptBlocks.incrementAndGet();
                    // If there is a corrupt block - we will assume that this was the next data block
                    currentInstantLogBlocks.push(logBlock);
                    break;
                default:
                    throw new UnsupportedOperationException("Block type not supported yet");
            }
        }
        // merge the last read block when all the blocks are done reading
        if (!currentInstantLogBlocks.isEmpty()) {
            LOG.info("Merging the final data blocks");
            processQueuedBlocksForInstant(currentInstantLogBlocks, scannedLogFiles.size(), keys);
        }
        // Done
        progress = 1.0f;
    } catch (IOException e) {
        LOG.error("Got IOException when reading log file", e);
        throw new HoodieIOException("IOException when reading log file", e);
    } catch (Exception e) {
        LOG.error("Got exception when reading log file", e);
        throw new HoodieException("Exception when reading log file", e);
    } finally {
        try {
            if (null != logFormatReaderWrapper) {
                logFormatReaderWrapper.close();
            }
        } catch (IOException ioe) {
            // Eat exception as we do not want to mask the original exception that can happen
            LOG.error("Unable to close log format reader", ioe);
        }
    }
}
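
A minimal sketch of the instant-filtering logic used above, runnable on its own against an existing table. The variable someInstantTime is a hypothetical stand-in for a log block's INSTANT_TIME header value:

HoodieTimeline commitsTimeline = hoodieTableMetaClient.getCommitsTimeline();
HoodieTimeline completedTimeline = commitsTimeline.filterCompletedInstants();
HoodieTimeline inflightTimeline = commitsTimeline.filterInflights();
// Mirrors the scan() check: skip a block if its instant was never committed,
// or is still inflight (possibly a failed write)
boolean skipBlock = !completedTimeline.containsOrBeforeTimelineStarts(someInstantTime)
        || inflightTimeline.containsInstant(someInstantTime);
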
Also used : Arrays(java.util.Arrays) HoodieHFileDataBlock(org.apache.hudi.common.table.log.block.HoodieHFileDataBlock) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) HoodieCommandBlock(org.apache.hudi.common.table.log.block.HoodieCommandBlock) ClosableIterator(org.apache.hudi.common.util.ClosableIterator) Deque(java.util.Deque) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieDataBlock(org.apache.hudi.common.table.log.block.HoodieDataBlock) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieParquetDataBlock(org.apache.hudi.common.table.log.block.HoodieParquetDataBlock) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) CORRUPT_BLOCK(org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType.CORRUPT_BLOCK) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) COMMAND_BLOCK(org.apache.hudi.common.table.log.block.HoodieLogBlock.HoodieLogBlockType.COMMAND_BLOCK) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) INSTANT_TIME(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType.INSTANT_TIME) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) ArrayDeque(java.util.ArrayDeque) HoodieDeleteBlock(org.apache.hudi.common.table.log.block.HoodieDeleteBlock) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) SpillableMapUtils(org.apache.hudi.common.util.SpillableMapUtils) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair)

Example 47 with HoodieTimeline

Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class TableSchemaResolver, method getLatestCommitMetadata().

/**
 * Get the last commit's metadata.
 */
public Option<HoodieCommitMetadata> getLatestCommitMetadata() {
    try {
        HoodieTimeline timeline = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
        Option<HoodieInstant> lastInstant = timeline.lastInstant();
        if (lastInstant.isPresent()) {
            HoodieInstant instant = lastInstant.get();
            byte[] data = timeline.getInstantDetails(instant).get();
            return Option.of(HoodieCommitMetadata.fromBytes(data, HoodieCommitMetadata.class));
        } else {
            return Option.empty();
        }
    } catch (Exception e) {
        throw new HoodieException("Failed to get commit metadata", e);
    }
}
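
A hedged usage sketch for the method above: unwrap the Option and read fields off the commit metadata. The SCHEMA_KEY lookup is illustrative (the writer schema is conventionally stored in the commit's extra metadata), and metaClient and LOG are assumed to exist in the caller:

TableSchemaResolver resolver = new TableSchemaResolver(metaClient);
Option<HoodieCommitMetadata> latestCommitMetadata = resolver.getLatestCommitMetadata();
if (latestCommitMetadata.isPresent()) {
    // the extra metadata of a commit carries, among other things, the writer schema
    String schemaStr = latestCommitMetadata.get().getMetadata(HoodieCommitMetadata.SCHEMA_KEY);
    LOG.info("Latest commit wrote with schema: " + schemaStr);
}
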
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) InvalidTableException(org.apache.hudi.exception.InvalidTableException)

Example 48 with HoodieTimeline

Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class HoodieTableMetadataUtil, method getFileSystemView().

/**
 * Get metadata table file system view.
 *
 * @param metaClient - Metadata table meta client
 * @return Filesystem view for the metadata table
 */
public static HoodieTableFileSystemView getFileSystemView(HoodieTableMetaClient metaClient) {
    // If there are no commits on the metadata table then the table's
    // default FileSystemView will not return any file slices even
    // though we may have initialized them.
    HoodieTimeline timeline = metaClient.getActiveTimeline();
    if (timeline.empty()) {
        final HoodieInstant instant = new HoodieInstant(false, HoodieTimeline.DELTA_COMMIT_ACTION, HoodieActiveTimeline.createNewInstantTime());
        timeline = new HoodieDefaultTimeline(Stream.of(instant), metaClient.getActiveTimeline()::getInstantDetails);
    }
    return new HoodieTableFileSystemView(metaClient, timeline);
}
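
A short, hedged follow-up showing how the returned view might be consumed; metadataMetaClient is assumed to exist, and "files" is the conventional partition of the metadata table's files index:

HoodieTableFileSystemView fsView = HoodieTableMetadataUtil.getFileSystemView(metadataMetaClient);
// Stream the latest file slices of the metadata table's "files" partition;
// the synthetic delta-commit instant above makes this work even when no
// commit has completed yet
fsView.getLatestFileSlices("files")
        .forEach(slice -> System.out.println("Found file slice: " + slice));
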
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieDefaultTimeline(org.apache.hudi.common.table.timeline.HoodieDefaultTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView)

Example 49 with HoodieTimeline

Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class ITTestHoodieDataSource, method testStreamWriteWithCleaning().

@Test
void testStreamWriteWithCleaning() {
    // create a filesystem table named source;
    // the source generates 4 commits, while the cleaning task
    // always tries to keep only 1 retained commit
    String createSource = TestConfigurations.getFileSourceDDL("source", "test_source_3.data", 4);
    streamTableEnv.executeSql(createSource);
    String hoodieTableDDL = sql("t1").option(FlinkOptions.PATH, tempFile.getAbsolutePath()).option(FlinkOptions.CLEAN_RETAIN_COMMITS, 1).end();
    streamTableEnv.executeSql(hoodieTableDDL);
    String insertInto = "insert into t1 select * from source";
    execInsertSql(streamTableEnv, insertInto);
    Configuration defaultConf = TestConfigurations.getDefaultConf(tempFile.getAbsolutePath());
    Map<String, String> options1 = new HashMap<>(defaultConf.toMap());
    options1.put(FlinkOptions.TABLE_NAME.key(), "t1");
    Configuration conf = Configuration.fromMap(options1);
    HoodieTimeline timeline = StreamerUtil.createMetaClient(conf).getActiveTimeline();
    assertTrue(timeline.filterCompletedInstants().getInstants().anyMatch(instant -> instant.getAction().equals("clean")), "some commits should be cleaned");
}
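
As a hedged variant of the final assertion, the action name can be matched against the HoodieTimeline.CLEAN_ACTION constant rather than the "clean" string literal:

long cleanCount = timeline.filterCompletedInstants().getInstants()
        .filter(instant -> instant.getAction().equals(HoodieTimeline.CLEAN_ACTION))
        .count();
assertTrue(cleanCount > 0, "some commits should be cleaned");
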
Also used : Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) BeforeEach(org.junit.jupiter.api.BeforeEach) StreamTableEnvironment(org.apache.flink.table.api.bridge.java.StreamTableEnvironment) TestConfigurations(org.apache.hudi.utils.TestConfigurations) DefaultHoodieRecordPayload(org.apache.hudi.common.model.DefaultHoodieRecordPayload) EnumSource(org.junit.jupiter.params.provider.EnumSource) HashMap(java.util.HashMap) TestConfigurations.catalog(org.apache.hudi.utils.TestConfigurations.catalog) ObjectPath(org.apache.flink.table.catalog.ObjectPath) TableEnvironmentImpl(org.apache.flink.table.api.internal.TableEnvironmentImpl) TestData.assertRowsEquals(org.apache.hudi.utils.TestData.assertRowsEquals) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) TestUtils(org.apache.hudi.utils.TestUtils) Map(java.util.Map) StreamerUtil(org.apache.hudi.util.StreamerUtil) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) AbstractTestBase(org.apache.flink.test.util.AbstractTestBase) MethodSource(org.junit.jupiter.params.provider.MethodSource) ValueSource(org.junit.jupiter.params.provider.ValueSource) CollectSinkTableFactory(org.apache.hudi.utils.factory.CollectSinkTableFactory) TableEnvironment(org.apache.flink.table.api.TableEnvironment) TableNotExistException(org.apache.flink.table.catalog.exceptions.TableNotExistException) Collection(java.util.Collection) Configuration(org.apache.flink.configuration.Configuration) TableSchema(org.apache.flink.table.api.TableSchema) TestData(org.apache.hudi.utils.TestData) CollectionUtil(org.apache.flink.util.CollectionUtil) Arguments(org.junit.jupiter.params.provider.Arguments) Collectors(java.util.stream.Collectors) JobClient(org.apache.flink.core.execution.JobClient) File(java.io.File) Test(org.junit.jupiter.api.Test) Objects(java.util.Objects) ExecutionException(java.util.concurrent.ExecutionException) TimeUnit(java.util.concurrent.TimeUnit) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) List(java.util.List) Stream(java.util.stream.Stream) TempDir(org.junit.jupiter.api.io.TempDir) TestConfigurations.sql(org.apache.hudi.utils.TestConfigurations.sql) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) EnvironmentSettings(org.apache.flink.table.api.EnvironmentSettings) TableResult(org.apache.flink.table.api.TableResult) TestSQL(org.apache.hudi.utils.TestSQL) Row(org.apache.flink.types.Row) ExecutionConfigOptions(org.apache.flink.table.api.config.ExecutionConfigOptions) FlinkOptions(org.apache.hudi.configuration.FlinkOptions) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)

Example 50 with HoodieTimeline

Use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

From the class HoodieCopyOnWriteTableInputFormat, method listStatusForIncrementalMode().

/**
 * Achieves listStatus functionality for an incrementally queried table. Instead of listing all
 * partitions and then filtering based on the commits of interest, this logic first extracts the
 * partitions touched by the desired commits and then lists only those partitions.
 */
protected List<FileStatus> listStatusForIncrementalMode(JobConf job, HoodieTableMetaClient tableMetaClient, List<Path> inputPaths, String incrementalTable) throws IOException {
    Job jobContext = Job.getInstance(job);
    Option<HoodieTimeline> timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient);
    if (!timeline.isPresent()) {
        return null;
    }
    Option<List<HoodieInstant>> commitsToCheck = HoodieInputFormatUtils.getCommitsForIncrementalQuery(jobContext, incrementalTable, timeline.get());
    if (!commitsToCheck.isPresent()) {
        return null;
    }
    Option<String> incrementalInputPaths = HoodieInputFormatUtils.getAffectedPartitions(commitsToCheck.get(), tableMetaClient, timeline.get(), inputPaths);
    // Mutate the JobConf to set the input paths to only partitions touched by incremental pull.
    if (!incrementalInputPaths.isPresent()) {
        return null;
    }
    setInputPaths(job, incrementalInputPaths.get());
    FileStatus[] fileStatuses = doListStatus(job);
    return HoodieInputFormatUtils.filterIncrementalFileStatus(jobContext, tableMetaClient, timeline.get(), fileStatuses, commitsToCheck.get());
}
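
For context, a hedged sketch of the JobConf settings that put a table named t1 into incremental query mode; the hoodie.<table>.consume.* property names follow the convention read by Hudi's Hive/MapReduce input formats and should be verified against your Hudi version:

JobConf job = new JobConf();
// query mode: INCREMENTAL instead of the default snapshot read
job.set("hoodie.t1.consume.mode", "INCREMENTAL");
// only commits after this instant are considered (assumed example instant)
job.set("hoodie.t1.consume.start.timestamp", "20220101000000");
// cap on the number of commits pulled by one incremental query
job.setInt("hoodie.t1.consume.max.commits", 3);
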
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ArrayList(java.util.ArrayList) List(java.util.List) Job(org.apache.hadoop.mapreduce.Job)

Aggregations

HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 118
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 74
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 39
List (java.util.List) 36
IOException (java.io.IOException) 34
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata) 34
ArrayList (java.util.ArrayList) 32
Option (org.apache.hudi.common.util.Option) 30
Collectors (java.util.stream.Collectors) 29
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline) 29
HoodieException (org.apache.hudi.exception.HoodieException) 26
Map (java.util.Map) 25
FileStatus (org.apache.hadoop.fs.FileStatus) 24
Path (org.apache.hadoop.fs.Path) 24
Set (java.util.Set) 22
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile) 22
FileSlice (org.apache.hudi.common.model.FileSlice) 21
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile) 21
Pair (org.apache.hudi.common.util.collection.Pair) 21
FSUtils (org.apache.hudi.common.fs.FSUtils) 20