
Example 21 with HoodieCommitMetadata

use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

the class CommitUtils method buildMetadataFromStats.

private static HoodieCommitMetadata buildMetadataFromStats(List<HoodieWriteStat> writeStats, Map<String, List<String>> partitionToReplaceFileIds, String commitActionType, WriteOperationType operationType) {
    final HoodieCommitMetadata commitMetadata;
    if (commitActionType.equals(HoodieTimeline.REPLACE_COMMIT_ACTION)) {
        HoodieReplaceCommitMetadata replaceMetadata = new HoodieReplaceCommitMetadata();
        replaceMetadata.setPartitionToReplaceFileIds(partitionToReplaceFileIds);
        commitMetadata = replaceMetadata;
    } else {
        commitMetadata = new HoodieCommitMetadata();
    }
    for (HoodieWriteStat writeStat : writeStats) {
        String partition = writeStat.getPartitionPath();
        commitMetadata.addWriteStat(partition, writeStat);
    }
    LOG.info("Creating  metadata for " + operationType + " numWriteStats:" + writeStats.size() + "numReplaceFileIds:" + partitionToReplaceFileIds.values().stream().mapToInt(e -> e.size()).sum());
    return commitMetadata;
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Schema(org.apache.avro.Schema) HoodieException(org.apache.hudi.exception.HoodieException) HashMap(java.util.HashMap) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) List(java.util.List) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Map(java.util.Map) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) LogManager(org.apache.log4j.LogManager) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline)
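
For context, a minimal sketch of assembling the same kind of replace-commit metadata directly; the partition path, file ids, and stat values are hypothetical, and only the calls shown in the example above (plus HoodieWriteStat's standard setPartitionPath/setFileId setters, assumed here) are relied on.

// Hypothetical values, for illustration only.
HoodieWriteStat writeStat = new HoodieWriteStat();
writeStat.setPartitionPath("2022/01/01");
writeStat.setFileId("file-group-1");

HoodieReplaceCommitMetadata replaceMetadata = new HoodieReplaceCommitMetadata();
// File groups replaced in the partition (hypothetical ids).
replaceMetadata.setPartitionToReplaceFileIds(
    Collections.singletonMap("2022/01/01", Collections.singletonList("file-group-0")));
replaceMetadata.addWriteStat(writeStat.getPartitionPath(), writeStat);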

Example 22 with HoodieCommitMetadata

use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

the class IncrementalInputSplits method inputSplits.

/**
 * Returns the incremental input splits.
 *
 * @param metaClient    The meta client
 * @param hadoopConf    The hadoop configuration
 * @param issuedInstant The last issued instant, only valid in streaming read
 * @return The list of incremental input splits or empty if there are no new instants
 */
public Result inputSplits(HoodieTableMetaClient metaClient, org.apache.hadoop.conf.Configuration hadoopConf, String issuedInstant) {
    metaClient.reloadActiveTimeline();
    HoodieTimeline commitTimeline = metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants();
    if (commitTimeline.empty()) {
        LOG.warn("No splits found for the table under path " + path);
        return Result.EMPTY;
    }
    List<HoodieInstant> instants = filterInstantsWithRange(commitTimeline, issuedInstant);
    // get the latest instant that satisfies condition
    final HoodieInstant instantToIssue = instants.size() == 0 ? null : instants.get(instants.size() - 1);
    final InstantRange instantRange;
    if (instantToIssue != null) {
        if (issuedInstant != null) {
            // the streaming reader may record the last issued instant, if the issued instant is present,
            // the instant range should be: (issued instant, the latest instant].
            instantRange = InstantRange.getInstance(issuedInstant, instantToIssue.getTimestamp(), InstantRange.RangeType.OPEN_CLOSE);
        } else if (this.conf.getOptional(FlinkOptions.READ_START_COMMIT).isPresent()) {
            // first time consume and has a start commit
            final String startCommit = this.conf.getString(FlinkOptions.READ_START_COMMIT);
            instantRange = startCommit.equalsIgnoreCase(FlinkOptions.START_COMMIT_EARLIEST) ? null : InstantRange.getInstance(startCommit, instantToIssue.getTimestamp(), InstantRange.RangeType.CLOSE_CLOSE);
        } else {
            // first time consume and no start commit, consumes the latest incremental data set.
            instantRange = InstantRange.getInstance(instantToIssue.getTimestamp(), instantToIssue.getTimestamp(), InstantRange.RangeType.CLOSE_CLOSE);
        }
    } else {
        LOG.info("No new instant found for the table under path " + path + ", skip reading");
        return Result.EMPTY;
    }
    String tableName = conf.getString(FlinkOptions.TABLE_NAME);
    Set<String> writePartitions;
    final FileStatus[] fileStatuses;
    if (instantRange == null) {
        // reading from the earliest, scans the partitions and files directly.
        FileIndex fileIndex = FileIndex.instance(new org.apache.hadoop.fs.Path(path.toUri()), conf);
        if (this.requiredPartitions != null) {
            // apply partition push down
            fileIndex.setPartitionPaths(this.requiredPartitions);
        }
        writePartitions = new HashSet<>(fileIndex.getOrBuildPartitionPaths());
        if (writePartitions.size() == 0) {
            LOG.warn("No partitions found for reading in user provided path.");
            return Result.EMPTY;
        }
        fileStatuses = fileIndex.getFilesInPartitions();
    } else {
        List<HoodieCommitMetadata> activeMetadataList = instants.stream().map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, commitTimeline)).collect(Collectors.toList());
        List<HoodieCommitMetadata> archivedMetadataList = getArchivedMetadata(metaClient, instantRange, commitTimeline, tableName);
        if (archivedMetadataList.size() > 0) {
            LOG.warn("\n" + "--------------------------------------------------------------------------------\n" + "---------- caution: the reader has fall behind too much from the writer,\n" + "---------- tweak 'read.tasks' option to add parallelism of read tasks.\n" + "--------------------------------------------------------------------------------");
        }
        List<HoodieCommitMetadata> metadataList = archivedMetadataList.size() > 0 ? // IMPORTANT: the merged metadata list must be in ascending order by instant time
        mergeList(archivedMetadataList, activeMetadataList) : activeMetadataList;
        writePartitions = HoodieInputFormatUtils.getWritePartitionPaths(metadataList);
        // apply partition push down
        if (this.requiredPartitions != null) {
            writePartitions = writePartitions.stream().filter(this.requiredPartitions::contains).collect(Collectors.toSet());
        }
        if (writePartitions.size() == 0) {
            LOG.warn("No partitions found for reading in user provided path.");
            return Result.EMPTY;
        }
        fileStatuses = WriteProfiles.getWritePathsOfInstants(path, hadoopConf, metadataList, metaClient.getTableType());
    }
    if (fileStatuses.length == 0) {
        LOG.warn("No files found for reading in user provided path.");
        return Result.EMPTY;
    }
    HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, commitTimeline, fileStatuses);
    final String endInstant = instantToIssue.getTimestamp();
    final AtomicInteger cnt = new AtomicInteger(0);
    final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE);
    List<MergeOnReadInputSplit> inputSplits = writePartitions.stream().map(relPartitionPath -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, endInstant).map(fileSlice -> {
        Option<List<String>> logPaths = Option.ofNullable(fileSlice.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).map(logFile -> logFile.getPath().toString()).collect(Collectors.toList()));
        String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null);
        return new MergeOnReadInputSplit(cnt.getAndAdd(1), basePath, logPaths, endInstant, metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, instantRange);
    }).collect(Collectors.toList())).flatMap(Collection::stream).collect(Collectors.toList());
    return Result.instance(inputSplits, endInstant);
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieInputFormatUtils(org.apache.hudi.hadoop.utils.HoodieInputFormatUtils) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) Serializable(scala.Serializable) LoggerFactory(org.slf4j.LoggerFactory) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BaseFile(org.apache.hudi.common.model.BaseFile) Path(org.apache.flink.core.fs.Path) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Nullable(javax.annotation.Nullable) Logger(org.slf4j.Logger) Collection(java.util.Collection) Configuration(org.apache.flink.configuration.Configuration) Set(java.util.Set) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) LESSER_THAN_OR_EQUALS(org.apache.hudi.common.table.timeline.HoodieTimeline.LESSER_THAN_OR_EQUALS) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) Objects(java.util.Objects) WriteProfiles(org.apache.hudi.sink.partitioner.profile.WriteProfiles) List(java.util.List) GREATER_THAN_OR_EQUALS(org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN_OR_EQUALS) Stream(java.util.stream.Stream) InstantRange(org.apache.hudi.common.table.log.InstantRange) MergeOnReadInputSplit(org.apache.hudi.table.format.mor.MergeOnReadInputSplit) GREATER_THAN(org.apache.hudi.common.table.timeline.HoodieTimeline.GREATER_THAN) Collections(java.util.Collections) FlinkOptions(org.apache.hudi.configuration.FlinkOptions)
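
The comment in the method stresses that the merged metadata list must stay in ascending order by instant time. The mergeList helper itself is not shown here; the sketch below is one plausible shape for it, assuming every archived instant is strictly older than every active one, and is not the project's actual implementation.

// Sketch only: simple concatenation keeps the list ascending by instant time
// under the assumption that archived instants always precede the active ones.
private static List<HoodieCommitMetadata> mergeList(List<HoodieCommitMetadata> archivedMetadataList,
                                                    List<HoodieCommitMetadata> activeMetadataList) {
    List<HoodieCommitMetadata> merged = new ArrayList<>(archivedMetadataList);
    merged.addAll(activeMetadataList);
    return merged;
}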

Example 23 with HoodieCommitMetadata

use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

the class CompactionCommitSink method doCommit.

@SuppressWarnings("unchecked")
private void doCommit(String instant, Collection<CompactionCommitEvent> events) throws IOException {
    List<WriteStatus> statuses = events.stream().map(CompactionCommitEvent::getWriteStatuses).flatMap(Collection::stream).collect(Collectors.toList());
    HoodieCommitMetadata metadata = CompactHelpers.getInstance().createCompactionMetadata(table, instant, HoodieList.of(statuses), writeClient.getConfig().getSchema());
    // commit the compaction
    this.writeClient.commitCompaction(instant, metadata, Option.empty());
    // If async cleaning is disabled, clean up the old log files inline after the compaction
    if (!conf.getBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED)) {
        this.writeClient.clean();
    }
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) WriteStatus(org.apache.hudi.client.WriteStatus)
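
The inline clean at the end of doCommit only runs when asynchronous cleaning is disabled. A small illustrative sketch of toggling that switch on the Flink job configuration; the option constant comes from the snippet above, the rest is an assumption for illustration.

// Illustrative: disable async cleaning so the sink cleans inline after each
// compaction commit, matching the branch taken in doCommit above.
Configuration conf = new Configuration();
conf.setBoolean(FlinkOptions.CLEAN_ASYNC_ENABLED, false);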

Example 24 with HoodieCommitMetadata

use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

the class HiveTestUtil method createMORTable.

public static void createMORTable(String commitTime, String deltaCommitTime, int numberOfPartitions, boolean createDeltaCommit, boolean useSchemaFromCommitMetadata) throws IOException, URISyntaxException, InterruptedException {
    Path path = new Path(hiveSyncConfig.basePath);
    FileIOUtils.deleteDirectory(new File(hiveSyncConfig.basePath));
    HoodieTableMetaClient.withPropertyBuilder().setTableType(HoodieTableType.MERGE_ON_READ).setTableName(hiveSyncConfig.tableName).setPayloadClass(HoodieAvroPayload.class).initTable(configuration, hiveSyncConfig.basePath);
    boolean result = fileSystem.mkdirs(path);
    checkResult(result);
    ZonedDateTime dateTime = ZonedDateTime.now();
    HoodieCommitMetadata commitMetadata = createPartitions(numberOfPartitions, true, useSchemaFromCommitMetadata, dateTime, commitTime, hiveSyncConfig.basePath);
    createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_READ_OPTIMIZED_TABLE);
    createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName + HiveSyncTool.SUFFIX_SNAPSHOT_TABLE);
    HoodieCommitMetadata compactionMetadata = new HoodieCommitMetadata();
    commitMetadata.getPartitionToWriteStats().forEach((key, value) -> value.forEach(l -> compactionMetadata.addWriteStat(key, l)));
    addSchemaToCommitMetadata(compactionMetadata, commitMetadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY), useSchemaFromCommitMetadata);
    createCompactionCommitFile(compactionMetadata, commitTime);
    if (createDeltaCommit) {
        // Write a delta commit
        HoodieCommitMetadata deltaMetadata = createLogFiles(commitMetadata.getPartitionToWriteStats(), true, useSchemaFromCommitMetadata);
        createDeltaCommitFile(deltaMetadata, deltaCommitTime);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) FileSystem(org.apache.hadoop.fs.FileSystem) URISyntaxException(java.net.URISyntaxException) ZonedDateTime(java.time.ZonedDateTime) HiveSyncTool(org.apache.hudi.hive.HiveSyncTool) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) SchemaTestUtil(org.apache.hudi.common.testutils.SchemaTestUtil) Path(org.apache.hadoop.fs.Path) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) Schema(org.apache.avro.Schema) BloomFilterFactory(org.apache.hudi.common.bloom.BloomFilterFactory) JUnitException(org.junit.platform.commons.JUnitException) Set(java.util.Set) UUID(java.util.UUID) Instant(java.time.Instant) StandardCharsets(java.nio.charset.StandardCharsets) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) ParquetWriter(org.apache.parquet.hadoop.ParquetWriter) List(java.util.List) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Entry(java.util.Map.Entry) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) CompressionCodecName(org.apache.parquet.hadoop.metadata.CompressionCodecName) HoodieLogBlock(org.apache.hudi.common.table.log.block.HoodieLogBlock) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) HoodieAvroPayload(org.apache.hudi.common.model.HoodieAvroPayload) Assertions.fail(org.junit.jupiter.api.Assertions.fail) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) HeaderMetadataType(org.apache.hudi.common.table.log.block.HoodieLogBlock.HeaderMetadataType) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Writer(org.apache.hudi.common.table.log.HoodieLogFormat.Writer) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HiveQueryDDLExecutor(org.apache.hudi.hive.ddl.HiveQueryDDLExecutor) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Files(java.nio.file.Files) BloomFilterTypeCode(org.apache.hudi.common.bloom.BloomFilterTypeCode) HiveSyncConfig(org.apache.hudi.hive.HiveSyncConfig) HiveConf(org.apache.hadoop.hive.conf.HiveConf) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) ZookeeperTestService(org.apache.hudi.common.testutils.minicluster.ZookeeperTestService) File(java.io.File) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) ZooKeeperServer(org.apache.zookeeper.server.ZooKeeperServer) ChronoUnit(java.time.temporal.ChronoUnit) HoodieAvroDataBlock(org.apache.hudi.common.table.log.block.HoodieAvroDataBlock) HoodieDeltaWriteStat(org.apache.hudi.common.model.HoodieDeltaWriteStat) DateTimeFormatter(java.time.format.DateTimeFormatter) HoodieAvroWriteSupport(org.apache.hudi.avro.HoodieAvroWriteSupport) QueryBasedDDLExecutor(org.apache.hudi.hive.ddl.QueryBasedDDLExecutor) HiveServer2(org.apache.hive.service.server.HiveServer2) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) 
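
The same withPropertyBuilder() chain can initialize a copy-on-write test table; the sketch below reuses only the builder calls from createMORTable, with a hypothetical table name and base path.

// Sketch: initialize a COPY_ON_WRITE table with the builder calls used above.
// The table name and base path are hypothetical.
HoodieTableMetaClient.withPropertyBuilder()
    .setTableType(HoodieTableType.COPY_ON_WRITE)
    .setTableName("hive_sync_test_cow")
    .setPayloadClass(HoodieAvroPayload.class)
    .initTable(configuration, "/tmp/hive_sync_test_cow");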

Example 25 with HoodieCommitMetadata

use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.

the class HiveTestUtil method addCOWPartition.

public static void addCOWPartition(String partitionPath, boolean isParquetSchemaSimple, boolean useSchemaFromCommitMetadata, String instantTime) throws IOException, URISyntaxException {
    HoodieCommitMetadata commitMetadata = createPartition(partitionPath, isParquetSchemaSimple, useSchemaFromCommitMetadata, instantTime);
    createdTablesSet.add(hiveSyncConfig.databaseName + "." + hiveSyncConfig.tableName);
    createCommitFile(commitMetadata, instantTime, hiveSyncConfig.basePath);
}
Also used : HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata)
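
A brief usage sketch of the helper, with purely hypothetical arguments:

// Hypothetical: add one new COW partition, using a simple Parquet schema and
// taking the table schema from the commit metadata.
HiveTestUtil.addCOWPartition("2022/01/02", true, true, "102");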

Aggregations

HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 139 usages
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 64 usages
ArrayList (java.util.ArrayList): 54 usages
HashMap (java.util.HashMap): 49 usages
List (java.util.List): 48 usages
HoodieWriteStat (org.apache.hudi.common.model.HoodieWriteStat): 44 usages
IOException (java.io.IOException): 42 usages
Test (org.junit.jupiter.api.Test): 41 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 40 usages
Map (java.util.Map): 38 usages
Path (org.apache.hadoop.fs.Path): 36 usages
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline): 34 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 34 usages
File (java.io.File): 26 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 26 usages
Option (org.apache.hudi.common.util.Option): 25 usages
Schema (org.apache.avro.Schema): 22 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 21 usages
Collectors (java.util.stream.Collectors): 20 usages
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 20 usages