Search in sources :

Example 11 with HoodieFileStatus

use of org.apache.hudi.avro.model.HoodieFileStatus in project hudi by apache.

From the class SparkBootstrapCommitActionExecutor, the method runMetadataBootstrap:

/**
 * Runs the metadata bootstrap for every source file of the given partitions.
 *
 * <p>The (partition, files) pairs are flattened into one entry per file, each carrying the source
 * partition path together with its translated path. The entries are then distributed through the
 * engine context and each file is passed to the handler returned by {@code getMetadataHandler}.
 *
 * @param partitions pairs of partition path and the files listed under it; may be null or empty
 * @return one bootstrap write status per file, or empty data when there is nothing to bootstrap
 * @throws HoodieKeyGeneratorException if creating the key generator fails with an {@link IOException}
 */
private HoodieData<BootstrapWriteStatus> runMetadataBootstrap(List<Pair<String, List<HoodieFileStatus>>> partitions) {
    if (partitions == null || partitions.isEmpty()) {
        return context.emptyHoodieData();
    }

    // Work on a copy of the write config properties; it is shared by the key generator and the translator.
    TypedProperties props = new TypedProperties();
    props.putAll(config.getProps());

    KeyGeneratorInterface keyGenerator;
    try {
        keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(props);
    } catch (IOException ioe) {
        throw new HoodieKeyGeneratorException("Init keyGenerator failed ", ioe);
    }

    // The translator implementation is configurable and therefore loaded reflectively.
    BootstrapPartitionPathTranslator pathTranslator = (BootstrapPartitionPathTranslator) ReflectionUtils.loadClass(
        config.getBootstrapPartitionPathTranslatorClass(), props);

    // Flatten to (source partition, (translated partition, file)) so each file becomes its own work item.
    List<Pair<String, Pair<String, HoodieFileStatus>>> filesToBootstrap = partitions.stream()
        .flatMap(partition -> {
            String sourcePartition = partition.getKey();
            String translatedPartition = pathTranslator.getBootstrapTranslatedPath(sourcePartition);
            return partition.getValue().stream()
                .map(file -> Pair.of(sourcePartition, Pair.of(translatedPartition, file)));
        })
        .collect(Collectors.toList());

    context.setJobStatus(this.getClass().getSimpleName(), "Bootstrap metadata table.");
    return context.parallelize(filesToBootstrap, config.getBootstrapParallelism())
        .map(entry -> {
            Pair<String, HoodieFileStatus> translatedPathAndFile = entry.getRight();
            return getMetadataHandler(config, table, translatedPathAndFile.getRight())
                .runMetadataBootstrap(entry.getLeft(), translatedPathAndFile.getLeft(), keyGenerator);
        });
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) BootstrapMode(org.apache.hudi.client.bootstrap.BootstrapMode) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) BaseCommitActionExecutor(org.apache.hudi.table.action.commit.BaseCommitActionExecutor) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) KeyGeneratorInterface(org.apache.hudi.keygen.KeyGeneratorInterface) Logger(org.apache.log4j.Logger) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Duration(java.time.Duration) Map(java.util.Map) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) BootstrapPartitionPathTranslator(org.apache.hudi.client.bootstrap.translator.BootstrapPartitionPathTranslator) Collection(java.util.Collection) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) List(java.util.List) WRITE_STATUS_STORAGE_LEVEL_VALUE(org.apache.hudi.config.HoodieWriteConfig.WRITE_STATUS_STORAGE_LEVEL_VALUE) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) BootstrapWriteStatus(org.apache.hudi.client.bootstrap.BootstrapWriteStatus) SparkBulkInsertCommitActionExecutor(org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) FullRecordBootstrapDataProvider(org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider) MetadataBootstrapHandlerFactory.getMetadataHandler(org.apache.hudi.table.action.bootstrap.MetadataBootstrapHandlerFactory.getMetadataHandler) 
HoodieBootstrapSchemaProvider(org.apache.hudi.client.bootstrap.HoodieBootstrapSchemaProvider) Option(org.apache.hudi.common.util.Option) HoodieCommitException(org.apache.hudi.exception.HoodieCommitException) HoodieKeyGeneratorException(org.apache.hudi.exception.HoodieKeyGeneratorException) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) BaseSparkCommitActionExecutor(org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) BootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector) HoodieData(org.apache.hudi.common.data.HoodieData) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) WriteMarkersFactory(org.apache.hudi.table.marker.WriteMarkersFactory) Iterator(java.util.Iterator) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieSparkBootstrapSchemaProvider(org.apache.hudi.client.bootstrap.HoodieSparkBootstrapSchemaProvider) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) SparkValidatorUtils(org.apache.hudi.client.utils.SparkValidatorUtils) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) 
KeyGeneratorInterface(org.apache.hudi.keygen.KeyGeneratorInterface) HoodieKeyGeneratorException(org.apache.hudi.exception.HoodieKeyGeneratorException) BootstrapPartitionPathTranslator(org.apache.hudi.client.bootstrap.translator.BootstrapPartitionPathTranslator) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) TypedProperties(org.apache.hudi.common.config.TypedProperties) Pair(org.apache.hudi.common.util.collection.Pair)

Example 12 with HoodieFileStatus

use of org.apache.hudi.avro.model.HoodieFileStatus in project hudi by apache.

From the class SparkBootstrapCommitActionExecutor, the method listAndProcessSourcePartitions:

/**
 * Lists the leaf folders of the bootstrap source, resolves the bootstrap schema and groups the
 * listed partitions by the bootstrap mode chosen for them.
 *
 * <p>Side effect: stores the string form of the resolved schema in {@code bootstrapSchema}.
 *
 * @return for each selected bootstrap mode, the partitions assigned to it paired with their files
 * @throws IOException declared by this method; presumably raised while listing the source file
 *         system (the throwing call is not identifiable from this method alone)
 */
private Map<BootstrapMode, List<Pair<String, List<HoodieFileStatus>>>> listAndProcessSourcePartitions() throws IOException {
    List<Pair<String, List<HoodieFileStatus>>> leafFolders = BootstrapUtils.getAllLeafFoldersWithFiles(
        table.getMetaClient(), bootstrapSourceFileSystem, config.getBootstrapSourceBasePath(), context);

    LOG.info("Fetching Bootstrap Schema !!");
    HoodieBootstrapSchemaProvider schemaProvider = new HoodieSparkBootstrapSchemaProvider(config);
    bootstrapSchema = schemaProvider.getBootstrapSchema(context, leafFolders).toString();
    LOG.info("Bootstrap Schema :" + bootstrapSchema);

    // The mode selector implementation is configurable and therefore loaded reflectively.
    BootstrapModeSelector modeSelector = (BootstrapModeSelector) ReflectionUtils.loadClass(
        config.getBootstrapModeSelectorClass(), config);
    Map<BootstrapMode, List<String>> partitionsByMode = modeSelector.select(leafFolders);

    Map<String, List<HoodieFileStatus>> filesByPartition = leafFolders.stream()
        .collect(Collectors.toMap(Pair::getKey, Pair::getValue));

    // Ensure all partitions are accounted for: the selector must return exactly the listed partitions.
    ValidationUtils.checkArgument(filesByPartition.keySet().equals(
        partitionsByMode.values().stream()
            .flatMap(Collection::stream)
            .collect(Collectors.toSet())));

    // Re-attach the file listing to every partition, keeping the grouping by bootstrap mode.
    return partitionsByMode.entrySet().stream()
        .collect(Collectors.toMap(
            Map.Entry::getKey,
            modeEntry -> modeEntry.getValue().stream()
                .map(partition -> Pair.of(partition, filesByPartition.get(partition)))
                .collect(Collectors.toList())));
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) BootstrapMode(org.apache.hudi.client.bootstrap.BootstrapMode) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) BaseCommitActionExecutor(org.apache.hudi.table.action.commit.BaseCommitActionExecutor) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) KeyGeneratorInterface(org.apache.hudi.keygen.KeyGeneratorInterface) Logger(org.apache.log4j.Logger) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Duration(java.time.Duration) Map(java.util.Map) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) BootstrapPartitionPathTranslator(org.apache.hudi.client.bootstrap.translator.BootstrapPartitionPathTranslator) Collection(java.util.Collection) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) List(java.util.List) WRITE_STATUS_STORAGE_LEVEL_VALUE(org.apache.hudi.config.HoodieWriteConfig.WRITE_STATUS_STORAGE_LEVEL_VALUE) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) BootstrapWriteStatus(org.apache.hudi.client.bootstrap.BootstrapWriteStatus) SparkBulkInsertCommitActionExecutor(org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) FullRecordBootstrapDataProvider(org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider) MetadataBootstrapHandlerFactory.getMetadataHandler(org.apache.hudi.table.action.bootstrap.MetadataBootstrapHandlerFactory.getMetadataHandler) 
HoodieBootstrapSchemaProvider(org.apache.hudi.client.bootstrap.HoodieBootstrapSchemaProvider) Option(org.apache.hudi.common.util.Option) HoodieCommitException(org.apache.hudi.exception.HoodieCommitException) HoodieKeyGeneratorException(org.apache.hudi.exception.HoodieKeyGeneratorException) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) BaseSparkCommitActionExecutor(org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) BootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector) HoodieData(org.apache.hudi.common.data.HoodieData) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) WriteMarkersFactory(org.apache.hudi.table.marker.WriteMarkersFactory) Iterator(java.util.Iterator) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieSparkBootstrapSchemaProvider(org.apache.hudi.client.bootstrap.HoodieSparkBootstrapSchemaProvider) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) SparkValidatorUtils(org.apache.hudi.client.utils.SparkValidatorUtils) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) 
BootstrapMode(org.apache.hudi.client.bootstrap.BootstrapMode) HoodieSparkBootstrapSchemaProvider(org.apache.hudi.client.bootstrap.HoodieSparkBootstrapSchemaProvider) BootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector) List(java.util.List) Pair(org.apache.hudi.common.util.collection.Pair) HoodieBootstrapSchemaProvider(org.apache.hudi.client.bootstrap.HoodieBootstrapSchemaProvider)

Example 13 with HoodieFileStatus

use of org.apache.hudi.avro.model.HoodieFileStatus in project hudi by apache.

From the class SparkBootstrapCommitActionExecutor, the method metadataBootstrap:

/**
 * Performs the metadata bootstrap for the given partitions.
 *
 * <p>Creates the requested instant at {@code HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS}, moves it
 * to inflight, runs the per-file metadata bootstrap and hands the resulting write statuses to
 * {@code updateIndexAndCommitIfNeeded}.
 *
 * @param partitionFilesList partitions paired with the files inside each partition; may be null or empty
 * @return the write metadata populated by {@code updateIndexAndCommitIfNeeded}, or an empty option when
 *         no partitions were supplied
 */
protected Option<HoodieWriteMetadata<HoodieData<WriteStatus>>> metadataBootstrap(List<Pair<String, List<HoodieFileStatus>>> partitionFilesList) {
    if (partitionFilesList == null || partitionFilesList.isEmpty()) {
        return Option.empty();
    }

    HoodieTableMetaClient metaClient = table.getMetaClient();
    // One requested instant describes both timeline steps: its creation and the move to inflight.
    HoodieInstant requestedInstant = new HoodieInstant(
        State.REQUESTED, metaClient.getCommitActionType(), HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS);
    metaClient.getActiveTimeline().createNewInstant(requestedInstant);
    table.getActiveTimeline().transitionRequestedToInflight(requestedInstant, Option.empty());

    HoodieData<BootstrapWriteStatus> bootstrapStatuses = runMetadataBootstrap(partitionFilesList);
    // The identity map only widens the element type from BootstrapWriteStatus to WriteStatus.
    HoodieData<WriteStatus> writeStatuses = bootstrapStatuses.map(w -> w);

    HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
    updateIndexAndCommitIfNeeded(writeStatuses, writeMetadata);
    return Option.of(writeMetadata);
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieTable(org.apache.hudi.table.HoodieTable) BootstrapMode(org.apache.hudi.client.bootstrap.BootstrapMode) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) BaseCommitActionExecutor(org.apache.hudi.table.action.commit.BaseCommitActionExecutor) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) KeyGeneratorInterface(org.apache.hudi.keygen.KeyGeneratorInterface) Logger(org.apache.log4j.Logger) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Duration(java.time.Duration) Map(java.util.Map) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) BootstrapPartitionPathTranslator(org.apache.hudi.client.bootstrap.translator.BootstrapPartitionPathTranslator) Collection(java.util.Collection) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) List(java.util.List) WRITE_STATUS_STORAGE_LEVEL_VALUE(org.apache.hudi.config.HoodieWriteConfig.WRITE_STATUS_STORAGE_LEVEL_VALUE) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) BootstrapWriteStatus(org.apache.hudi.client.bootstrap.BootstrapWriteStatus) SparkBulkInsertCommitActionExecutor(org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) 
FullRecordBootstrapDataProvider(org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider) MetadataBootstrapHandlerFactory.getMetadataHandler(org.apache.hudi.table.action.bootstrap.MetadataBootstrapHandlerFactory.getMetadataHandler) HoodieBootstrapSchemaProvider(org.apache.hudi.client.bootstrap.HoodieBootstrapSchemaProvider) Option(org.apache.hudi.common.util.Option) HoodieCommitException(org.apache.hudi.exception.HoodieCommitException) HoodieKeyGeneratorException(org.apache.hudi.exception.HoodieKeyGeneratorException) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) BaseSparkCommitActionExecutor(org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) BootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector) HoodieData(org.apache.hudi.common.data.HoodieData) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) WriteMarkersFactory(org.apache.hudi.table.marker.WriteMarkersFactory) Iterator(java.util.Iterator) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieSparkBootstrapSchemaProvider(org.apache.hudi.client.bootstrap.HoodieSparkBootstrapSchemaProvider) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) SparkValidatorUtils(org.apache.hudi.client.utils.SparkValidatorUtils) HoodieKey(org.apache.hudi.common.model.HoodieKey) 
HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) BootstrapWriteStatus(org.apache.hudi.client.bootstrap.BootstrapWriteStatus) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata)

Aggregations

HoodieFileStatus (org.apache.hudi.avro.model.HoodieFileStatus)13 List (java.util.List)12 IOException (java.io.IOException)11 Pair (org.apache.hudi.common.util.collection.Pair)11 Collectors (java.util.stream.Collectors)10 Map (java.util.Map)9 Option (org.apache.hudi.common.util.Option)8 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)8 HoodieIOException (org.apache.hudi.exception.HoodieIOException)8 ArrayList (java.util.ArrayList)7 FullRecordBootstrapDataProvider (org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider)7 HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext)7 TypedProperties (org.apache.hudi.common.config.TypedProperties)7 FSUtils (org.apache.hudi.common.fs.FSUtils)7 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)7 JavaRDD (org.apache.spark.api.java.JavaRDD)7 Instant (java.time.Instant)6 Arrays (java.util.Arrays)6 Iterator (java.util.Iterator)6 Path (org.apache.hadoop.fs.Path)6