Example 26 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

The class SparkBootstrapCommitActionExecutor, method runMetadataBootstrap:

private HoodieData<BootstrapWriteStatus> runMetadataBootstrap(List<Pair<String, List<HoodieFileStatus>>> partitions) {
    if (null == partitions || partitions.isEmpty()) {
        return context.emptyHoodieData();
    }
    TypedProperties properties = new TypedProperties();
    properties.putAll(config.getProps());
    KeyGeneratorInterface keyGenerator;
    try {
        keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(properties);
    } catch (IOException e) {
        throw new HoodieKeyGeneratorException("Init keyGenerator failed ", e);
    }
    BootstrapPartitionPathTranslator translator = (BootstrapPartitionPathTranslator) ReflectionUtils.loadClass(
        config.getBootstrapPartitionPathTranslatorClass(), properties);
    List<Pair<String, Pair<String, HoodieFileStatus>>> bootstrapPaths = partitions.stream().flatMap(p -> {
        String translatedPartitionPath = translator.getBootstrapTranslatedPath(p.getKey());
        return p.getValue().stream().map(f -> Pair.of(p.getKey(), Pair.of(translatedPartitionPath, f)));
    }).collect(Collectors.toList());
    context.setJobStatus(this.getClass().getSimpleName(), "Bootstrap metadata table.");
    return context.parallelize(bootstrapPaths, config.getBootstrapParallelism())
        .map(partitionFsPair -> getMetadataHandler(config, table, partitionFsPair.getRight().getRight())
            .runMetadataBootstrap(partitionFsPair.getLeft(), partitionFsPair.getRight().getLeft(), keyGenerator));
}
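
The interesting shape here is the translate-and-flatten step: the partition path is translated once per partition, then the stream is flattened so each file becomes its own work item for context.parallelize. Below is a minimal, self-contained sketch of just that step. FileTuple and the "bootstrap/" prefix rule are hypothetical stand-ins for the nested Pair structure and a BootstrapPartitionPathTranslator implementation; they are not Hudi code.

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.function.UnaryOperator;
import java.util.stream.Collectors;

public class TranslateAndFlatten {
    // Hypothetical stand-in for Pair<String, Pair<String, HoodieFileStatus>>.
    static final class FileTuple {
        final String sourcePartition;
        final String translatedPartition;
        final String fileName;
        FileTuple(String source, String translated, String file) {
            this.sourcePartition = source;
            this.translatedPartition = translated;
            this.fileName = file;
        }
        @Override
        public String toString() {
            return sourcePartition + " -> (" + translatedPartition + ", " + fileName + ")";
        }
    }

    public static void main(String[] args) {
        // Hypothetical translation rule standing in for a BootstrapPartitionPathTranslator.
        UnaryOperator<String> translator = p -> "bootstrap/" + p;
        Map<String, List<String>> partitions = new LinkedHashMap<>();
        partitions.put("2021/01/01", Arrays.asList("f1.parquet", "f2.parquet"));
        partitions.put("2021/01/02", Arrays.asList("f3.parquet"));
        // Same flatMap shape as runMetadataBootstrap: translate once per partition,
        // then emit one tuple per file so the work can be parallelized per file.
        List<FileTuple> flattened = partitions.entrySet().stream()
            .flatMap(p -> {
                String translated = translator.apply(p.getKey());
                return p.getValue().stream().map(f -> new FileTuple(p.getKey(), translated, f));
            })
            .collect(Collectors.toList());
        flattened.forEach(System.out::println);
    }
}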

Example 27 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

The class SparkBootstrapCommitActionExecutor, method metadataBootstrap:

/**
 * Perform Metadata Bootstrap.
 * @param partitionFilesList List of partitions and the files within those partitions
 */
protected Option<HoodieWriteMetadata<HoodieData<WriteStatus>>> metadataBootstrap(List<Pair<String, List<HoodieFileStatus>>> partitionFilesList) {
    if (null == partitionFilesList || partitionFilesList.isEmpty()) {
        return Option.empty();
    }
    HoodieTableMetaClient metaClient = table.getMetaClient();
    metaClient.getActiveTimeline().createNewInstant(
        new HoodieInstant(State.REQUESTED, metaClient.getCommitActionType(), HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS));
    table.getActiveTimeline().transitionRequestedToInflight(
        new HoodieInstant(State.REQUESTED, metaClient.getCommitActionType(), HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS),
        Option.empty());
    HoodieData<BootstrapWriteStatus> bootstrapWriteStatuses = runMetadataBootstrap(partitionFilesList);
    HoodieWriteMetadata<HoodieData<WriteStatus>> result = new HoodieWriteMetadata<>();
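    // Identity map widens HoodieData<BootstrapWriteStatus> to HoodieData<WriteStatus> for the index update.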
    updateIndexAndCommitIfNeeded(bootstrapWriteStatuses.map(w -> w), result);
    return Option.of(result);
}
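
The method drives Hudi's instant lifecycle: the bootstrap instant is created in REQUESTED state and transitioned to inflight before any write statuses are produced, and the later commit completes it. The toy state machine below sketches that lifecycle; it mirrors the intent of the timeline calls, not HoodieActiveTimeline's implementation, and the transition table is an assumption for illustration.

import java.util.EnumMap;
import java.util.EnumSet;
import java.util.Map;
import java.util.Set;

public class InstantLifecycle {
    enum State { REQUESTED, INFLIGHT, COMPLETED }

    // Assumed legal transitions, mirroring transitionRequestedToInflight and the final commit.
    private static final Map<State, Set<State>> LEGAL = new EnumMap<>(State.class);
    static {
        LEGAL.put(State.REQUESTED, EnumSet.of(State.INFLIGHT));
        LEGAL.put(State.INFLIGHT, EnumSet.of(State.COMPLETED));
        LEGAL.put(State.COMPLETED, EnumSet.noneOf(State.class));
    }

    private State state = State.REQUESTED;

    void transitionTo(State next) {
        if (!LEGAL.get(state).contains(next)) {
            throw new IllegalStateException("Illegal transition " + state + " -> " + next);
        }
        state = next;
    }

    public static void main(String[] args) {
        InstantLifecycle instant = new InstantLifecycle(); // created in REQUESTED state
        instant.transitionTo(State.INFLIGHT);              // like transitionRequestedToInflight
        instant.transitionTo(State.COMPLETED);             // like the eventual commit
        System.out.println("Final state: " + instant.state);
    }
}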

Example 28 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

The class SparkBootstrapCommitActionExecutor, method commit:

@Override
protected void commit(Option<Map<String, String>> extraMetadata, HoodieWriteMetadata<HoodieData<WriteStatus>> result) {
    // Perform the bootstrap index write and then commit, making sure both the record-key
    // and bootstrap-index writes are done in a single job DAG.
    Map<String, List<Pair<BootstrapFileMapping, HoodieWriteStat>>> bootstrapSourceAndStats = result.getWriteStatuses().collectAsList().stream().map(w -> {
        BootstrapWriteStatus ws = (BootstrapWriteStatus) w;
        return Pair.of(ws.getBootstrapSourceFileMapping(), ws.getStat());
    }).collect(Collectors.groupingBy(w -> w.getKey().getPartitionPath()));
    HoodieTableMetaClient metaClient = table.getMetaClient();
    try (BootstrapIndex.IndexWriter indexWriter = BootstrapIndex.getBootstrapIndex(metaClient).createWriter(metaClient.getTableConfig().getBootstrapBasePath().get())) {
        LOG.info("Starting to write bootstrap index for source " + config.getBootstrapSourceBasePath() + " in table " + config.getBasePath());
        indexWriter.begin();
        bootstrapSourceAndStats.forEach((key, value) -> indexWriter.appendNextPartition(key, value.stream().map(Pair::getKey).collect(Collectors.toList())));
        indexWriter.finish();
        LOG.info("Finished writing bootstrap index for source " + config.getBootstrapSourceBasePath() + " in table " + config.getBasePath());
    }
    commit(extraMetadata, result, bootstrapSourceAndStats.values().stream().flatMap(f -> f.stream().map(Pair::getValue)).collect(Collectors.toList()));
    LOG.info("Committing metadata bootstrap !!");
}
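
The grouping step is plain Collectors.groupingBy: write statuses become (fileMapping, stat) pairs keyed by partition path, so the index writer can append one partition at a time. The sketch below reproduces just that shape; the Stat class is a hypothetical stand-in for the Pair<BootstrapFileMapping, HoodieWriteStat> values.

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

public class GroupStatsByPartition {
    // Hypothetical stand-in for the Pair<BootstrapFileMapping, HoodieWriteStat> values.
    static final class Stat {
        final String partitionPath;
        final String fileId;
        Stat(String partitionPath, String fileId) {
            this.partitionPath = partitionPath;
            this.fileId = fileId;
        }
    }

    public static void main(String[] args) {
        List<Stat> stats = Arrays.asList(
            new Stat("2021/01/01", "file-1"),
            new Stat("2021/01/01", "file-2"),
            new Stat("2021/01/02", "file-3"));
        // Same Collectors.groupingBy shape as commit(): key every stat by partition path.
        Map<String, List<Stat>> byPartition = stats.stream()
            .collect(Collectors.groupingBy(s -> s.partitionPath));
        byPartition.forEach((partition, group) ->
            System.out.println(partition + ": " + group.size() + " file stats"));
    }
}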

Example 29 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

The class MultipleSparkJobExecutionStrategy, method readRecordsForGroupBaseFiles:

/**
 * Read records from baseFiles and convert to RDD.
 */
private HoodieData<HoodieRecord<T>> readRecordsForGroupBaseFiles(JavaSparkContext jsc, List<ClusteringOperation> clusteringOps) {
    SerializableConfiguration hadoopConf = new SerializableConfiguration(getHoodieTable().getHadoopConf());
    HoodieWriteConfig writeConfig = getWriteConfig();
    // NOTE: It's crucial to make sure that we don't capture the whole "this" object into the
    //       closure, as this might lead to issues attempting to serialize its nested fields
    return HoodieJavaRDD.of(jsc.parallelize(clusteringOps, clusteringOps.size()).mapPartitions(clusteringOpsPartition -> {
        List<Iterator<IndexedRecord>> iteratorsForPartition = new ArrayList<>();
        clusteringOpsPartition.forEachRemaining(clusteringOp -> {
            try {
                Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(writeConfig.getSchema()));
                HoodieFileReader<IndexedRecord> baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(clusteringOp.getDataFilePath()));
                iteratorsForPartition.add(baseFileReader.getRecordIterator(readerSchema));
            } catch (IOException e) {
                throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + " and " + clusteringOp.getDeltaFilePaths(), e);
            }
        });
        return new ConcatenatingIterator<>(iteratorsForPartition);
    }).map(record -> transform(record, writeConfig)));
}
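
The mapPartitions closure builds one record iterator per base file and chains them with ConcatenatingIterator, so an entire Spark partition is consumed as a single lazy stream without materializing all records. Below is an assumed minimal equivalent of that chaining, illustrating the idea rather than Hudi's actual org.apache.hudi.client.utils.ConcatenatingIterator code.

import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;

public class SimpleConcatenatingIterator<T> implements Iterator<T> {
    private final Iterator<Iterator<T>> sources;
    private Iterator<T> current = Collections.emptyIterator();

    public SimpleConcatenatingIterator(List<Iterator<T>> iterators) {
        this.sources = iterators.iterator();
    }

    @Override
    public boolean hasNext() {
        // Skip exhausted iterators until one has an element, or none remain.
        while (!current.hasNext() && sources.hasNext()) {
            current = sources.next();
        }
        return current.hasNext();
    }

    @Override
    public T next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        return current.next();
    }

    public static void main(String[] args) {
        Iterator<String> all = new SimpleConcatenatingIterator<>(Arrays.asList(
            Arrays.asList("a", "b").iterator(),
            Collections.<String>emptyIterator(),
            Arrays.asList("c").iterator()));
        all.forEachRemaining(System.out::println); // prints a, b, c
    }
}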

Example 30 with HoodieData

Use of org.apache.hudi.common.data.HoodieData in project hudi by apache.

The class MultipleSparkJobExecutionStrategy, method performClustering:

@Override
public HoodieWriteMetadata<HoodieData<WriteStatus>> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) {
    JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(getEngineContext());
    // execute clustering for each group async and collect WriteStatus
    Stream<HoodieData<WriteStatus>> writeStatusesStream = FutureUtils.allOf(
        clusteringPlan.getInputGroups().stream()
            .map(inputGroup -> runClusteringForGroupAsync(inputGroup,
                clusteringPlan.getStrategy().getStrategyParams(),
                Option.ofNullable(clusteringPlan.getPreserveHoodieMetadata()).orElse(false),
                instantTime))
            .collect(Collectors.toList()))
        .join()
        .stream();
    JavaRDD<WriteStatus>[] writeStatuses = convertStreamToArray(writeStatusesStream.map(HoodieJavaRDD::getJavaRDD));
    JavaRDD<WriteStatus> writeStatusRDD = engineContext.union(writeStatuses);
    HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
    writeMetadata.setWriteStatuses(HoodieJavaRDD.of(writeStatusRDD));
    return writeMetadata;
}
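
FutureUtils.allOf turns a list of per-group futures into one future of a list, so all clustering groups run concurrently and are joined exactly once. A hedged sketch of that combining pattern on top of CompletableFuture follows; it is assumed to match the spirit of org.apache.hudi.common.util.FutureUtils.allOf, not its exact code.

import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public class AllOfExample {
    static <T> CompletableFuture<List<T>> allOf(List<CompletableFuture<T>> futures) {
        return CompletableFuture
            .allOf(futures.toArray(new CompletableFuture[0]))
            // All inputs are complete here, so join() below cannot block.
            .thenApply(ignored -> futures.stream()
                .map(CompletableFuture::join)
                .collect(Collectors.toList()));
    }

    public static void main(String[] args) {
        // One async "clustering group" per input, as in runClusteringForGroupAsync.
        List<CompletableFuture<String>> groups = IntStream.range(0, 3)
            .mapToObj(i -> CompletableFuture.supplyAsync(() -> "group-" + i + " done"))
            .collect(Collectors.toList());
        allOf(groups).join().forEach(System.out::println);
    }
}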

Aggregations

HoodieData (org.apache.hudi.common.data.HoodieData): 36 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 24 usages
WriteStatus (org.apache.hudi.client.WriteStatus): 22 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 22 usages
List (java.util.List): 21 usages
HoodieTable (org.apache.hudi.table.HoodieTable): 20 usages
HoodieKey (org.apache.hudi.common.model.HoodieKey): 18 usages
LogManager (org.apache.log4j.LogManager): 18 usages
Logger (org.apache.log4j.Logger): 18 usages
IOException (java.io.IOException): 17 usages
Collectors (java.util.stream.Collectors): 17 usages
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 17 usages
Option (org.apache.hudi.common.util.Option): 17 usages
Map (java.util.Map): 16 usages
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 16 usages
HoodieWriteMetadata (org.apache.hudi.table.action.HoodieWriteMetadata): 16 usages
JavaRDD (org.apache.spark.api.java.JavaRDD): 16 usages
Pair (org.apache.hudi.common.util.collection.Pair): 15 usages
HoodieJavaRDD (org.apache.hudi.data.HoodieJavaRDD): 15 usages
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload): 14 usages