
Example 31 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

The class UpsertPartitioner, method getSmallFilesForPartitions:

private Map<String, List<SmallFile>> getSmallFilesForPartitions(List<String> partitionPaths, HoodieEngineContext context) {
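    // Unwrap the Spark-specific engine context to get the underlying JavaSparkContext.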
    JavaSparkContext jsc = HoodieSparkEngineContext.getSparkContext(context);
    Map<String, List<SmallFile>> partitionSmallFilesMap = new HashMap<>();
    if (config.getParquetSmallFileLimit() <= 0) {
        return partitionSmallFilesMap;
    }
    if (partitionPaths != null && partitionPaths.size() > 0) {
        context.setJobStatus(this.getClass().getSimpleName(), "Getting small files from partitions");
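        // Fan out with one Spark task per partition path.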
        JavaRDD<String> partitionPathRdds = jsc.parallelize(partitionPaths, partitionPaths.size());
        partitionSmallFilesMap = partitionPathRdds.mapToPair((PairFunction<String, String, List<SmallFile>>) partitionPath -> new Tuple2<>(partitionPath, getSmallFiles(partitionPath))).collectAsMap();
    }
    return partitionSmallFilesMap;
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) Map(java.util.Map) WorkloadProfile(org.apache.hudi.table.WorkloadProfile) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) WorkloadStat(org.apache.hudi.table.WorkloadStat) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) Set(java.util.Set) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Entry(java.util.Map.Entry) HoodieKey(org.apache.hudi.common.model.HoodieKey) LogManager(org.apache.log4j.LogManager) PairFunction(org.apache.spark.api.java.function.PairFunction) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) NumericUtils(org.apache.hudi.common.util.NumericUtils) Pair(org.apache.hudi.common.util.collection.Pair)
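
For context, a minimal self-contained sketch of the same fan-out pattern: one Spark task per partition path, with the per-partition results collected back into a map. Everything here (the class name, paths, and the stand-in for getSmallFiles) is illustrative, not Hudi API:

import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

public class SmallFileScanSketch {
    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext(
                new SparkConf().setAppName("small-file-scan").setMaster("local[2]"));
        List<String> partitionPaths = Arrays.asList("2022/01/01", "2022/01/02");
        // One Spark partition per Hudi partition path, mirroring the snippet above.
        Map<String, Integer> result = jsc
                .parallelize(partitionPaths, partitionPaths.size())
                .mapToPair((PairFunction<String, String, Integer>) p ->
                        new Tuple2<>(p, p.length())) // stand-in for getSmallFiles(partitionPath)
                .collectAsMap();
        result.forEach((p, n) -> System.out.println(p + " -> " + n));
        jsc.stop();
    }
}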

Example 32 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

The class FSUtils, method parallelizeFilesProcess:

public static <T> Map<String, T> parallelizeFilesProcess(HoodieEngineContext hoodieEngineContext, FileSystem fs, int parallelism, SerializableFunction<Pair<String, SerializableConfiguration>, T> pairFunction, List<String> subPaths) {
    Map<String, T> result = new HashMap<>();
    if (subPaths.size() > 0) {
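        // Hadoop's Configuration is not Serializable; wrap it so it can ship with the tasks.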
        SerializableConfiguration conf = new SerializableConfiguration(fs.getConf());
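        // Never request more parallelism than there are sub-paths to process.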
        int actualParallelism = Math.min(subPaths.size(), parallelism);
        result = hoodieEngineContext.mapToPair(subPaths, subPath -> new ImmutablePair<>(subPath, pairFunction.apply(new ImmutablePair<>(subPath, conf))), actualParallelism);
    }
    return result;
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Arrays(java.util.Arrays) InvalidHoodiePathException(org.apache.hudi.exception.InvalidHoodiePathException) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) PathFilter(org.apache.hadoop.fs.PathFilter) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) Matcher(java.util.regex.Matcher) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) Predicate(java.util.function.Predicate) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Set(java.util.Set) IOException(java.io.IOException) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) File(java.io.File) FileNotFoundException(java.io.FileNotFoundException) Serializable(java.io.Serializable) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodiePartitionMetadata(org.apache.hudi.common.model.HoodiePartitionMetadata) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) Entry(java.util.Map.Entry) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Pattern(java.util.regex.Pattern) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) Pair(org.apache.hudi.common.util.collection.Pair)
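
A hedged usage sketch for this helper: checking in parallel whether each sub-path still exists. The sub-paths and class name are placeholders, and the sketch assumes a live HoodieEngineContext and FileSystem are already in hand; note how the function rebuilds a FileSystem on the executor from the serialized configuration:

import java.util.Arrays;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;

public class ParallelFileCheckSketch {
    static Map<String, Boolean> checkSubPaths(HoodieEngineContext context, FileSystem fs) {
        List<String> subPaths = Arrays.asList("/tmp/hudi/p1", "/tmp/hudi/p2"); // placeholders
        return FSUtils.parallelizeFilesProcess(context, fs, 2,
                pair -> {
                    // Rebuild a FileSystem handle from the shipped configuration.
                    FileSystem taskFs = FSUtils.getFs(pair.getLeft(), pair.getRight().get());
                    return taskFs.exists(new Path(pair.getLeft()));
                },
                subPaths);
    }
}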

Example 33 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

The class FSUtils, method parallelizeSubPathProcess:

/**
 * Processes the sub-paths of a directory in parallel.
 *
 * @param hoodieEngineContext {@code HoodieEngineContext} instance
 * @param fs file system
 * @param dirPath directory path
 * @param parallelism parallelism to use for sub-paths
 * @param subPathPredicate predicate to use to filter sub-paths for processing
 * @param pairFunction actual processing logic for each sub-path
 * @param <T> type of result to return for each sub-path
 * @return a map of sub-path to result of the processing
 */
public static <T> Map<String, T> parallelizeSubPathProcess(HoodieEngineContext hoodieEngineContext, FileSystem fs, Path dirPath, int parallelism, Predicate<FileStatus> subPathPredicate, SerializableFunction<Pair<String, SerializableConfiguration>, T> pairFunction) {
    Map<String, T> result = new HashMap<>();
    try {
        FileStatus[] fileStatuses = fs.listStatus(dirPath);
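        // Keep only the sub-paths accepted by the caller's predicate.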
        List<String> subPaths = Arrays.stream(fileStatuses).filter(subPathPredicate).map(fileStatus -> fileStatus.getPath().toString()).collect(Collectors.toList());
        result = parallelizeFilesProcess(hoodieEngineContext, fs, parallelism, pairFunction, subPaths);
    } catch (IOException ioe) {
        throw new HoodieIOException(ioe.getMessage(), ioe);
    }
    return result;
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Arrays(java.util.Arrays) InvalidHoodiePathException(org.apache.hudi.exception.InvalidHoodiePathException) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) PathFilter(org.apache.hadoop.fs.PathFilter) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) Matcher(java.util.regex.Matcher) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) Predicate(java.util.function.Predicate) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Set(java.util.Set) IOException(java.io.IOException) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) File(java.io.File) FileNotFoundException(java.io.FileNotFoundException) Serializable(java.io.Serializable) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodiePartitionMetadata(org.apache.hudi.common.model.HoodiePartitionMetadata) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) Entry(java.util.Map.Entry) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Pattern(java.util.regex.Pattern) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) Pair(org.apache.hudi.common.util.collection.Pair)
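
Building on the previous sketch, a hedged example of this predicate-filtering wrapper: counting the entries directly under each subdirectory of a directory. The directory path and class name are placeholders:

import java.util.Map;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.common.fs.FSUtils;

public class SubPathCountSketch {
    static Map<String, Integer> countPerSubDir(HoodieEngineContext context, FileSystem fs) {
        return FSUtils.parallelizeSubPathProcess(context, fs,
                new Path("/tmp/hudi/some-dir"), // placeholder directory
                2,
                fileStatus -> fileStatus.isDirectory(), // process only subdirectories
                pair -> {
                    FileSystem taskFs = FSUtils.getFs(pair.getLeft(), pair.getRight().get());
                    return taskFs.listStatus(new Path(pair.getLeft())).length;
                });
    }
}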

Example 34 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

The class SparkMain, method deleteMarker:

protected static int deleteMarker(JavaSparkContext jsc, String instantTime, String basePath) {
    try {
        SparkRDDWriteClient client = createHoodieClient(jsc, basePath);
        HoodieWriteConfig config = client.getConfig();
        HoodieEngineContext context = client.getEngineContext();
        HoodieSparkTable table = HoodieSparkTable.create(config, context, true);
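        // Delete the marker directory for this instant; the "quiet" variant avoids failing the job on errors.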
        WriteMarkersFactory.get(config.getMarkersType(), table, instantTime).quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism());
        return 0;
    } catch (Exception e) {
        LOG.warn(String.format("Failed: Could not clean marker instantTime: \"%s\".", instantTime), e);
        return -1;
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieSavepointException(org.apache.hudi.exception.HoodieSavepointException) IOException(java.io.IOException) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable)
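
Stripped of the CLI scaffolding (client construction, logging, exit codes), the reusable marker-cleanup call chain looks roughly like this. The class and method names of the sketch are hypothetical, and the import packages are assumptions; the Hudi calls themselves mirror the example above:

import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.common.engine.HoodieEngineContext;
import org.apache.hudi.config.HoodieWriteConfig;
import org.apache.hudi.table.HoodieSparkTable;
import org.apache.hudi.table.marker.WriteMarkersFactory;

public class MarkerCleanupSketch {
    static void deleteMarkers(SparkRDDWriteClient client, String instantTime) {
        HoodieWriteConfig config = client.getConfig();
        HoodieEngineContext context = client.getEngineContext();
        HoodieSparkTable table = HoodieSparkTable.create(config, context, true);
        // Pick the marker implementation configured for the table, then delete in parallel.
        WriteMarkersFactory.get(config.getMarkersType(), table, instantTime)
                .quietDeleteMarkerDir(context, config.getMarkersDeleteParallelism());
    }
}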

Example 35 with HoodieEngineContext

Use of org.apache.hudi.common.engine.HoodieEngineContext in project hudi by apache.

The class HoodieSparkCopyOnWriteTable, method updateColumnsStatsIndex:

private void updateColumnsStatsIndex(@Nonnull HoodieEngineContext context, @Nonnull List<HoodieWriteStat> updatedFilesStats, @Nonnull String instantTime) throws Exception {
    String sortColsList = config.getClusteringSortColumns();
    String basePath = metaClient.getBasePath();
    String indexPath = metaClient.getColumnStatsIndexPath();
    List<String> touchedFiles = updatedFilesStats.stream().map(s -> new Path(basePath, s.getPath()).toString()).collect(Collectors.toList());
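    // Nothing to update unless files changed and both sort columns and an index path are configured.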
    if (touchedFiles.isEmpty() || StringUtils.isNullOrEmpty(sortColsList) || StringUtils.isNullOrEmpty(indexPath)) {
        return;
    }
    LOG.info(String.format("Updating column-statistics index table (%s)", indexPath));
    List<String> sortCols = Arrays.stream(sortColsList.split(",")).map(String::trim).collect(Collectors.toList());
    HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext) context;
    // Fetch table schema to appropriately construct col-stats index schema
    Schema tableWriteSchema = HoodieAvroUtils.createHoodieWriteSchema(new TableSchemaResolver(metaClient).getTableAvroSchemaWithoutMetadataFields());
    List<String> completedCommits = metaClient.getCommitsTimeline().filterCompletedInstants().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toList());
    ColumnStatsIndexHelper.updateColumnStatsIndexFor(sparkEngineContext.getSqlContext().sparkSession(), AvroConversionUtils.convertAvroSchemaToStructType(tableWriteSchema), touchedFiles, sortCols, indexPath, instantTime, completedCommits);
    LOG.info(String.format("Successfully updated column-statistics index at instant (%s)", instantTime));
}
Also used : SparkDeletePartitionCommitActionExecutor(org.apache.hudi.table.action.commit.SparkDeletePartitionCommitActionExecutor) Arrays(java.util.Arrays) SparkExecuteClusteringCommitActionExecutor(org.apache.hudi.table.action.cluster.SparkExecuteClusteringCommitActionExecutor) HoodieRestorePlan(org.apache.hudi.avro.model.HoodieRestorePlan) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieUpsertException(org.apache.hudi.exception.HoodieUpsertException) SavepointActionExecutor(org.apache.hudi.table.action.savepoint.SavepointActionExecutor) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) HoodieSavepointMetadata(org.apache.hudi.avro.model.HoodieSavepointMetadata) Logger(org.apache.log4j.Logger) HoodieMergeHandle(org.apache.hudi.io.HoodieMergeHandle) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) HoodieSortedMergeHandle(org.apache.hudi.io.HoodieSortedMergeHandle) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) ColumnStatsIndexHelper(org.apache.hudi.index.columnstats.ColumnStatsIndexHelper) RestorePlanActionExecutor(org.apache.hudi.table.action.rollback.RestorePlanActionExecutor) Schema(org.apache.avro.Schema) SparkUpsertPreppedCommitActionExecutor(org.apache.hudi.table.action.commit.SparkUpsertPreppedCommitActionExecutor) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) HoodieCreateHandle(org.apache.hudi.io.HoodieCreateHandle) CleanActionExecutor(org.apache.hudi.table.action.clean.CleanActionExecutor) HoodieRollbackPlan(org.apache.hudi.avro.model.HoodieRollbackPlan) Collectors(java.util.stream.Collectors) BaseRollbackPlanActionExecutor(org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) SparkBulkInsertCommitActionExecutor(org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) CopyOnWriteRollbackActionExecutor(org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor) HoodieBootstrapWriteMetadata(org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) HoodieMergeHelper(org.apache.hudi.table.action.commit.HoodieMergeHelper) SparkBulkInsertPreppedCommitActionExecutor(org.apache.hudi.table.action.commit.SparkBulkInsertPreppedCommitActionExecutor) AvroConversionUtils(org.apache.hudi.AvroConversionUtils) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) StringUtils(org.apache.hudi.common.util.StringUtils) SparkInsertPreppedCommitActionExecutor(org.apache.hudi.table.action.commit.SparkInsertPreppedCommitActionExecutor) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) CleanPlanActionExecutor(org.apache.hudi.table.action.clean.CleanPlanActionExecutor) SparkUpsertCommitActionExecutor(org.apache.hudi.table.action.commit.SparkUpsertCommitActionExecutor) SparkInsertCommitActionExecutor(org.apache.hudi.table.action.commit.SparkInsertCommitActionExecutor) Nonnull(javax.annotation.Nonnull) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieNotSupportedException(org.apache.hudi.exception.HoodieNotSupportedException) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ClusteringPlanActionExecutor(org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor) HoodieData(org.apache.hudi.common.data.HoodieData) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) SparkDeleteCommitActionExecutor(org.apache.hudi.table.action.commit.SparkDeleteCommitActionExecutor) IOException(java.io.IOException) SparkBootstrapCommitActionExecutor(org.apache.hudi.table.action.bootstrap.SparkBootstrapCommitActionExecutor) SparkInsertOverwriteCommitActionExecutor(org.apache.hudi.table.action.commit.SparkInsertOverwriteCommitActionExecutor) CopyOnWriteRestoreActionExecutor(org.apache.hudi.table.action.restore.CopyOnWriteRestoreActionExecutor) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) SparkInsertOverwriteTableCommitActionExecutor(org.apache.hudi.table.action.commit.SparkInsertOverwriteTableCommitActionExecutor) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections)
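
Two reusable sub-patterns from this method, isolated as a hedged sketch (assumes an initialized HoodieTableMetaClient; the class and method names of the sketch are illustrative):

import java.util.List;
import java.util.stream.Collectors;

import org.apache.avro.Schema;
import org.apache.hudi.avro.HoodieAvroUtils;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.TableSchemaResolver;
import org.apache.hudi.common.table.timeline.HoodieInstant;

public class TableIntrospectionSketch {
    static void inspect(HoodieTableMetaClient metaClient) throws Exception {
        // Table write schema (data schema plus Hudi metadata fields).
        Schema writeSchema = HoodieAvroUtils.createHoodieWriteSchema(
                new TableSchemaResolver(metaClient).getTableAvroSchemaWithoutMetadataFields());
        // Timestamps of all completed commits on the active timeline.
        List<String> completedCommits = metaClient.getCommitsTimeline()
                .filterCompletedInstants()
                .getInstants()
                .map(HoodieInstant::getTimestamp)
                .collect(Collectors.toList());
        System.out.println(writeSchema.getFields().size() + " schema fields, "
                + completedCommits.size() + " completed commits");
    }
}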

Aggregations

Top types co-occurring with HoodieEngineContext across its 36 indexed usages (type: usage count):

HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 36
List (java.util.List): 29
ArrayList (java.util.ArrayList): 27
IOException (java.io.IOException): 25
LogManager (org.apache.log4j.LogManager): 25
Logger (org.apache.log4j.Logger): 25
Map (java.util.Map): 23
Collectors (java.util.stream.Collectors): 23
Path (org.apache.hadoop.fs.Path): 23
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 23
Option (org.apache.hudi.common.util.Option): 23
FileSystem (org.apache.hadoop.fs.FileSystem): 21
Pair (org.apache.hudi.common.util.collection.Pair): 19
FSUtils (org.apache.hudi.common.fs.FSUtils): 18
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 18
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 18
HashMap (java.util.HashMap): 16
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 16
HoodieTable (org.apache.hudi.table.HoodieTable): 15
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 14