Search in sources :

Example 1 with ClusteringGroupInfo

Use of org.apache.hudi.common.model.ClusteringGroupInfo in project hudi by apache.

The method performClustering of the class SingleSparkJobExecutionStrategy.

/**
 * Runs clustering for every input group of the given plan within one Spark job.
 *
 * <p>Each input group of {@code clusteringPlan} is converted to a {@link ClusteringGroupInfo},
 * the resulting list is parallelized with one Spark partition per group, and
 * {@code runClusteringForGroup} is invoked for every group found in a partition. The write
 * statuses of all groups are returned wrapped in a {@link HoodieWriteMetadata}.
 *
 * <p>A plan without input groups yields an empty set of write statuses.
 *
 * @param clusteringPlan plan supplying the input groups, the strategy params and the
 *                       preserve-hoodie-metadata flag (treated as {@code false} when null)
 * @param schema         Avro schema; wrapped in a {@link SerializableSchema} before being passed on
 * @param instantTime    instant time forwarded unchanged to {@code runClusteringForGroup}
 * @return write metadata holding the RDD of write statuses produced for the groups
 */
@Override
public HoodieWriteMetadata<HoodieData<WriteStatus>> performClustering(final HoodieClusteringPlan clusteringPlan, final Schema schema, final String instantTime) {
    JavaSparkContext engineContext = HoodieSparkEngineContext.getSparkContext(getEngineContext());
    final TaskContextSupplier taskContextSupplier = getEngineContext().getTaskContextSupplier();
    final SerializableSchema serializableSchema = new SerializableSchema(schema);
    final List<ClusteringGroupInfo> clusteringGroupInfos = clusteringPlan.getInputGroups().stream()
            .map(ClusteringGroupInfo::create)
            .collect(Collectors.toList());

    // The driver-side umask setting is broadcast so that every task can read the same value.
    String umask = engineContext.hadoopConfiguration().get("fs.permissions.umask-mode");
    Broadcast<String> umaskBroadcastValue = engineContext.broadcast(umask);

    // One partition per clustering group. Spark rejects a non-positive slice count, so an
    // empty plan is mapped onto a single (empty) partition instead of failing.
    final int parallelism = Math.max(1, clusteringGroupInfos.size());
    JavaRDD<ClusteringGroupInfo> groupInfoJavaRDD = engineContext.parallelize(clusteringGroupInfos, parallelism);
    LOG.info("number of partitions for clustering " + groupInfoJavaRDD.getNumPartitions());

    JavaRDD<WriteStatus> writeStatusRDD = groupInfoJavaRDD.mapPartitions(clusteringOps -> {
        // NOTE(review): this Configuration is not referenced again inside this lambda;
        // confirm whether it is still required.
        Configuration configuration = new Configuration();
        configuration.set("fs.permissions.umask-mode", umaskBroadcastValue.getValue());

        // Materialize the partition's groups before running clustering on each of them.
        Iterable<ClusteringGroupInfo> clusteringOpsIterable = () -> clusteringOps;
        List<ClusteringGroupInfo> groupsInPartition = StreamSupport.stream(clusteringOpsIterable.spliterator(), false).collect(Collectors.toList());
        return groupsInPartition.stream().flatMap(clusteringOp -> runClusteringForGroup(clusteringOp, clusteringPlan.getStrategy().getStrategyParams(), Option.ofNullable(clusteringPlan.getPreserveHoodieMetadata()).orElse(false), serializableSchema, taskContextSupplier, instantTime)).iterator();
    });

    HoodieWriteMetadata<HoodieData<WriteStatus>> writeMetadata = new HoodieWriteMetadata<>();
    writeMetadata.setWriteStatuses(HoodieJavaRDD.of(writeStatusRDD));
    return writeMetadata;
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) KeyGenUtils(org.apache.hudi.keygen.KeyGenUtils) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) RewriteAvroPayload(org.apache.hudi.common.model.RewriteAvroPayload) ConcatenatingIterator(org.apache.hudi.client.utils.ConcatenatingIterator) SerializableSchema(org.apache.hudi.common.config.SerializableSchema) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) Logger(org.apache.log4j.Logger) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) StreamSupport(java.util.stream.StreamSupport) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) ClusteringExecutionStrategy(org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy) IndexedRecord(org.apache.avro.generic.IndexedRecord) JavaRDD(org.apache.spark.api.java.JavaRDD) Broadcast(org.apache.spark.broadcast.Broadcast) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieData(org.apache.hudi.common.data.HoodieData) Schema(org.apache.avro.Schema) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) TaskContextSupplier(org.apache.hudi.common.engine.TaskContextSupplier) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) 
HoodieClusteringException(org.apache.hudi.exception.HoodieClusteringException) ClusteringOperation(org.apache.hudi.common.model.ClusteringOperation) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) WriteStatus(org.apache.hudi.client.WriteStatus) ClusteringGroupInfo(org.apache.hudi.common.model.ClusteringGroupInfo) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) List(java.util.List) Stream(java.util.stream.Stream) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) HoodieData(org.apache.hudi.common.data.HoodieData) Configuration(org.apache.hadoop.conf.Configuration) ClusteringGroupInfo(org.apache.hudi.common.model.ClusteringGroupInfo) TaskContextSupplier(org.apache.hudi.common.engine.TaskContextSupplier) SerializableSchema(org.apache.hudi.common.config.SerializableSchema) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) WriteStatus(org.apache.hudi.client.WriteStatus)

Aggregations

IOException (java.io.IOException)1 Iterator (java.util.Iterator)1 List (java.util.List)1 Map (java.util.Map)1 Collectors (java.util.stream.Collectors)1 Stream (java.util.stream.Stream)1 StreamSupport (java.util.stream.StreamSupport)1 Schema (org.apache.avro.Schema)1 GenericRecord (org.apache.avro.generic.GenericRecord)1 IndexedRecord (org.apache.avro.generic.IndexedRecord)1 Configuration (org.apache.hadoop.conf.Configuration)1 Path (org.apache.hadoop.fs.Path)1 HoodieAvroUtils (org.apache.hudi.avro.HoodieAvroUtils)1 HoodieClusteringPlan (org.apache.hudi.avro.model.HoodieClusteringPlan)1 WriteStatus (org.apache.hudi.client.WriteStatus)1 HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext)1 ConcatenatingIterator (org.apache.hudi.client.utils.ConcatenatingIterator)1 SerializableSchema (org.apache.hudi.common.config.SerializableSchema)1 TypedProperties (org.apache.hudi.common.config.TypedProperties)1 HoodieData (org.apache.hudi.common.data.HoodieData)1