
Example 56 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

From the class ITTestCompactionCommand, the method generateCommits:

private void generateCommits() throws IOException {
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    // Create the write client to write some records in
    HoodieWriteConfig cfg = HoodieWriteConfig.newBuilder()
        .withPath(tablePath)
        .withSchema(HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)
        .withParallelism(2, 2)
        .withDeleteParallelism(2)
        .forTable(tableName)
        .withIndexConfig(HoodieIndexConfig.newBuilder().withIndexType(HoodieIndex.IndexType.BLOOM).build())
        .build();
    SparkRDDWriteClient<HoodieAvroPayload> client = new SparkRDDWriteClient<>(new HoodieSparkEngineContext(jsc), cfg);
    // Drive the full write lifecycle: insert a batch, upsert it, then delete it
    List<HoodieRecord> records = insert(jsc, client, dataGen);
    upsert(jsc, client, dataGen, records);
    delete(jsc, client, records);
}
Also used: SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient), HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), HoodieAvroPayload (org.apache.hudi.common.model.HoodieAvroPayload)
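
The insert, upsert, and delete helpers called above belong to the surrounding test class and are not shown on this page. A minimal sketch of what they might look like, assuming the standard SparkRDDWriteClient and HoodieTestDataGenerator APIs (the record count and RDD parallelism are illustrative; HoodieKey comes from org.apache.hudi.common.model):

private List<HoodieRecord> insert(JavaSparkContext jsc, SparkRDDWriteClient<HoodieAvroPayload> client, HoodieTestDataGenerator dataGen) throws IOException {
    // Begin a new commit and insert a batch of generated trip records
    String commitTime = client.startCommit();
    List<HoodieRecord> records = dataGen.generateInserts(commitTime, 100);
    client.insert(jsc.parallelize(records, 2), commitTime);
    return records;
}

private void upsert(JavaSparkContext jsc, SparkRDDWriteClient<HoodieAvroPayload> client, HoodieTestDataGenerator dataGen, List<HoodieRecord> records) throws IOException {
    // Update the previously inserted records in a second commit
    String commitTime = client.startCommit();
    List<HoodieRecord> updates = dataGen.generateUpdates(commitTime, records);
    client.upsert(jsc.parallelize(updates, 2), commitTime);
}

private void delete(JavaSparkContext jsc, SparkRDDWriteClient<HoodieAvroPayload> client, List<HoodieRecord> records) {
    // Delete by key in a third commit, producing the multi-commit timeline the compaction test needs
    String commitTime = client.startCommit();
    List<HoodieKey> keys = records.stream().map(HoodieRecord::getKey).collect(Collectors.toList());
    client.delete(jsc.parallelize(keys, 2), commitTime);
}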

Example 57 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

From the class HoodieSparkCopyOnWriteTable, the method updateColumnsStatsIndex:

private void updateColumnsStatsIndex(@Nonnull HoodieEngineContext context, @Nonnull List<HoodieWriteStat> updatedFilesStats, @Nonnull String instantTime) throws Exception {
    String sortColsList = config.getClusteringSortColumns();
    String basePath = metaClient.getBasePath();
    String indexPath = metaClient.getColumnStatsIndexPath();
    List<String> touchedFiles = updatedFilesStats.stream()
        .map(s -> new Path(basePath, s.getPath()).toString())
        .collect(Collectors.toList());
    // Nothing to index if no files were touched, or if the sort columns / index path are not configured
    if (touchedFiles.isEmpty() || StringUtils.isNullOrEmpty(sortColsList) || StringUtils.isNullOrEmpty(indexPath)) {
        return;
    }
    LOG.info(String.format("Updating column-statistics index table (%s)", indexPath));
    List<String> sortCols = Arrays.stream(sortColsList.split(",")).map(String::trim).collect(Collectors.toList());
    HoodieSparkEngineContext sparkEngineContext = (HoodieSparkEngineContext) context;
    // Fetch table schema to appropriately construct col-stats index schema
    Schema tableWriteSchema = HoodieAvroUtils.createHoodieWriteSchema(
        new TableSchemaResolver(metaClient).getTableAvroSchemaWithoutMetadataFields());
    List<String> completedCommits = metaClient.getCommitsTimeline()
        .filterCompletedInstants()
        .getInstants()
        .map(HoodieInstant::getTimestamp)
        .collect(Collectors.toList());
    ColumnStatsIndexHelper.updateColumnStatsIndexFor(
        sparkEngineContext.getSqlContext().sparkSession(),
        AvroConversionUtils.convertAvroSchemaToStructType(tableWriteSchema),
        touchedFiles, sortCols, indexPath, instantTime, completedCommits);
    LOG.info(String.format("Successfully updated column-statistics index at instant (%s)", instantTime));
}
Also used: SparkDeletePartitionCommitActionExecutor (org.apache.hudi.table.action.commit.SparkDeletePartitionCommitActionExecutor), Arrays (java.util.Arrays), SparkExecuteClusteringCommitActionExecutor (org.apache.hudi.table.action.cluster.SparkExecuteClusteringCommitActionExecutor), HoodieRestorePlan (org.apache.hudi.avro.model.HoodieRestorePlan), HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), HoodieUpsertException (org.apache.hudi.exception.HoodieUpsertException), SavepointActionExecutor (org.apache.hudi.table.action.savepoint.SavepointActionExecutor), BaseKeyGenerator (org.apache.hudi.keygen.BaseKeyGenerator), HoodieSavepointMetadata (org.apache.hudi.avro.model.HoodieSavepointMetadata), Logger (org.apache.log4j.Logger), HoodieMergeHandle (org.apache.hudi.io.HoodieMergeHandle), Map (java.util.Map), HoodieRollbackMetadata (org.apache.hudi.avro.model.HoodieRollbackMetadata), HoodieSortedMergeHandle (org.apache.hudi.io.HoodieSortedMergeHandle), Path (org.apache.hadoop.fs.Path), HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext), HoodieWriteMetadata (org.apache.hudi.table.action.HoodieWriteMetadata), HoodieSparkKeyGeneratorFactory (org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory), ColumnStatsIndexHelper (org.apache.hudi.index.columnstats.ColumnStatsIndexHelper), RestorePlanActionExecutor (org.apache.hudi.table.action.rollback.RestorePlanActionExecutor), Schema (org.apache.avro.Schema), SparkUpsertPreppedCommitActionExecutor (org.apache.hudi.table.action.commit.SparkUpsertPreppedCommitActionExecutor), HoodieCleanerPlan (org.apache.hudi.avro.model.HoodieCleanerPlan), HoodieClusteringPlan (org.apache.hudi.avro.model.HoodieClusteringPlan), HoodieCreateHandle (org.apache.hudi.io.HoodieCreateHandle), CleanActionExecutor (org.apache.hudi.table.action.clean.CleanActionExecutor), HoodieRollbackPlan (org.apache.hudi.avro.model.HoodieRollbackPlan), Collectors (java.util.stream.Collectors), BaseRollbackPlanActionExecutor (org.apache.hudi.table.action.rollback.BaseRollbackPlanActionExecutor), HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile), List (java.util.List), HoodieWriteStat (org.apache.hudi.common.model.HoodieWriteStat), SparkBulkInsertCommitActionExecutor (org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor), HoodieCompactionPlan (org.apache.hudi.avro.model.HoodieCompactionPlan), HoodieRestoreMetadata (org.apache.hudi.avro.model.HoodieRestoreMetadata), CopyOnWriteRollbackActionExecutor (org.apache.hudi.table.action.rollback.CopyOnWriteRollbackActionExecutor), HoodieBootstrapWriteMetadata (org.apache.hudi.table.action.bootstrap.HoodieBootstrapWriteMetadata), HoodieAvroUtils (org.apache.hudi.avro.HoodieAvroUtils), HoodieMergeHelper (org.apache.hudi.table.action.commit.HoodieMergeHelper), SparkBulkInsertPreppedCommitActionExecutor (org.apache.hudi.table.action.commit.SparkBulkInsertPreppedCommitActionExecutor), AvroConversionUtils (org.apache.hudi.AvroConversionUtils), Option (org.apache.hudi.common.util.Option), HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext), StringUtils (org.apache.hudi.common.util.StringUtils), SparkInsertPreppedCommitActionExecutor (org.apache.hudi.table.action.commit.SparkInsertPreppedCommitActionExecutor), HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient), CleanPlanActionExecutor (org.apache.hudi.table.action.clean.CleanPlanActionExecutor), SparkUpsertCommitActionExecutor (org.apache.hudi.table.action.commit.SparkUpsertCommitActionExecutor), SparkInsertCommitActionExecutor (org.apache.hudi.table.action.commit.SparkInsertCommitActionExecutor), Nonnull (javax.annotation.Nonnull), HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline), HoodieNotSupportedException (org.apache.hudi.exception.HoodieNotSupportedException), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), ClusteringPlanActionExecutor (org.apache.hudi.table.action.cluster.ClusteringPlanActionExecutor), HoodieData (org.apache.hudi.common.data.HoodieData), TableSchemaResolver (org.apache.hudi.common.table.TableSchemaResolver), TypedProperties (org.apache.hudi.common.config.TypedProperties), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), Iterator (java.util.Iterator), SparkDeleteCommitActionExecutor (org.apache.hudi.table.action.commit.SparkDeleteCommitActionExecutor), IOException (java.io.IOException), SparkBootstrapCommitActionExecutor (org.apache.hudi.table.action.bootstrap.SparkBootstrapCommitActionExecutor), SparkInsertOverwriteCommitActionExecutor (org.apache.hudi.table.action.commit.SparkInsertOverwriteCommitActionExecutor), CopyOnWriteRestoreActionExecutor (org.apache.hudi.table.action.restore.CopyOnWriteRestoreActionExecutor), WriteStatus (org.apache.hudi.client.WriteStatus), HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload), HoodieCleanMetadata (org.apache.hudi.avro.model.HoodieCleanMetadata), HoodieKey (org.apache.hudi.common.model.HoodieKey), HoodieIOException (org.apache.hudi.exception.HoodieIOException), SparkInsertOverwriteTableCommitActionExecutor (org.apache.hudi.table.action.commit.SparkInsertOverwriteTableCommitActionExecutor), LogManager (org.apache.log4j.LogManager), Collections (java.util.Collections)
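
The method receives the engine-agnostic HoodieEngineContext and downcasts it to HoodieSparkEngineContext to reach the underlying SparkSession. A minimal sketch of how such a context is typically constructed (the application name and master are placeholder values; SparkConf and SparkSession come from org.apache.spark):

// Illustrative setup only: appName and master are placeholders
SparkConf sparkConf = new SparkConf().setAppName("hudi-col-stats-demo").setMaster("local[2]");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
HoodieSparkEngineContext sparkEngineContext = new HoodieSparkEngineContext(jsc);
// The Spark-specific context exposes the SQL entry point the generic HoodieEngineContext lacks
SparkSession spark = sparkEngineContext.getSqlContext().sparkSession();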

Example 58 with HoodieSparkEngineContext

Use of org.apache.hudi.client.common.HoodieSparkEngineContext in project hudi by apache.

From the class SparkRDDWriteClient, the method initWrapperFSMetrics:

@Override
protected void initWrapperFSMetrics() {
    if (config.isMetricsOn()) {
        Registry registry;
        Registry registryMeta;
        JavaSparkContext jsc = ((HoodieSparkEngineContext) context).getJavaSparkContext();
        if (config.isExecutorMetricsEnabled()) {
            // Create a distributed registry for HoodieWrapperFileSystem
            registry = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName(), DistributedRegistry.class.getName());
            ((DistributedRegistry) registry).register(jsc);
            registryMeta = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName() + "MetaFolder", DistributedRegistry.class.getName());
            ((DistributedRegistry) registryMeta).register(jsc);
        } else {
            // Executor metrics disabled: use local in-memory registries on the driver
            registry = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName());
            registryMeta = Registry.getRegistry(HoodieWrapperFileSystem.class.getSimpleName() + "MetaFolder");
        }
        HoodieWrapperFileSystem.setMetricsRegistry(registry, registryMeta);
    }
}
Also used: HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext), DistributedRegistry (org.apache.hudi.metrics.DistributedRegistry), HoodieWrapperFileSystem (org.apache.hudi.common.fs.HoodieWrapperFileSystem), Registry (org.apache.hudi.common.metrics.Registry), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)
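
Which branch runs is decided purely by the write config: config.isMetricsOn() gates the whole method, and config.isExecutorMetricsEnabled() selects the distributed registries. A hedged sketch of a configuration that would exercise the DistributedRegistry branch, assuming Hudi's metrics property keys (the table path and name are placeholders; verify the keys against your Hudi version):

Properties props = new Properties();
props.setProperty("hoodie.metrics.on", "true"); // assumed key gating initWrapperFSMetrics()
props.setProperty("hoodie.metrics.executor.enable", "true"); // assumed key selecting the DistributedRegistry branch
HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
    .withPath("/tmp/hudi-metrics-demo") // placeholder table path
    .forTable("metrics_demo") // placeholder table name
    .withProps(props)
    .build();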

Aggregations

HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext) — 58
Path (org.apache.hadoop.fs.Path) — 25
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient) — 24
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) — 23
ArrayList (java.util.ArrayList) — 19
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) — 19
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) — 17
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) — 17
WriteStatus (org.apache.hudi.client.WriteStatus) — 15
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest) — 15
IOException (java.io.IOException) — 14
List (java.util.List) — 14
Option (org.apache.hudi.common.util.Option) — 14
LogManager (org.apache.log4j.LogManager) — 14
Logger (org.apache.log4j.Logger) — 14
Test (org.junit.jupiter.api.Test) — 14
Collectors (java.util.stream.Collectors) — 12
FileStatus (org.apache.hadoop.fs.FileStatus) — 12
FileSystem (org.apache.hadoop.fs.FileSystem) — 12
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext) — 11