Example 31 with SerializableConfiguration

Use of org.apache.hudi.common.config.SerializableConfiguration in the Apache Hudi project.

From the class TestBucketAssigner, the method before:

@BeforeEach
public void before() throws IOException {
    final String basePath = tempFile.getAbsolutePath();
    conf = TestConfigurations.getDefaultConf(basePath);
    writeConfig = StreamerUtil.getHoodieClientConfig(conf);
    // Wrap the Hadoop configuration so the Flink engine context stays serializable.
    context = new HoodieFlinkEngineContext(new SerializableConfiguration(StreamerUtil.getHadoopConf()), new FlinkTaskContextSupplier(null));
    StreamerUtil.initTableIfNotExists(conf);
}
Also used : SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) HoodieFlinkEngineContext(org.apache.hudi.client.common.HoodieFlinkEngineContext) FlinkTaskContextSupplier(org.apache.hudi.client.FlinkTaskContextSupplier) BeforeEach(org.junit.jupiter.api.BeforeEach)
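Hadoop's Configuration class does not implement java.io.Serializable, which is why the engine context above receives it wrapped in SerializableConfiguration. Below is a minimal sketch of the round trip, assuming only the constructor and the get() accessor that the examples on this page already use; the configuration key is hypothetical.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;

public class SerializableConfigurationSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("example.key", "example-value"); // hypothetical key, for illustration only
        // Wrap the non-serializable Configuration and push it through Java serialization,
        // as would happen when it is shipped inside a Flink or Spark task.
        SerializableConfiguration wrapped = new SerializableConfiguration(conf);
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
            out.writeObject(wrapped);
        }
        try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
            SerializableConfiguration restored = (SerializableConfiguration) in.readObject();
            // get() unwraps the Hadoop Configuration on the receiving side.
            System.out.println(restored.get().get("example.key")); // prints "example-value"
        }
    }
}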

Example 32 with SerializableConfiguration

Use of org.apache.hudi.common.config.SerializableConfiguration in the Apache Hudi project.

From the class MultipleSparkJobExecutionStrategy, the method readRecordsForGroupBaseFiles:

/**
 * Read records from base files and convert them to an RDD.
 */
private HoodieData<HoodieRecord<T>> readRecordsForGroupBaseFiles(JavaSparkContext jsc, List<ClusteringOperation> clusteringOps) {
    SerializableConfiguration hadoopConf = new SerializableConfiguration(getHoodieTable().getHadoopConf());
    HoodieWriteConfig writeConfig = getWriteConfig();
    // Note: the needed fields are copied into locals above so the Spark closure does not
    // capture `this`, as that might lead to issues attempting to serialize its nested fields
    return HoodieJavaRDD.of(jsc.parallelize(clusteringOps, clusteringOps.size()).mapPartitions(clusteringOpsPartition -> {
        List<Iterator<IndexedRecord>> iteratorsForPartition = new ArrayList<>();
        clusteringOpsPartition.forEachRemaining(clusteringOp -> {
            try {
                Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(writeConfig.getSchema()));
                HoodieFileReader<IndexedRecord> baseFileReader = HoodieFileReaderFactory.getFileReader(hadoopConf.get(), new Path(clusteringOp.getDataFilePath()));
                iteratorsForPartition.add(baseFileReader.getRecordIterator(readerSchema));
            } catch (IOException e) {
                throw new HoodieClusteringException("Error reading input data for " + clusteringOp.getDataFilePath() + " and " + clusteringOp.getDeltaFilePaths(), e);
            }
        });
        return new ConcatenatingIterator<>(iteratorsForPartition);
    }).map(record -> transform(record, writeConfig)));
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) RewriteAvroPayload(org.apache.hudi.common.model.RewriteAvroPayload) ConcatenatingIterator(org.apache.hudi.client.utils.ConcatenatingIterator)
HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) SparkTaskContextSupplier(org.apache.hudi.client.SparkTaskContextSupplier) BaseKeyGenerator(org.apache.hudi.keygen.BaseKeyGenerator) Logger(org.apache.log4j.Logger)
RDDSpatialCurveSortPartitioner(org.apache.hudi.execution.bulkinsert.RDDSpatialCurveSortPartitioner) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) Map(java.util.Map) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext)
HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) PLAN_STRATEGY_SORT_COLUMNS(org.apache.hudi.config.HoodieClusteringConfig.PLAN_STRATEGY_SORT_COLUMNS)
HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) ClusteringOperation(org.apache.hudi.common.model.ClusteringOperation)
Collectors(java.util.stream.Collectors) List(java.util.List) Stream(java.util.stream.Stream) KeyGenUtils(org.apache.hudi.keygen.KeyGenUtils)
HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) CompletableFuture(java.util.concurrent.CompletableFuture)
HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) HoodieFileSliceReader.getFileSliceReader(org.apache.hudi.common.table.log.HoodieFileSliceReader.getFileSliceReader) ArrayList(java.util.ArrayList) StringUtils(org.apache.hudi.common.util.StringUtils)
HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) BulkInsertPartitioner(org.apache.hudi.table.BulkInsertPartitioner) ClusteringExecutionStrategy(org.apache.hudi.table.action.cluster.strategy.ClusteringExecutionStrategy) IndexedRecord(org.apache.avro.generic.IndexedRecord)
JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieData(org.apache.hudi.common.data.HoodieData)
HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) RDDCustomColumnsSortPartitioner(org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator)
FutureUtils(org.apache.hudi.common.util.FutureUtils) HoodieClusteringException(org.apache.hudi.exception.HoodieClusteringException) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord)
HoodieClusteringGroup(org.apache.hudi.avro.model.HoodieClusteringGroup) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration)
HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieClusteringConfig(org.apache.hudi.config.HoodieClusteringConfig) IOUtils(org.apache.hudi.io.IOUtils)
LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair)
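The local variables hadoopConf and writeConfig in the example above exist precisely so that the mapPartitions lambda captures serializable locals instead of the enclosing strategy object. A minimal sketch of that capture pattern follows; the class and method names are illustrative, not Hudi's.

import java.util.Arrays;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.spark.api.java.JavaSparkContext;

public class ClosureCaptureSketch {
    // A raw Hadoop Configuration is not Serializable, so it must never leak into a closure.
    private final Configuration rawConf = new Configuration();

    long tagWithDefaultFs(JavaSparkContext jsc) {
        // Copy the field into a serializable local before the lambda. A Java lambda
        // captures only the variables it references, so `this` (and its
        // non-serializable rawConf field) stays out of the shipped closure.
        SerializableConfiguration conf = new SerializableConfiguration(rawConf);
        return jsc.parallelize(Arrays.asList("a", "b", "c"))
                .map(s -> s + "@" + conf.get().get("fs.defaultFS", "file:///"))
                .count();
    }

    public static void main(String[] args) {
        try (JavaSparkContext jsc = new JavaSparkContext("local[2]", "closure-capture-sketch")) {
            System.out.println(new ClosureCaptureSketch().tagWithDefaultFs(jsc));
        }
    }
}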

Aggregations

SerializableConfiguration (org.apache.hudi.common.config.SerializableConfiguration): 32 uses
Path (org.apache.hadoop.fs.Path): 20 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 16 uses
FileStatus (org.apache.hadoop.fs.FileStatus): 15 uses
List (java.util.List): 14 uses
IOException (java.io.IOException): 13 uses
Collectors (java.util.stream.Collectors): 13 uses
Map (java.util.Map): 12 uses
Test (org.junit.jupiter.api.Test): 12 uses
ArrayList (java.util.ArrayList): 11 uses
LogManager (org.apache.log4j.LogManager): 10 uses
Logger (org.apache.log4j.Logger): 10 uses
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext): 9 uses
Option (org.apache.hudi.common.util.Option): 9 uses
Arrays (java.util.Arrays): 8 uses
HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext): 8 uses
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 8 uses
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 7 uses
Collections (java.util.Collections): 6 uses
Configuration (org.apache.hadoop.conf.Configuration): 6 uses