Search in sources :

Example 1 with ParquetReaderIterator

use of org.apache.hudi.common.util.ParquetReaderIterator in project hudi by apache.

In class ParquetBootstrapMetadataHandler, method executeBootstrap:

/**
 * Runs a bootstrap over a single source Parquet file.
 *
 * <p>Records are read from {@code sourceFilePath} and pushed through a
 * {@link BoundedInMemoryExecutor}. For each record the key generator supplies the record key,
 * which is stored in a fresh {@code GenericData.Record} built on
 * {@code HoodieAvroUtils.RECORD_KEY_SCHEMA}, wrapped in a {@link BootstrapRecordPayload} and
 * handed to a {@code BootstrapRecordConsumer} created over {@code bootstrapHandle}.
 *
 * @param bootstrapHandle handle the consumer is built on; always closed before returning
 * @param sourceFilePath  Parquet file to read
 * @param keyGenerator    produces the record key for each record that is read
 * @param partitionPath   partition path placed in every generated {@link HoodieKey}
 * @param avroSchema      not referenced by this implementation
 * @throws HoodieException wrapping any failure raised while building the reader or executing
 * @throws Exception       if releasing one of the resources fails during cleanup
 */
@Override
void executeBootstrap(HoodieBootstrapHandle<?, ?, ?, ?> bootstrapHandle, Path sourceFilePath, KeyGeneratorInterface keyGenerator, String partitionPath, Schema avroSchema) throws Exception {
    BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void> wrapper = null;
    // Tracked outside the try block so the reader can be released in the finally block.
    ParquetReader<IndexedRecord> reader = null;
    try {
        reader = AvroParquetReader.<IndexedRecord>builder(sourceFilePath).withConf(table.getHadoopConf()).build();
        wrapper = new BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void>(config.getWriteBufferLimitBytes(), new ParquetReaderIterator(reader), new BootstrapRecordConsumer(bootstrapHandle), inp -> {
            String recKey = keyGenerator.getKey(inp).getRecordKey();
            // The payload record carries nothing but the record key field.
            GenericRecord gr = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA);
            gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey);
            BootstrapRecordPayload payload = new BootstrapRecordPayload(gr);
            return new HoodieAvroRecord(new HoodieKey(recKey, partitionPath), payload);
        }, table.getPreExecuteRunnable());
        wrapper.execute();
    } catch (Exception e) {
        throw new HoodieException(e);
    } finally {
        // Nested finally blocks: a failure in one cleanup step must not skip the remaining ones.
        try {
            bootstrapHandle.close();
        } finally {
            try {
                if (null != wrapper) {
                    wrapper.shutdownNow();
                }
            } finally {
                // Close the reader last, after the executor has been told to stop.
                if (null != reader) {
                    reader.close();
                }
            }
        }
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) HoodieException(org.apache.hudi.exception.HoodieException) ParquetMetadataConverter(org.apache.parquet.format.converter.ParquetMetadataConverter) KeyGeneratorInterface(org.apache.hudi.keygen.KeyGeneratorInterface) GenericData(org.apache.avro.generic.GenericData) HoodieBootstrapHandle(org.apache.hudi.io.HoodieBootstrapHandle) Logger(org.apache.log4j.Logger) ParquetReaderIterator(org.apache.hudi.common.util.ParquetReaderIterator) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Path(org.apache.hadoop.fs.Path) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ParquetReader(org.apache.parquet.hadoop.ParquetReader) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) MessageType(org.apache.parquet.schema.MessageType) BootstrapRecordPayload(org.apache.hudi.client.bootstrap.BootstrapRecordPayload) AvroParquetReader(org.apache.parquet.avro.AvroParquetReader) HoodieKey(org.apache.hudi.common.model.HoodieKey) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) LogManager(org.apache.log4j.LogManager) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) BootstrapRecordPayload(org.apache.hudi.client.bootstrap.BootstrapRecordPayload) HoodieException(org.apache.hudi.exception.HoodieException) ParquetReaderIterator(org.apache.hudi.common.util.ParquetReaderIterator) GenericData(org.apache.avro.generic.GenericData) 
HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 2 with ParquetReaderIterator

use of org.apache.hudi.common.util.ParquetReaderIterator in project hudi by apache.

In class TestBootstrap, method generateInputBatch:

/**
 * Builds an RDD of records from the given bootstrap source Parquet files.
 *
 * <p>Every file listed under every partition is read with {@code writerSchema} set as the Avro
 * read schema on the Spark context's Hadoop configuration. Each row becomes a
 * {@link HoodieAvroRecord} keyed by the row's {@code _row_key} field and the partition the file
 * was listed under, with a {@link RawTripTestPayload} built from the row's string form.
 * Files are read eagerly, one at a time, and each file's reader is closed once it has been
 * drained (or has failed).
 *
 * @param jsc            Spark context supplying the Hadoop configuration and the resulting RDD
 * @param partitionPaths pairs of partition path and the source files listed under it
 * @param writerSchema   Avro schema set as the read schema for the Parquet readers
 * @return an RDD holding one record per row, in file listing order
 * @throws HoodieIOException if a file cannot be opened or a payload cannot be created
 */
private static JavaRDD<HoodieRecord> generateInputBatch(JavaSparkContext jsc, List<Pair<String, List<HoodieFileStatus>>> partitionPaths, Schema writerSchema) {
    List<Pair<String, Path>> fullFilePathsWithPartition = partitionPaths.stream().flatMap(p -> p.getValue().stream().map(x -> Pair.of(p.getKey(), FileStatusUtils.toPath(x.getPath())))).collect(Collectors.toList());
    List<HoodieRecord> records = new ArrayList<>();
    for (Pair<String, Path> p : fullFilePathsWithPartition) {
        String pPath = p.getKey();
        try {
            Configuration conf = jsc.hadoopConfiguration();
            AvroReadSupport.setAvroReadSchema(conf, writerSchema);
            ParquetReaderIterator<GenericRecord> recIterator = new ParquetReaderIterator<>(AvroParquetReader.<GenericRecord>builder(p.getValue()).withConf(conf).build());
            try {
                while (recIterator.hasNext()) {
                    GenericRecord gr = recIterator.next();
                    String key = gr.get("_row_key").toString();
                    records.add(new HoodieAvroRecord<>(new HoodieKey(key, pPath), new RawTripTestPayload(gr.toString(), key, pPath, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA)));
                }
            } finally {
                // Release the underlying Parquet reader; otherwise one file stays open per input.
                recIterator.close();
            }
        } catch (IOException ioe) {
            throw new HoodieIOException(ioe.getMessage(), ioe);
        }
    }
    return jsc.parallelize(records);
}
Also used : BootstrapUtils(org.apache.hudi.table.action.bootstrap.BootstrapUtils) BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieMergeOnReadTestUtils(org.apache.hudi.testutils.HoodieMergeOnReadTestUtils) Arrays(java.util.Arrays) BootstrapMode(org.apache.hudi.client.bootstrap.BootstrapMode) Spliterators(java.util.Spliterators) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Random(java.util.Random) HoodieParquetRealtimeInputFormat(org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat) DataSourceWriteOptions(org.apache.hudi.DataSourceWriteOptions) LongWritable(org.apache.hadoop.io.LongWritable) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) NonpartitionedKeyGenerator(org.apache.hudi.keygen.NonpartitionedKeyGenerator) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) Tag(org.junit.jupiter.api.Tag) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) DataTypes(org.apache.spark.sql.types.DataTypes) Schema(org.apache.avro.Schema) IndexType(org.apache.hudi.index.HoodieIndex.IndexType) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) Set(java.util.Set) MetadataOnlyBootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector) SimpleKeyGenerator(org.apache.hudi.keygen.SimpleKeyGenerator) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) AvroParquetReader(org.apache.parquet.avro.AvroParquetReader) TempDir(org.junit.jupiter.api.io.TempDir) 
Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) FullRecordBootstrapDataProvider(org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider) HoodieClientTestBase(org.apache.hudi.testutils.HoodieClientTestBase) IntStream(java.util.stream.IntStream) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) AvroReadSupport(org.apache.parquet.avro.AvroReadSupport) Dataset(org.apache.spark.sql.Dataset) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) ParquetReaderIterator(org.apache.hudi.common.util.ParquetReaderIterator) Collectors.mapping(java.util.stream.Collectors.mapping) StreamSupport(java.util.stream.StreamSupport) HoodieParquetInputFormat(org.apache.hudi.hadoop.HoodieParquetInputFormat) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) FullRecordBootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) SparkSession(org.apache.spark.sql.SparkSession) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) SaveMode(org.apache.spark.sql.SaveMode) BootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) Column(org.apache.spark.sql.Column) SQLContext(org.apache.spark.sql.SQLContext) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) 
HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) JobConf(org.apache.hadoop.mapred.JobConf) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) AfterEach(org.junit.jupiter.api.AfterEach) Collectors.toList(java.util.stream.Collectors.toList) UDF1(org.apache.spark.sql.api.java.UDF1) HoodieKey(org.apache.hudi.common.model.HoodieKey) FileStatusUtils(org.apache.hudi.common.bootstrap.FileStatusUtils) HoodieIOException(org.apache.hudi.exception.HoodieIOException) PartitionPathEncodeUtils(org.apache.hudi.common.util.PartitionPathEncodeUtils) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) org.apache.spark.sql.functions.callUDF(org.apache.spark.sql.functions.callUDF) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieBootstrapConfig(org.apache.hudi.config.HoodieBootstrapConfig) Configuration(org.apache.hadoop.conf.Configuration) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) ParquetReaderIterator(org.apache.hudi.common.util.ParquetReaderIterator) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) GenericRecord(org.apache.avro.generic.GenericRecord) Pair(org.apache.hudi.common.util.collection.Pair)

Aggregations

IOException (java.io.IOException)2 Schema (org.apache.avro.Schema)2 GenericRecord (org.apache.avro.generic.GenericRecord)2 Path (org.apache.hadoop.fs.Path)2 HoodieFileStatus (org.apache.hudi.avro.model.HoodieFileStatus)2 HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord)2 HoodieKey (org.apache.hudi.common.model.HoodieKey)2 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)2 Instant (java.time.Instant)1 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 HashSet (java.util.HashSet)1 Iterator (java.util.Iterator)1 List (java.util.List)1 Map (java.util.Map)1 Random (java.util.Random)1 Set (java.util.Set)1 Spliterators (java.util.Spliterators)1 Collectors (java.util.stream.Collectors)1 Collectors.mapping (java.util.stream.Collectors.mapping)1