Search in sources:

Example 1 with OrcReaderIterator

Use of org.apache.hudi.common.util.OrcReaderIterator in the Apache Hudi project.

From the test class TestOrcBootstrap, method generateInputBatch:

private static JavaRDD<HoodieRecord> generateInputBatch(JavaSparkContext jsc, List<Pair<String, List<HoodieFileStatus>>> partitionPaths, Schema writerSchema) {
    // Expand each partition into (partitionPath, filePath) pairs, one per bootstrap source file.
    List<Pair<String, Path>> fullFilePathsWithPartition = partitionPaths.stream()
            .flatMap(p -> p.getValue().stream().map(x -> Pair.of(p.getKey(), FileStatusUtils.toPath(x.getPath()))))
            .collect(Collectors.toList());
    return jsc.parallelize(fullFilePathsWithPartition.stream().flatMap(p -> {
        try {
            Configuration conf = jsc.hadoopConfiguration();
            AvroReadSupport.setAvroReadSchema(conf, writerSchema);
            // Open the ORC file and derive an Avro schema from its ORC schema.
            Reader orcReader = OrcFile.createReader(p.getValue(), new OrcFile.ReaderOptions(jsc.hadoopConfiguration()));
            RecordReader recordReader = orcReader.rows();
            TypeDescription orcSchema = orcReader.getSchema();
            Schema avroSchema = AvroOrcUtils.createAvroSchemaWithDefaultValue(orcSchema, "test_orc_record", null, true);
            // OrcReaderIterator exposes the ORC rows as Avro GenericRecords.
            Iterator<GenericRecord> recIterator = new OrcReaderIterator(recordReader, avroSchema, orcSchema);
            return StreamSupport.stream(Spliterators.spliteratorUnknownSize(recIterator, 0), false).map(gr -> {
                try {
                    // Build a HoodieRecord keyed by the _row_key field and the partition path.
                    String key = gr.get("_row_key").toString();
                    String pPath = p.getKey();
                    return new HoodieAvroRecord<>(new HoodieKey(key, pPath), new RawTripTestPayload(gr.toString(), key, pPath, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA));
                } catch (IOException e) {
                    throw new HoodieIOException(e.getMessage(), e);
                }
            });
        } catch (IOException ioe) {
            throw new HoodieIOException(ioe.getMessage(), ioe);
        }
    }).collect(Collectors.toList()));
}
Also used : BootstrapUtils(org.apache.hudi.table.action.bootstrap.BootstrapUtils) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) BootstrapMode(org.apache.hudi.client.bootstrap.BootstrapMode) Spliterators(java.util.Spliterators) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Random(java.util.Random) HoodieParquetRealtimeInputFormat(org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat) DataSourceWriteOptions(org.apache.hudi.DataSourceWriteOptions) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) NonpartitionedKeyGenerator(org.apache.hudi.keygen.NonpartitionedKeyGenerator) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) Tag(org.junit.jupiter.api.Tag) DataTypes(org.apache.spark.sql.types.DataTypes) Schema(org.apache.avro.Schema) IndexType(org.apache.hudi.index.HoodieIndex.IndexType) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) MetadataOnlyBootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector) SimpleKeyGenerator(org.apache.hudi.keygen.SimpleKeyGenerator) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) List(java.util.List) TempDir(org.junit.jupiter.api.io.TempDir) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) FullRecordBootstrapDataProvider(org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider) HoodieClientTestBase(org.apache.hudi.testutils.HoodieClientTestBase) IntStream(java.util.stream.IntStream) AvroReadSupport(org.apache.parquet.avro.AvroReadSupport) AvroOrcUtils(org.apache.hudi.common.util.AvroOrcUtils) Dataset(org.apache.spark.sql.Dataset) OrcReaderIterator(org.apache.hudi.common.util.OrcReaderIterator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) OrcFile(org.apache.orc.OrcFile) ArrayList(java.util.ArrayList) Reader(org.apache.orc.Reader) Collectors.mapping(java.util.stream.Collectors.mapping) StreamSupport(java.util.stream.StreamSupport) HoodieParquetInputFormat(org.apache.hudi.hadoop.HoodieParquetInputFormat) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) FullRecordBootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) SaveMode(org.apache.spark.sql.SaveMode) BootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) Column(org.apache.spark.sql.Column) RecordReader(org.apache.orc.RecordReader) SQLContext(org.apache.spark.sql.SQLContext) TypeDescription(org.apache.orc.TypeDescription) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) 
HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) FileCreateUtils(org.apache.hudi.common.testutils.FileCreateUtils) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) JobConf(org.apache.hadoop.mapred.JobConf) AfterEach(org.junit.jupiter.api.AfterEach) Collectors.toList(java.util.stream.Collectors.toList) UDF1(org.apache.spark.sql.api.java.UDF1) HoodieKey(org.apache.hudi.common.model.HoodieKey) FileStatusUtils(org.apache.hudi.common.bootstrap.FileStatusUtils) HoodieIOException(org.apache.hudi.exception.HoodieIOException) PartitionPathEncodeUtils(org.apache.hudi.common.util.PartitionPathEncodeUtils) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) org.apache.spark.sql.functions.callUDF(org.apache.spark.sql.functions.callUDF) Pair(org.apache.hudi.common.util.collection.Pair) HoodieBootstrapConfig(org.apache.hudi.config.HoodieBootstrapConfig)
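
Stripped of the Spark and test scaffolding, the essential pattern in this test is: open an ORC file, derive an Avro schema from its ORC schema, and wrap the ORC RecordReader in an OrcReaderIterator. A minimal standalone sketch of that pattern follows; the method name readOrcFileAsAvro and the record name "example_record" are illustrative, not part of Hudi, and the same Hudi/ORC classes listed above are assumed on the classpath.

private static List<GenericRecord> readOrcFileAsAvro(Configuration conf, Path orcFilePath) throws IOException {
    // Open the ORC file and derive an Avro schema from its ORC schema (record name is illustrative).
    Reader orcReader = OrcFile.createReader(orcFilePath, new OrcFile.ReaderOptions(conf));
    TypeDescription orcSchema = orcReader.getSchema();
    Schema avroSchema = AvroOrcUtils.createAvroSchemaWithDefaultValue(orcSchema, "example_record", null, true);
    List<GenericRecord> records = new ArrayList<>();
    // OrcReaderIterator turns the ORC rows into Avro GenericRecords.
    try (RecordReader rows = orcReader.rows()) {
        Iterator<GenericRecord> it = new OrcReaderIterator(rows, avroSchema, orcSchema);
        while (it.hasNext()) {
            records.add(it.next());
        }
    }
    return records;
}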

Example 2 with OrcReaderIterator

Use of org.apache.hudi.common.util.OrcReaderIterator in the Apache Hudi project.

From the class OrcBootstrapMetadataHandler, method executeBootstrap:

@Override
void executeBootstrap(HoodieBootstrapHandle<?, ?, ?, ?> bootstrapHandle, Path sourceFilePath, KeyGeneratorInterface keyGenerator, String partitionPath, Schema avroSchema) throws Exception {
    BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void> wrapper = null;
    // Open the bootstrap source ORC file and read it with its own schema.
    Reader orcReader = OrcFile.createReader(sourceFilePath, OrcFile.readerOptions(table.getHadoopConf()));
    TypeDescription orcSchema = orcReader.getSchema();
    try (RecordReader reader = orcReader.rows(new Reader.Options(table.getHadoopConf()).schema(orcSchema))) {
        // Producer: OrcReaderIterator streams the source rows as Avro records.
        // Consumer: BootstrapRecordConsumer writes them through the bootstrap handle.
        // Transformer: keep only the record key, so the skeleton file stays metadata-only.
        wrapper = new BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void>(config.getWriteBufferLimitBytes(), new OrcReaderIterator(reader, avroSchema, orcSchema), new BootstrapRecordConsumer(bootstrapHandle), inp -> {
            String recKey = keyGenerator.getKey(inp).getRecordKey();
            GenericRecord gr = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA);
            gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey);
            BootstrapRecordPayload payload = new BootstrapRecordPayload(gr);
            HoodieRecord rec = new HoodieAvroRecord(new HoodieKey(recKey, partitionPath), payload);
            return rec;
        }, table.getPreExecuteRunnable());
        wrapper.execute();
    } catch (Exception e) {
        throw new HoodieException(e);
    } finally {
        bootstrapHandle.close();
        if (null != wrapper) {
            wrapper.shutdownNow();
        }
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) AvroOrcUtils(org.apache.hudi.common.util.AvroOrcUtils) OrcReaderIterator(org.apache.hudi.common.util.OrcReaderIterator) HoodieException(org.apache.hudi.exception.HoodieException) KeyGeneratorInterface(org.apache.hudi.keygen.KeyGeneratorInterface) OrcFile(org.apache.orc.OrcFile) GenericData(org.apache.avro.generic.GenericData) HoodieBootstrapHandle(org.apache.hudi.io.HoodieBootstrapHandle) Logger(org.apache.log4j.Logger) Reader(org.apache.orc.Reader) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Path(org.apache.hadoop.fs.Path) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) RecordReader(org.apache.orc.RecordReader) TypeDescription(org.apache.orc.TypeDescription) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) BootstrapRecordPayload(org.apache.hudi.client.bootstrap.BootstrapRecordPayload) HoodieKey(org.apache.hudi.common.model.HoodieKey) LogManager(org.apache.log4j.LogManager) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor)
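
The producer/consumer wiring above is the core of metadata-only bootstrap: OrcReaderIterator produces the source rows, the inline lambda reduces each row to its record key, and BootstrapRecordConsumer writes the result through the bootstrap handle. For readability, the transformer lambda could be pulled out into a helper like the hypothetical toMetadataOnlyRecord below; this is a sketch of the same logic, not additional Hudi API.

// Illustrative helper equivalent to the transformer lambda above (the method name is not part of Hudi).
private static HoodieRecord toMetadataOnlyRecord(GenericRecord sourceRecord, KeyGeneratorInterface keyGenerator, String partitionPath) {
    // Extract the record key from the full source record.
    String recordKey = keyGenerator.getKey(sourceRecord).getRecordKey();
    // Build a key-only Avro record so the bootstrap skeleton file carries no data columns.
    GenericRecord keyOnlyRecord = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA);
    keyOnlyRecord.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recordKey);
    return new HoodieAvroRecord(new HoodieKey(recordKey, partitionPath), new BootstrapRecordPayload(keyOnlyRecord));
}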

Example 3 with OrcReaderIterator

Use of org.apache.hudi.common.util.OrcReaderIterator in the Apache Hudi project.

From the class HoodieOrcReader, method getRecordIterator:

@Override
public Iterator<R> getRecordIterator(Schema schema) throws IOException {
    try {
        // Open the ORC base file and translate the requested Avro schema into an ORC schema.
        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
        TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(schema);
        RecordReader recordReader = reader.rows(new Options(conf).schema(orcSchema));
        // OrcReaderIterator converts each ORC row back into an Avro record of the requested schema.
        return new OrcReaderIterator(recordReader, schema, orcSchema);
    } catch (IOException io) {
        throw new HoodieIOException("Unable to create an ORC reader.", io);
    }
}
Also used : Options(org.apache.orc.Reader.Options) HoodieIOException(org.apache.hudi.exception.HoodieIOException) OrcReaderIterator(org.apache.hudi.common.util.OrcReaderIterator) RecordReader(org.apache.orc.RecordReader) Reader(org.apache.orc.Reader) TypeDescription(org.apache.orc.TypeDescription) IOException(java.io.IOException)
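
On the consuming side, callers simply iterate over the Iterator<R> returned by getRecordIterator; in Hudi this reader is typically obtained through the file reader factory for ORC base files. A minimal sketch follows, assuming a HoodieOrcReader<GenericRecord> instance and a reader schema obtained elsewhere; the helper name countRecords is illustrative.

// Illustrative helper (not part of Hudi): drains the record iterator and counts the rows read.
private static long countRecords(HoodieOrcReader<GenericRecord> reader, Schema readerSchema) throws IOException {
    Iterator<GenericRecord> it = reader.getRecordIterator(readerSchema);
    long count = 0;
    while (it.hasNext()) {
        it.next();
        count++;
    }
    return count;
}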

Aggregations

IOException (java.io.IOException): 3
OrcReaderIterator (org.apache.hudi.common.util.OrcReaderIterator): 3
Reader (org.apache.orc.Reader): 3
RecordReader (org.apache.orc.RecordReader): 3
TypeDescription (org.apache.orc.TypeDescription): 3
Schema (org.apache.avro.Schema): 2
GenericRecord (org.apache.avro.generic.GenericRecord): 2
Path (org.apache.hadoop.fs.Path): 2
HoodieFileStatus (org.apache.hudi.avro.model.HoodieFileStatus): 2
HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord): 2
HoodieKey (org.apache.hudi.common.model.HoodieKey): 2
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 2
AvroOrcUtils (org.apache.hudi.common.util.AvroOrcUtils): 2
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 2
OrcFile (org.apache.orc.OrcFile): 2
Instant (java.time.Instant): 1
ArrayList (java.util.ArrayList): 1
Arrays (java.util.Arrays): 1
Iterator (java.util.Iterator): 1
List (java.util.List): 1