Search in sources :

Example 1 with KeyGeneratorInterface

Use of org.apache.hudi.keygen.KeyGeneratorInterface in project hudi by apache.

In class OrcBootstrapMetadataHandler, method executeBootstrap:

/**
 * Bootstraps record-key metadata for one source ORC file.
 *
 * <p>Rows are read from {@code sourceFilePath} using the file's own ORC schema, and each row is
 * converted into a {@link HoodieRecord} whose payload holds only the record key produced by
 * {@code keyGenerator}. The resulting records are handed to a {@code BootstrapRecordConsumer}
 * wrapping {@code bootstrapHandle}.
 *
 * @param bootstrapHandle handle that receives the generated records; always closed before returning
 * @param sourceFilePath  path of the ORC file to read
 * @param keyGenerator    generator used to derive the record key of every row
 * @param partitionPath   partition path stored in the {@link HoodieKey} of every generated record
 * @param avroSchema      Avro schema passed to the {@code OrcReaderIterator}
 * @throws HoodieException wrapping any exception raised while reading or executing
 */
@Override
void executeBootstrap(HoodieBootstrapHandle<?, ?, ?, ?> bootstrapHandle, Path sourceFilePath, KeyGeneratorInterface keyGenerator, String partitionPath, Schema avroSchema) throws Exception {
    Reader orcFileReader = OrcFile.createReader(sourceFilePath, OrcFile.readerOptions(table.getHadoopConf()));
    TypeDescription fileSchema = orcFileReader.getSchema();
    BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void> executor = null;
    try (RecordReader rowReader = orcFileReader.rows(new Reader.Options(table.getHadoopConf()).schema(fileSchema))) {
        executor = new BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void>(
            config.getWriteBufferLimitBytes(),
            new OrcReaderIterator(rowReader, avroSchema, fileSchema),
            new BootstrapRecordConsumer(bootstrapHandle),
            sourceRecord -> {
                // Only the record key is carried forward; the payload is a single-field record.
                String recordKey = keyGenerator.getKey(sourceRecord).getRecordKey();
                GenericRecord keyOnlyRecord = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA);
                keyOnlyRecord.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recordKey);
                BootstrapRecordPayload keyOnlyPayload = new BootstrapRecordPayload(keyOnlyRecord);
                return new HoodieAvroRecord(new HoodieKey(recordKey, partitionPath), keyOnlyPayload);
            },
            table.getPreExecuteRunnable());
        executor.execute();
    } catch (Exception e) {
        throw new HoodieException(e);
    } finally {
        bootstrapHandle.close();
        // The executor is null when its construction (or anything before it) failed.
        if (executor != null) {
            executor.shutdownNow();
        }
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) AvroOrcUtils(org.apache.hudi.common.util.AvroOrcUtils) OrcReaderIterator(org.apache.hudi.common.util.OrcReaderIterator) HoodieException(org.apache.hudi.exception.HoodieException) KeyGeneratorInterface(org.apache.hudi.keygen.KeyGeneratorInterface) OrcFile(org.apache.orc.OrcFile) GenericData(org.apache.avro.generic.GenericData) HoodieBootstrapHandle(org.apache.hudi.io.HoodieBootstrapHandle) Logger(org.apache.log4j.Logger) Reader(org.apache.orc.Reader) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Path(org.apache.hadoop.fs.Path) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) RecordReader(org.apache.orc.RecordReader) TypeDescription(org.apache.orc.TypeDescription) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) BootstrapRecordPayload(org.apache.hudi.client.bootstrap.BootstrapRecordPayload) HoodieKey(org.apache.hudi.common.model.HoodieKey) LogManager(org.apache.log4j.LogManager) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) BootstrapRecordPayload(org.apache.hudi.client.bootstrap.BootstrapRecordPayload) RecordReader(org.apache.orc.RecordReader) Reader(org.apache.orc.Reader) RecordReader(org.apache.orc.RecordReader) HoodieException(org.apache.hudi.exception.HoodieException) GenericData(org.apache.avro.generic.GenericData) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) OrcReaderIterator(org.apache.hudi.common.util.OrcReaderIterator) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) TypeDescription(org.apache.orc.TypeDescription) 
GenericRecord(org.apache.avro.generic.GenericRecord)

Example 2 with KeyGeneratorInterface

Use of org.apache.hudi.keygen.KeyGeneratorInterface in project hudi by apache.

In class ParquetBootstrapMetadataHandler, method executeBootstrap:

/**
 * Bootstraps record-key metadata for one source Parquet file.
 *
 * <p>Records are read from {@code sourceFilePath}, and each one is converted into a
 * {@link HoodieRecord} whose payload holds only the record key produced by {@code keyGenerator}.
 * The resulting records are handed to a {@code BootstrapRecordConsumer} wrapping
 * {@code bootstrapHandle}.
 *
 * @param bootstrapHandle handle that receives the generated records; always closed before returning
 * @param sourceFilePath  path of the Parquet file to read
 * @param keyGenerator    generator used to derive the record key of every record
 * @param partitionPath   partition path stored in the {@link HoodieKey} of every generated record
 * @param avroSchema      not used by this implementation
 * @throws HoodieException wrapping any exception raised while reading or executing
 */
@Override
void executeBootstrap(HoodieBootstrapHandle<?, ?, ?, ?> bootstrapHandle, Path sourceFilePath, KeyGeneratorInterface keyGenerator, String partitionPath, Schema avroSchema) throws Exception {
    BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void> wrapper = null;
    // The reader is opened in try-with-resources so it is closed on every path; previously it was
    // never closed. This mirrors how the ORC handler manages its RecordReader.
    try (ParquetReader<IndexedRecord> reader = AvroParquetReader.<IndexedRecord>builder(sourceFilePath).withConf(table.getHadoopConf()).build()) {
        wrapper = new BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void>(config.getWriteBufferLimitBytes(), new ParquetReaderIterator(reader), new BootstrapRecordConsumer(bootstrapHandle), inp -> {
            // Only the record key is carried forward; the payload is a single-field record.
            String recKey = keyGenerator.getKey(inp).getRecordKey();
            GenericRecord gr = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA);
            gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey);
            BootstrapRecordPayload payload = new BootstrapRecordPayload(gr);
            return new HoodieAvroRecord(new HoodieKey(recKey, partitionPath), payload);
        }, table.getPreExecuteRunnable());
        wrapper.execute();
    } catch (Exception e) {
        throw new HoodieException(e);
    } finally {
        bootstrapHandle.close();
        // The executor is null when its construction (or opening the reader) failed.
        if (null != wrapper) {
            wrapper.shutdownNow();
        }
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) HoodieException(org.apache.hudi.exception.HoodieException) ParquetMetadataConverter(org.apache.parquet.format.converter.ParquetMetadataConverter) KeyGeneratorInterface(org.apache.hudi.keygen.KeyGeneratorInterface) GenericData(org.apache.avro.generic.GenericData) HoodieBootstrapHandle(org.apache.hudi.io.HoodieBootstrapHandle) Logger(org.apache.log4j.Logger) ParquetReaderIterator(org.apache.hudi.common.util.ParquetReaderIterator) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Path(org.apache.hadoop.fs.Path) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ParquetReader(org.apache.parquet.hadoop.ParquetReader) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) MessageType(org.apache.parquet.schema.MessageType) BootstrapRecordPayload(org.apache.hudi.client.bootstrap.BootstrapRecordPayload) AvroParquetReader(org.apache.parquet.avro.AvroParquetReader) HoodieKey(org.apache.hudi.common.model.HoodieKey) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) LogManager(org.apache.log4j.LogManager) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) BootstrapRecordPayload(org.apache.hudi.client.bootstrap.BootstrapRecordPayload) HoodieException(org.apache.hudi.exception.HoodieException) ParquetReaderIterator(org.apache.hudi.common.util.ParquetReaderIterator) GenericData(org.apache.avro.generic.GenericData) 
HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 3 with KeyGeneratorInterface

Use of org.apache.hudi.keygen.KeyGeneratorInterface in project hudi by apache.

In class SparkBootstrapCommitActionExecutor, method runMetadataBootstrap:

/**
 * Runs the metadata bootstrap over the given partitions and their files.
 *
 * <p>Every file is paired with its source partition and the partition path returned by the
 * configured {@link BootstrapPartitionPathTranslator}. The resulting list is parallelized with
 * the configured bootstrap parallelism, and each entry is processed by the metadata handler
 * selected for that file.
 *
 * @param partitions pairs of partition path and the files found under it; may be {@code null}
 * @return the write statuses produced per file, or empty data when {@code partitions} is
 *         {@code null} or empty
 * @throws HoodieKeyGeneratorException if the key generator cannot be created
 */
private HoodieData<BootstrapWriteStatus> runMetadataBootstrap(List<Pair<String, List<HoodieFileStatus>>> partitions) {
    boolean nothingToBootstrap = partitions == null || partitions.isEmpty();
    if (nothingToBootstrap) {
        return context.emptyHoodieData();
    }

    TypedProperties props = new TypedProperties();
    props.putAll(config.getProps());

    KeyGeneratorInterface recordKeyGenerator;
    try {
        recordKeyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(props);
    } catch (IOException e) {
        throw new HoodieKeyGeneratorException("Init keyGenerator failed ", e);
    }

    BootstrapPartitionPathTranslator pathTranslator =
        (BootstrapPartitionPathTranslator) ReflectionUtils.loadClass(config.getBootstrapPartitionPathTranslatorClass(), props);

    // Flatten to one entry per file: (source partition, (translated partition path, file status)).
    List<Pair<String, Pair<String, HoodieFileStatus>>> filesToBootstrap = partitions.stream()
        .flatMap(partition -> {
            String sourcePartition = partition.getKey();
            String translatedPartition = pathTranslator.getBootstrapTranslatedPath(sourcePartition);
            return partition.getValue().stream()
                .map(fileStatus -> Pair.of(sourcePartition, Pair.of(translatedPartition, fileStatus)));
        })
        .collect(Collectors.toList());

    context.setJobStatus(this.getClass().getSimpleName(), "Bootstrap metadata table.");
    return context.parallelize(filesToBootstrap, config.getBootstrapParallelism())
        .map(entry -> {
            Pair<String, HoodieFileStatus> translatedPathAndFile = entry.getRight();
            return getMetadataHandler(config, table, translatedPathAndFile.getRight())
                .runMetadataBootstrap(entry.getLeft(), translatedPathAndFile.getLeft(), recordKeyGenerator);
        });
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) BootstrapMode(org.apache.hudi.client.bootstrap.BootstrapMode) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) BaseCommitActionExecutor(org.apache.hudi.table.action.commit.BaseCommitActionExecutor) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) KeyGeneratorInterface(org.apache.hudi.keygen.KeyGeneratorInterface) Logger(org.apache.log4j.Logger) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Duration(java.time.Duration) Map(java.util.Map) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieSparkKeyGeneratorFactory(org.apache.hudi.keygen.factory.HoodieSparkKeyGeneratorFactory) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) BootstrapPartitionPathTranslator(org.apache.hudi.client.bootstrap.translator.BootstrapPartitionPathTranslator) Collection(java.util.Collection) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) List(java.util.List) WRITE_STATUS_STORAGE_LEVEL_VALUE(org.apache.hudi.config.HoodieWriteConfig.WRITE_STATUS_STORAGE_LEVEL_VALUE) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) BootstrapWriteStatus(org.apache.hudi.client.bootstrap.BootstrapWriteStatus) SparkBulkInsertCommitActionExecutor(org.apache.hudi.table.action.commit.SparkBulkInsertCommitActionExecutor) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) ReflectionUtils(org.apache.hudi.common.util.ReflectionUtils) FullRecordBootstrapDataProvider(org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider) MetadataBootstrapHandlerFactory.getMetadataHandler(org.apache.hudi.table.action.bootstrap.MetadataBootstrapHandlerFactory.getMetadataHandler) 
HoodieBootstrapSchemaProvider(org.apache.hudi.client.bootstrap.HoodieBootstrapSchemaProvider) Option(org.apache.hudi.common.util.Option) HoodieCommitException(org.apache.hudi.exception.HoodieCommitException) HoodieKeyGeneratorException(org.apache.hudi.exception.HoodieKeyGeneratorException) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) BaseSparkCommitActionExecutor(org.apache.hudi.table.action.commit.BaseSparkCommitActionExecutor) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) BootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector) HoodieData(org.apache.hudi.common.data.HoodieData) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) WriteMarkersFactory(org.apache.hudi.table.marker.WriteMarkersFactory) Iterator(java.util.Iterator) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieSparkBootstrapSchemaProvider(org.apache.hudi.client.bootstrap.HoodieSparkBootstrapSchemaProvider) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) SparkValidatorUtils(org.apache.hudi.client.utils.SparkValidatorUtils) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) 
KeyGeneratorInterface(org.apache.hudi.keygen.KeyGeneratorInterface) HoodieKeyGeneratorException(org.apache.hudi.exception.HoodieKeyGeneratorException) BootstrapPartitionPathTranslator(org.apache.hudi.client.bootstrap.translator.BootstrapPartitionPathTranslator) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) TypedProperties(org.apache.hudi.common.config.TypedProperties) Pair(org.apache.hudi.common.util.collection.Pair)

Aggregations

IOException (java.io.IOException)3 HoodieFileStatus (org.apache.hudi.avro.model.HoodieFileStatus)3 HoodieKey (org.apache.hudi.common.model.HoodieKey)3 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)3 Schema (org.apache.avro.Schema)2 GenericData (org.apache.avro.generic.GenericData)2 GenericRecord (org.apache.avro.generic.GenericRecord)2 Path (org.apache.hadoop.fs.Path)2 HoodieAvroUtils (org.apache.hudi.avro.HoodieAvroUtils)2 BootstrapRecordPayload (org.apache.hudi.client.bootstrap.BootstrapRecordPayload)2 HoodieAvroRecord (org.apache.hudi.common.model.HoodieAvroRecord)2 BoundedInMemoryExecutor (org.apache.hudi.common.util.queue.BoundedInMemoryExecutor)2 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)2 HoodieException (org.apache.hudi.exception.HoodieException)2 HoodieBootstrapHandle (org.apache.hudi.io.HoodieBootstrapHandle)2 KeyGeneratorInterface (org.apache.hudi.keygen.KeyGeneratorInterface)2 HoodieTable (org.apache.hudi.table.HoodieTable)2 LogManager (org.apache.log4j.LogManager)2 Logger (org.apache.log4j.Logger)2 StandardCharsets (java.nio.charset.StandardCharsets)1