Search in sources :

Example 1 with BoundedInMemoryExecutor

use of org.apache.hudi.common.util.queue.BoundedInMemoryExecutor in project hudi by apache.

The method runMerge of the class HoodieMergeHelper.

/**
 * Streams the records of the merge handle's old base file through a
 * {@link BoundedInMemoryExecutor} into an {@code UpdateHandler} wrapping {@code mergeHandle}.
 *
 * <p>When external schema transformation is enabled, or the base file has a bootstrap base file,
 * records are read with the schema stored in the old file; otherwise the merge handle's writer
 * schema (with meta fields) is used. With external schema transformation enabled, every record is
 * additionally rewritten via {@code transformRecordBasedOnNewSchema} before being handed to the
 * consumer.
 *
 * @param table       table supplying the write config, Hadoop configuration and pre-execute runnable
 * @param mergeHandle handle describing the old file and receiving the merged records; it is closed
 *                    before this method returns from the merge phase
 * @throws IOException     declared by the overridden method
 * @throws HoodieException wrapping any exception raised while setting up or running the executor
 */
@Override
public void runMerge(HoodieTable<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> table, HoodieMergeHandle<T, HoodieData<HoodieRecord<T>>, HoodieData<HoodieKey>, HoodieData<WriteStatus>> mergeHandle) throws IOException {
    final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation();
    Configuration cfgForHoodieFile = new Configuration(table.getHadoopConf());
    HoodieBaseFile baseFile = mergeHandle.baseFileForMerge();
    final GenericDatumWriter<GenericRecord> gWriter;
    final GenericDatumReader<GenericRecord> gReader;
    Schema readSchema;
    if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) {
        // This reader exists only to fetch the schema stored in the old file; close it right away
        // so it is not leaked (the original dropped the reference without closing it).
        HoodieFileReader<GenericRecord> schemaReader = HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), mergeHandle.getOldFilePath());
        try {
            readSchema = schemaReader.getSchema();
        } finally {
            schemaReader.close();
        }
        gWriter = new GenericDatumWriter<>(readSchema);
        gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetaFields());
    } else {
        gReader = null;
        gWriter = null;
        readSchema = mergeHandle.getWriterSchemaWithMetaFields();
    }
    BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
    HoodieFileReader<GenericRecord> reader = HoodieFileReaderFactory.getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath());
    try {
        final Iterator<GenericRecord> readerIterator;
        if (baseFile.getBootstrapBaseFile().isPresent()) {
            readerIterator = getMergingIterator(table, mergeHandle, baseFile, reader, readSchema, externalSchemaTransformation);
        } else {
            readerIterator = reader.getRecordIterator(readSchema);
        }
        // Encoder/decoder caches are handed to the transform so it can reuse them per thread.
        ThreadLocal<BinaryEncoder> encoderCache = new ThreadLocal<>();
        ThreadLocal<BinaryDecoder> decoderCache = new ThreadLocal<>();
        wrapper = new BoundedInMemoryExecutor(table.getConfig().getWriteBufferLimitBytes(), readerIterator, new UpdateHandler(mergeHandle), record -> {
            if (!externalSchemaTransformation) {
                return record;
            }
            return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) record);
        }, table.getPreExecuteRunnable());
        wrapper.execute();
    } catch (Exception e) {
        throw new HoodieException(e);
    } finally {
        // Same cleanup order as before, but each step is isolated so that a failure in one
        // does not prevent the remaining resources from being released.
        try {
            if (reader != null) {
                reader.close();
            }
        } finally {
            try {
                mergeHandle.close();
            } finally {
                if (null != wrapper) {
                    wrapper.shutdownNow();
                }
            }
        }
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Iterator(java.util.Iterator) HoodieException(org.apache.hudi.exception.HoodieException) BinaryDecoder(org.apache.avro.io.BinaryDecoder) IOException(java.io.IOException) BinaryEncoder(org.apache.avro.io.BinaryEncoder) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieMergeHandle(org.apache.hudi.io.HoodieMergeHandle) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) Configuration(org.apache.hadoop.conf.Configuration) HoodieKey(org.apache.hudi.common.model.HoodieKey) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) Configuration(org.apache.hadoop.conf.Configuration) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) Schema(org.apache.avro.Schema) HoodieException(org.apache.hudi.exception.HoodieException) BinaryDecoder(org.apache.avro.io.BinaryDecoder) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) BinaryEncoder(org.apache.avro.io.BinaryEncoder) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 2 with BoundedInMemoryExecutor

use of org.apache.hudi.common.util.queue.BoundedInMemoryExecutor in project hudi by apache.

The method testExecutor of the class TestBoundedInMemoryExecutorInSpark.

/**
 * Feeds 100 generated insert records through a {@link BoundedInMemoryExecutor} whose consumer
 * merely counts what it receives, then checks that the count is 100 and that
 * {@code isRemaining()} reports false afterwards.
 */
@Test
public void testExecutor() {
    final List<HoodieRecord> inputRecords = dataGen.generateInserts(instantTime, 100);
    HoodieWriteConfig writeConfig = mock(HoodieWriteConfig.class);
    when(writeConfig.getWriteBufferLimitBytes()).thenReturn(1024);
    // Consumer that keeps a running tally of the records handed to it.
    BoundedInMemoryQueueConsumer<HoodieLazyInsertIterable.HoodieInsertValueGenResult<HoodieRecord>, Integer> countingConsumer = new BoundedInMemoryQueueConsumer<HoodieLazyInsertIterable.HoodieInsertValueGenResult<HoodieRecord>, Integer>() {

        private int consumed = 0;

        @Override
        protected void consumeOneRecord(HoodieLazyInsertIterable.HoodieInsertValueGenResult<HoodieRecord> ignoredRecord) {
            consumed += 1;
        }

        @Override
        protected void finish() {
            // nothing to flush
        }

        @Override
        protected Integer getResult() {
            return consumed;
        }
    };
    // Built before the try: if construction fails there is nothing to shut down.
    BoundedInMemoryExecutor<HoodieRecord, Tuple2<HoodieRecord, Option<IndexedRecord>>, Integer> executor = new BoundedInMemoryExecutor(writeConfig.getWriteBufferLimitBytes(), inputRecords.iterator(), countingConsumer, getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA), getPreExecuteRunnable());
    try {
        int consumedCount = executor.execute();
        // All 100 records must have reached the consumer
        assertEquals(100, consumedCount);
        // Nothing may be left behind in the executor
        assertFalse(executor.isRemaining());
    } finally {
        executor.shutdownNow();
    }
}
Also used : IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BoundedInMemoryQueueConsumer(org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer) Tuple2(scala.Tuple2) Test(org.junit.jupiter.api.Test)

Example 3 with BoundedInMemoryExecutor

use of org.apache.hudi.common.util.queue.BoundedInMemoryExecutor in project hudi by apache.

The method executeBootstrap of the class OrcBootstrapMetadataHandler.

/**
 * Reads the ORC file at {@code sourceFilePath} and pipes each row through a
 * {@link BoundedInMemoryExecutor} into a {@code BootstrapRecordConsumer} built on
 * {@code bootstrapHandle}.
 *
 * <p>Each source row is turned into a {@link HoodieAvroRecord} whose payload carries only the
 * record key produced by {@code keyGenerator}, keyed by that record key and {@code partitionPath}.
 *
 * @param bootstrapHandle handle backing the consumer; closed in the {@code finally} block
 * @param sourceFilePath  location of the ORC file to read
 * @param keyGenerator    derives the record key from each source row
 * @param partitionPath   partition path placed into every generated {@link HoodieKey}
 * @param avroSchema      Avro schema passed to the {@code OrcReaderIterator}
 * @throws Exception       declared by the overridden method
 * @throws HoodieException wrapping any exception raised inside the try block
 */
@Override
void executeBootstrap(HoodieBootstrapHandle<?, ?, ?, ?> bootstrapHandle, Path sourceFilePath, KeyGeneratorInterface keyGenerator, String partitionPath, Schema avroSchema) throws Exception {
    BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void> executor = null;
    Reader orcFileReader = OrcFile.createReader(sourceFilePath, OrcFile.readerOptions(table.getHadoopConf()));
    TypeDescription orcFileSchema = orcFileReader.getSchema();
    try (RecordReader rowReader = orcFileReader.rows(new Reader.Options(table.getHadoopConf()).schema(orcFileSchema))) {
        executor = new BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void>(config.getWriteBufferLimitBytes(), new OrcReaderIterator(rowReader, avroSchema, orcFileSchema), new BootstrapRecordConsumer(bootstrapHandle), sourceRecord -> {
            // Build a key-only Avro record and wrap it as the bootstrap payload.
            final String recordKey = keyGenerator.getKey(sourceRecord).getRecordKey();
            final GenericRecord keyOnlyRecord = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA);
            keyOnlyRecord.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recordKey);
            final BootstrapRecordPayload keyPayload = new BootstrapRecordPayload(keyOnlyRecord);
            return new HoodieAvroRecord(new HoodieKey(recordKey, partitionPath), keyPayload);
        }, table.getPreExecuteRunnable());
        executor.execute();
    } catch (Exception failure) {
        throw new HoodieException(failure);
    } finally {
        bootstrapHandle.close();
        if (executor != null) {
            executor.shutdownNow();
        }
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) AvroOrcUtils(org.apache.hudi.common.util.AvroOrcUtils) OrcReaderIterator(org.apache.hudi.common.util.OrcReaderIterator) HoodieException(org.apache.hudi.exception.HoodieException) KeyGeneratorInterface(org.apache.hudi.keygen.KeyGeneratorInterface) OrcFile(org.apache.orc.OrcFile) GenericData(org.apache.avro.generic.GenericData) HoodieBootstrapHandle(org.apache.hudi.io.HoodieBootstrapHandle) Logger(org.apache.log4j.Logger) Reader(org.apache.orc.Reader) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Path(org.apache.hadoop.fs.Path) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) RecordReader(org.apache.orc.RecordReader) TypeDescription(org.apache.orc.TypeDescription) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) BootstrapRecordPayload(org.apache.hudi.client.bootstrap.BootstrapRecordPayload) HoodieKey(org.apache.hudi.common.model.HoodieKey) LogManager(org.apache.log4j.LogManager) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) BootstrapRecordPayload(org.apache.hudi.client.bootstrap.BootstrapRecordPayload) RecordReader(org.apache.orc.RecordReader) Reader(org.apache.orc.Reader) RecordReader(org.apache.orc.RecordReader) HoodieException(org.apache.hudi.exception.HoodieException) GenericData(org.apache.avro.generic.GenericData) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) OrcReaderIterator(org.apache.hudi.common.util.OrcReaderIterator) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) TypeDescription(org.apache.orc.TypeDescription) 
GenericRecord(org.apache.avro.generic.GenericRecord)

Example 4 with BoundedInMemoryExecutor

use of org.apache.hudi.common.util.queue.BoundedInMemoryExecutor in project hudi by apache.

The method executeBootstrap of the class ParquetBootstrapMetadataHandler.

/**
 * Reads the Parquet file at {@code sourceFilePath} and pipes each row through a
 * {@link BoundedInMemoryExecutor} into a {@code BootstrapRecordConsumer} built on
 * {@code bootstrapHandle}.
 *
 * <p>Each source row is turned into a {@link HoodieAvroRecord} whose payload carries only the
 * record key produced by {@code keyGenerator}, keyed by that record key and {@code partitionPath}.
 *
 * @param bootstrapHandle handle backing the consumer; closed in the {@code finally} block
 * @param sourceFilePath  location of the Parquet file to read
 * @param keyGenerator    derives the record key from each source row
 * @param partitionPath   partition path placed into every generated {@link HoodieKey}
 * @param avroSchema      not used by this implementation
 * @throws Exception       declared by the overridden method; may also surface a failure to close
 *                         the Parquet reader
 * @throws HoodieException wrapping any exception raised while opening the file or running the executor
 */
@Override
void executeBootstrap(HoodieBootstrapHandle<?, ?, ?, ?> bootstrapHandle, Path sourceFilePath, KeyGeneratorInterface keyGenerator, String partitionPath, Schema avroSchema) throws Exception {
    BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void> wrapper = null;
    // Declared outside the try so it can be closed in finally; the original never closed it.
    ParquetReader<IndexedRecord> reader = null;
    try {
        reader = AvroParquetReader.<IndexedRecord>builder(sourceFilePath).withConf(table.getHadoopConf()).build();
        wrapper = new BoundedInMemoryExecutor<GenericRecord, HoodieRecord, Void>(config.getWriteBufferLimitBytes(), new ParquetReaderIterator(reader), new BootstrapRecordConsumer(bootstrapHandle), inp -> {
            // Build a key-only Avro record and wrap it as the bootstrap payload.
            String recKey = keyGenerator.getKey(inp).getRecordKey();
            GenericRecord gr = new GenericData.Record(HoodieAvroUtils.RECORD_KEY_SCHEMA);
            gr.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, recKey);
            BootstrapRecordPayload payload = new BootstrapRecordPayload(gr);
            HoodieRecord rec = new HoodieAvroRecord(new HoodieKey(recKey, partitionPath), payload);
            return rec;
        }, table.getPreExecuteRunnable());
        wrapper.execute();
    } catch (Exception e) {
        throw new HoodieException(e);
    } finally {
        // Original cleanup order is kept; each step is isolated so that one failing step
        // cannot prevent the others, and the reader is released last, after the executor
        // has been told to stop.
        try {
            bootstrapHandle.close();
        } finally {
            try {
                if (null != wrapper) {
                    wrapper.shutdownNow();
                }
            } finally {
                if (null != reader) {
                    reader.close();
                }
            }
        }
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) AvroSchemaConverter(org.apache.parquet.avro.AvroSchemaConverter) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) HoodieException(org.apache.hudi.exception.HoodieException) ParquetMetadataConverter(org.apache.parquet.format.converter.ParquetMetadataConverter) KeyGeneratorInterface(org.apache.hudi.keygen.KeyGeneratorInterface) GenericData(org.apache.avro.generic.GenericData) HoodieBootstrapHandle(org.apache.hudi.io.HoodieBootstrapHandle) Logger(org.apache.log4j.Logger) ParquetReaderIterator(org.apache.hudi.common.util.ParquetReaderIterator) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Path(org.apache.hadoop.fs.Path) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Schema(org.apache.avro.Schema) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) ParquetReader(org.apache.parquet.hadoop.ParquetReader) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) ParquetFileReader(org.apache.parquet.hadoop.ParquetFileReader) MessageType(org.apache.parquet.schema.MessageType) BootstrapRecordPayload(org.apache.hudi.client.bootstrap.BootstrapRecordPayload) AvroParquetReader(org.apache.parquet.avro.AvroParquetReader) HoodieKey(org.apache.hudi.common.model.HoodieKey) ParquetMetadata(org.apache.parquet.hadoop.metadata.ParquetMetadata) LogManager(org.apache.log4j.LogManager) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) BootstrapRecordPayload(org.apache.hudi.client.bootstrap.BootstrapRecordPayload) HoodieException(org.apache.hudi.exception.HoodieException) ParquetReaderIterator(org.apache.hudi.common.util.ParquetReaderIterator) GenericData(org.apache.avro.generic.GenericData) 
HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) HoodieKey(org.apache.hudi.common.model.HoodieKey) GenericRecord(org.apache.avro.generic.GenericRecord)

Example 5 with BoundedInMemoryExecutor

use of org.apache.hudi.common.util.queue.BoundedInMemoryExecutor in project hudi by apache.

The method testInterruptExecutor of the class TestBoundedInMemoryExecutorInSpark.

/**
 * Sets the interrupt flag on the calling thread before invoking {@code execute()} and expects
 * the call to fail with a {@link HoodieException} while the thread's interrupt flag is still set
 * afterwards. The consumer sleeps in an endless loop until it is interrupted.
 */
@Test
public void testInterruptExecutor() {
    final List<HoodieRecord> inputRecords = dataGen.generateInserts(instantTime, 100);
    HoodieWriteConfig writeConfig = mock(HoodieWriteConfig.class);
    when(writeConfig.getWriteBufferLimitBytes()).thenReturn(1024);
    // Consumer that never finishes a record on its own; only an interrupt lets it return.
    BoundedInMemoryQueueConsumer<HoodieLazyInsertIterable.HoodieInsertValueGenResult<HoodieRecord>, Integer> blockingConsumer = new BoundedInMemoryQueueConsumer<HoodieLazyInsertIterable.HoodieInsertValueGenResult<HoodieRecord>, Integer>() {

        @Override
        protected void consumeOneRecord(HoodieLazyInsertIterable.HoodieInsertValueGenResult<HoodieRecord> ignoredRecord) {
            try {
                for (;;) {
                    Thread.sleep(1000);
                }
            } catch (InterruptedException interrupted) {
                // interrupted: stop sleeping and return
            }
        }

        @Override
        protected void finish() {
            // nothing to flush
        }

        @Override
        protected Integer getResult() {
            return 0;
        }
    };
    // Built before the try: if construction fails there is nothing to shut down.
    BoundedInMemoryExecutor<HoodieRecord, Tuple2<HoodieRecord, Option<IndexedRecord>>, Integer> executor = new BoundedInMemoryExecutor(writeConfig.getWriteBufferLimitBytes(), inputRecords.iterator(), blockingConsumer, getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA), getPreExecuteRunnable());
    try {
        Thread.currentThread().interrupt();
        assertThrows(HoodieException.class, () -> executor.execute());
        // Thread.interrupted() also clears the flag again
        assertTrue(Thread.interrupted());
    } finally {
        executor.shutdownNow();
    }
}
Also used : IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BoundedInMemoryQueueConsumer(org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer) Tuple2(scala.Tuple2) Test(org.junit.jupiter.api.Test)

Aggregations

HoodieRecord (org.apache.hudi.common.model.HoodieRecord)7 BoundedInMemoryExecutor (org.apache.hudi.common.util.queue.BoundedInMemoryExecutor)7 IOException (java.io.IOException)5 Schema (org.apache.avro.Schema)5 GenericRecord (org.apache.avro.generic.GenericRecord)5 HoodieKey (org.apache.hudi.common.model.HoodieKey)5 HoodieException (org.apache.hudi.exception.HoodieException)5 HoodieTable (org.apache.hudi.table.HoodieTable)5 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)4 Iterator (java.util.Iterator)3 GenericDatumReader (org.apache.avro.generic.GenericDatumReader)3 GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter)3 IndexedRecord (org.apache.avro.generic.IndexedRecord)3 BinaryDecoder (org.apache.avro.io.BinaryDecoder)3 BinaryEncoder (org.apache.avro.io.BinaryEncoder)3 Configuration (org.apache.hadoop.conf.Configuration)3 WriteStatus (org.apache.hudi.client.WriteStatus)3 HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile)3 HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload)3 HoodieMergeHandle (org.apache.hudi.io.HoodieMergeHandle)3