
Example 1 with IteratorBasedQueueProducer

Use of org.apache.hudi.common.util.queue.IteratorBasedQueueProducer in project hudi by apache.

From the class TestBoundedInMemoryQueue, method testCompositeProducerRecordReading:

/**
 * Test to ensure that we are reading all records from queue iterator when we have multiple producers.
 */
@SuppressWarnings("unchecked")
@Test
@Timeout(value = 60)
public void testCompositeProducerRecordReading() throws Exception {
    final int numRecords = 1000;
    final int numProducers = 40;
    final List<List<HoodieRecord>> recs = new ArrayList<>();
    final BoundedInMemoryQueue<HoodieRecord, HoodieLazyInsertIterable.HoodieInsertValueGenResult> queue = new BoundedInMemoryQueue(FileIOUtils.KB, getTransformFunction(HoodieTestDataGenerator.AVRO_SCHEMA));
    // Record Key to <Producer Index, Rec Index within a producer>
    Map<String, Tuple2<Integer, Integer>> keyToProducerAndIndexMap = new HashMap<>();
    for (int i = 0; i < numProducers; i++) {
        List<HoodieRecord> pRecs = dataGen.generateInserts(instantTime, numRecords);
        int j = 0;
        for (HoodieRecord r : pRecs) {
            assertFalse(keyToProducerAndIndexMap.containsKey(r.getRecordKey()));
            keyToProducerAndIndexMap.put(r.getRecordKey(), new Tuple2<>(i, j));
            j++;
        }
        recs.add(pRecs);
    }
    List<BoundedInMemoryQueueProducer<HoodieRecord>> producers = new ArrayList<>();
    for (int i = 0; i < recs.size(); i++) {
        final List<HoodieRecord> r = recs.get(i);
        // Alternate between pull-based and push-based producers
        if (i % 2 == 0) {
            producers.add(new IteratorBasedQueueProducer<>(r.iterator()));
        } else {
            producers.add(new FunctionBasedQueueProducer<>((buf) -> {
                Iterator<HoodieRecord> itr = r.iterator();
                while (itr.hasNext()) {
                    try {
                        buf.insertRecord(itr.next());
                    } catch (Exception e) {
                        throw new HoodieException(e);
                    }
                }
                return true;
            }));
        }
    }
    final List<Future<Boolean>> futureList = producers.stream().map(producer -> {
        return executorService.submit(() -> {
            producer.produce(queue);
            return true;
        });
    }).collect(Collectors.toList());
    // Close queue
    Future<Boolean> closeFuture = executorService.submit(() -> {
        try {
            for (Future f : futureList) {
                f.get();
            }
            queue.close();
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
        return true;
    });
    // Used to ensure that consumer sees the records generated by a single producer in FIFO order
    Map<Integer, Integer> lastSeenMap = IntStream.range(0, numProducers).boxed().collect(Collectors.toMap(Function.identity(), x -> -1));
    Map<Integer, Integer> countMap = IntStream.range(0, numProducers).boxed().collect(Collectors.toMap(Function.identity(), x -> 0));
    // Read recs and ensure we have covered all producer recs.
    while (queue.iterator().hasNext()) {
        final HoodieLazyInsertIterable.HoodieInsertValueGenResult payload = queue.iterator().next();
        final HoodieRecord rec = payload.record;
        Tuple2<Integer, Integer> producerPos = keyToProducerAndIndexMap.get(rec.getRecordKey());
        Integer lastSeenPos = lastSeenMap.get(producerPos._1());
        countMap.put(producerPos._1(), countMap.get(producerPos._1()) + 1);
        lastSeenMap.put(producerPos._1(), lastSeenPos + 1);
        // Ensure we are seeing the next record generated
        assertEquals(lastSeenPos + 1, producerPos._2().intValue());
    }
    for (int i = 0; i < numProducers; i++) {
        // Ensure we have seen all the records for each producer
        assertEquals(Integer.valueOf(numRecords), countMap.get(i));
    }
    // Ensure Close future is done
    closeFuture.get();
}
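
The test alternates the two producer flavors on purpose: IteratorBasedQueueProducer pulls records out of an iterator, while FunctionBasedQueueProducer is handed the queue and pushes records into it. Below is a minimal, hypothetical sketch of the same handshake outside the test harness. It is not from the Hudi codebase: String payloads and an identity transform are stand-ins, and it assumes the test's imports plus java.util.Arrays, java.util.stream.Stream, java.util.concurrent.ExecutorService and java.util.concurrent.Executors.

ExecutorService pool = Executors.newFixedThreadPool(3);
BoundedInMemoryQueue<String, String> queue =
        new BoundedInMemoryQueue<>(FileIOUtils.KB, Function.identity());
// Pull-based producer: drains a plain iterator into the queue.
BoundedInMemoryQueueProducer<String> pull =
        new IteratorBasedQueueProducer<>(Arrays.asList("a", "b").iterator());
// Push-based producer: is handed the queue and inserts records itself.
BoundedInMemoryQueueProducer<String> push = new FunctionBasedQueueProducer<>(buf -> {
    try {
        buf.insertRecord("c");
        buf.insertRecord("d");
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    return true;
});
List<Future<Boolean>> futures = Stream.of(pull, push)
        .map(p -> pool.submit(() -> {
            p.produce(queue);
            return true;
        }))
        .collect(Collectors.toList());
pool.submit(() -> {
    // Close only after every producer is done, exactly as in the test above.
    for (Future<Boolean> f : futures) {
        f.get();
    }
    queue.close();
    return true;
});
// The consumer side blocks on the queue iterator until close() is called.
queue.iterator().forEachRemaining(System.out::println);
pool.shutdown();

Either way, the consuming side simply iterates the queue; it cannot tell which producer style put a given record there.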
Also used : IntStream(java.util.stream.IntStream) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) BeforeEach(org.junit.jupiter.api.BeforeEach) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) HoodieLazyInsertIterable.getTransformFunction(org.apache.hudi.execution.HoodieLazyInsertIterable.getTransformFunction) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) Function(java.util.function.Function) HoodieClientTestHarness(org.apache.hudi.testutils.HoodieClientTestHarness) BoundedInMemoryQueueProducer(org.apache.hudi.common.util.queue.BoundedInMemoryQueueProducer) ArrayList(java.util.ArrayList) Future(java.util.concurrent.Future) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) Map(java.util.Map) DefaultSizeEstimator(org.apache.hudi.common.util.DefaultSizeEstimator) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) FunctionBasedQueueProducer(org.apache.hudi.common.util.queue.FunctionBasedQueueProducer) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Iterator(java.util.Iterator) Semaphore(java.util.concurrent.Semaphore) IteratorBasedQueueProducer(org.apache.hudi.common.util.queue.IteratorBasedQueueProducer) Mockito.when(org.mockito.Mockito.when) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) Test(org.junit.jupiter.api.Test) ExecutionException(java.util.concurrent.ExecutionException) AfterEach(org.junit.jupiter.api.AfterEach) List(java.util.List) SizeEstimator(org.apache.hudi.common.util.SizeEstimator) BoundedInMemoryQueue(org.apache.hudi.common.util.queue.BoundedInMemoryQueue) Timeout(org.junit.jupiter.api.Timeout) Mockito.mock(org.mockito.Mockito.mock)

Example 2 with IteratorBasedQueueProducer

Use of org.apache.hudi.common.util.queue.IteratorBasedQueueProducer in project hudi by apache.

From the class JavaMergeHelper, method runMerge:

@Override
public void runMerge(HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table, HoodieMergeHandle<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> upsertHandle) throws IOException {
    final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation();
    Configuration cfgForHoodieFile = new Configuration(table.getHadoopConf());
    HoodieMergeHandle<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> mergeHandle = upsertHandle;
    HoodieBaseFile baseFile = mergeHandle.baseFileForMerge();
    final GenericDatumWriter<GenericRecord> gWriter;
    final GenericDatumReader<GenericRecord> gReader;
    Schema readSchema;
    if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) {
        readSchema = HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), mergeHandle.getOldFilePath()).getSchema();
        gWriter = new GenericDatumWriter<>(readSchema);
        gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetaFields());
    } else {
        gReader = null;
        gWriter = null;
        readSchema = mergeHandle.getWriterSchemaWithMetaFields();
    }
    BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
    HoodieFileReader<GenericRecord> reader = HoodieFileReaderFactory.<GenericRecord>getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath());
    try {
        final Iterator<GenericRecord> readerIterator;
        if (baseFile.getBootstrapBaseFile().isPresent()) {
            readerIterator = getMergingIterator(table, mergeHandle, baseFile, reader, readSchema, externalSchemaTransformation);
        } else {
            readerIterator = reader.getRecordIterator(readSchema);
        }
        ThreadLocal<BinaryEncoder> encoderCache = new ThreadLocal<>();
        ThreadLocal<BinaryDecoder> decoderCache = new ThreadLocal<>();
        wrapper = new BoundedInMemoryExecutor<>(table.getConfig().getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(readerIterator), Option.of(new UpdateHandler(mergeHandle)), record -> {
            if (!externalSchemaTransformation) {
                return record;
            }
            return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) record);
        });
        wrapper.execute();
    } catch (Exception e) {
        throw new HoodieException(e);
    } finally {
        if (reader != null) {
            reader.close();
        }
        mergeHandle.close();
        if (null != wrapper) {
            wrapper.shutdownNow();
        }
    }
}
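
Stripped of the merge-specific details, the wiring above reduces to: a producer drains the reader iterator, an optional consumer handles each transformed record, and shutdownNow() runs in the finally block. Here is a hypothetical, self-contained sketch of that shape. It is not Hudi code: String records stand in for GenericRecords, the transform is the identity, and the anonymous consumer merely counts records; the consumeOneRecord/finish/getResult overrides follow the BoundedInMemoryQueueConsumer contract that UpdateHandler implements, so treat their exact signatures as assumptions.

// Assumes the imports listed under "Also used :" plus java.util.Arrays and
// org.apache.hudi.common.util.queue.BoundedInMemoryQueueConsumer.
Iterator<String> readerIterator = Arrays.asList("r1", "r2", "r3").iterator();
BoundedInMemoryExecutor<String, String, Integer> wrapper = null;
try {
    wrapper = new BoundedInMemoryExecutor<>(
            1024 * 1024,                                   // write buffer limit in bytes
            new IteratorBasedQueueProducer<>(readerIterator),
            Option.of(new BoundedInMemoryQueueConsumer<String, Integer>() {
                private int count = 0;
                @Override
                protected void consumeOneRecord(String record) {
                    count++;                               // real code would write the record out
                }
                @Override
                protected void finish() {
                    // flush or close writer resources here
                }
                @Override
                protected Integer getResult() {
                    return count;
                }
            }),
            record -> record);                             // identity transform
    Integer consumed = wrapper.execute();                  // blocks until producer and consumer finish
    System.out.println("consumed " + consumed + " records");
} catch (Exception e) {
    throw new HoodieException(e);
} finally {
    if (null != wrapper) {
        wrapper.shutdownNow();                             // same cleanup as runMerge above
    }
}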
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Iterator(java.util.Iterator) IteratorBasedQueueProducer(org.apache.hudi.common.util.queue.IteratorBasedQueueProducer) HoodieException(org.apache.hudi.exception.HoodieException) BinaryDecoder(org.apache.avro.io.BinaryDecoder) Option(org.apache.hudi.common.util.Option) IOException(java.io.IOException) BinaryEncoder(org.apache.avro.io.BinaryEncoder) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieMergeHandle(org.apache.hudi.io.HoodieMergeHandle) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) Configuration(org.apache.hadoop.conf.Configuration) HoodieKey(org.apache.hudi.common.model.HoodieKey) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) GenericDatumReader(org.apache.avro.generic.GenericDatumReader)

Example 3 with IteratorBasedQueueProducer

Use of org.apache.hudi.common.util.queue.IteratorBasedQueueProducer in project hudi by apache.

From the class JavaLazyInsertIterable, method computeNext:

@Override
protected List<WriteStatus> computeNext() {
    // Executor service used for launching writer thread.
    BoundedInMemoryExecutor<HoodieRecord<T>, HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> bufferedIteratorExecutor = null;
    try {
        final Schema schema = new Schema.Parser().parse(hoodieConfig.getSchema());
        bufferedIteratorExecutor = new BoundedInMemoryExecutor<>(hoodieConfig.getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(inputItr), Option.of(getInsertHandler()), getTransformFunction(schema));
        final List<WriteStatus> result = bufferedIteratorExecutor.execute();
        assert result != null && !result.isEmpty() && !bufferedIteratorExecutor.isRemaining();
        return result;
    } catch (Exception e) {
        throw new HoodieException(e);
    } finally {
        if (null != bufferedIteratorExecutor) {
            bufferedIteratorExecutor.shutdownNow();
        }
    }
}
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) List(java.util.List) HoodieException(org.apache.hudi.exception.HoodieException) IteratorBasedQueueProducer(org.apache.hudi.common.util.queue.IteratorBasedQueueProducer) WriteStatus(org.apache.hudi.client.WriteStatus)

Example 4 with IteratorBasedQueueProducer

Use of org.apache.hudi.common.util.queue.IteratorBasedQueueProducer in project hudi by apache.

From the class FlinkMergeHelper, method runMerge:

@Override
public void runMerge(HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table, HoodieMergeHandle<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> mergeHandle) throws IOException {
    final GenericDatumWriter<GenericRecord> gWriter;
    final GenericDatumReader<GenericRecord> gReader;
    Schema readSchema;
    final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation();
    HoodieBaseFile baseFile = mergeHandle.baseFileForMerge();
    if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) {
        readSchema = HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), mergeHandle.getOldFilePath()).getSchema();
        gWriter = new GenericDatumWriter<>(readSchema);
        gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetaFields());
    } else {
        gReader = null;
        gWriter = null;
        readSchema = mergeHandle.getWriterSchemaWithMetaFields();
    }
    BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
    Configuration cfgForHoodieFile = new Configuration(table.getHadoopConf());
    HoodieFileReader<GenericRecord> reader = HoodieFileReaderFactory.<GenericRecord>getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath());
    try {
        final Iterator<GenericRecord> readerIterator;
        if (baseFile.getBootstrapBaseFile().isPresent()) {
            readerIterator = getMergingIterator(table, mergeHandle, baseFile, reader, readSchema, externalSchemaTransformation);
        } else {
            readerIterator = reader.getRecordIterator(readSchema);
        }
        ThreadLocal<BinaryEncoder> encoderCache = new ThreadLocal<>();
        ThreadLocal<BinaryDecoder> decoderCache = new ThreadLocal<>();
        wrapper = new BoundedInMemoryExecutor<>(table.getConfig().getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(readerIterator), Option.of(new UpdateHandler(mergeHandle)), record -> {
            if (!externalSchemaTransformation) {
                return record;
            }
            return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) record);
        });
        wrapper.execute();
    } catch (Exception e) {
        throw new HoodieException(e);
    } finally {
        if (reader != null) {
            reader.close();
        }
        mergeHandle.close();
        if (null != wrapper) {
            wrapper.shutdownNow();
        }
    }
}
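
Apart from the handle parameter being named mergeHandle directly and the Configuration for the base file being constructed just before the reader rather than at the top of the method, this Flink implementation of runMerge is the same as the Java version in Example 2.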
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Iterator(java.util.Iterator) List(scala.collection.immutable.List) IteratorBasedQueueProducer(org.apache.hudi.common.util.queue.IteratorBasedQueueProducer) HoodieException(org.apache.hudi.exception.HoodieException) BinaryDecoder(org.apache.avro.io.BinaryDecoder) Option(org.apache.hudi.common.util.Option) IOException(java.io.IOException) BinaryEncoder(org.apache.avro.io.BinaryEncoder) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieMergeHandle(org.apache.hudi.io.HoodieMergeHandle) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) Configuration(org.apache.hadoop.conf.Configuration) HoodieKey(org.apache.hudi.common.model.HoodieKey) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) GenericDatumReader(org.apache.avro.generic.GenericDatumReader)

Example 5 with IteratorBasedQueueProducer

Use of org.apache.hudi.common.util.queue.IteratorBasedQueueProducer in project hudi by apache.

From the class FlinkLazyInsertIterable, method computeNext:

@Override
protected List<WriteStatus> computeNext() {
    // Executor service used for launching writer thread.
    BoundedInMemoryExecutor<HoodieRecord<T>, HoodieInsertValueGenResult<HoodieRecord>, List<WriteStatus>> bufferedIteratorExecutor = null;
    try {
        final Schema schema = new Schema.Parser().parse(hoodieConfig.getSchema());
        bufferedIteratorExecutor = new BoundedInMemoryExecutor<>(hoodieConfig.getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(inputItr), Option.of(getExplicitInsertHandler()), getTransformFunction(schema, hoodieConfig));
        final List<WriteStatus> result = bufferedIteratorExecutor.execute();
        assert result != null && !result.isEmpty() && !bufferedIteratorExecutor.isRemaining();
        return result;
    } catch (Exception e) {
        throw new HoodieException(e);
    } finally {
        if (null != bufferedIteratorExecutor) {
            bufferedIteratorExecutor.shutdownNow();
        }
    }
}
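
This is the Flink counterpart of Example 3; the only differences are that the consumer comes from getExplicitInsertHandler() rather than getInsertHandler(), and that getTransformFunction additionally receives hoodieConfig.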
Also used : HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Schema(org.apache.avro.Schema) List(java.util.List) HoodieException(org.apache.hudi.exception.HoodieException) IteratorBasedQueueProducer(org.apache.hudi.common.util.queue.IteratorBasedQueueProducer) WriteStatus(org.apache.hudi.client.WriteStatus)

Aggregations

HoodieRecord (org.apache.hudi.common.model.HoodieRecord) : 5
IteratorBasedQueueProducer (org.apache.hudi.common.util.queue.IteratorBasedQueueProducer) : 5
HoodieException (org.apache.hudi.exception.HoodieException) : 5
List (java.util.List) : 4
Schema (org.apache.avro.Schema) : 4
WriteStatus (org.apache.hudi.client.WriteStatus) : 4
Iterator (java.util.Iterator) : 3
Option (org.apache.hudi.common.util.Option) : 3
IOException (java.io.IOException) : 2
GenericDatumReader (org.apache.avro.generic.GenericDatumReader) : 2
GenericDatumWriter (org.apache.avro.generic.GenericDatumWriter) : 2
GenericRecord (org.apache.avro.generic.GenericRecord) : 2
BinaryDecoder (org.apache.avro.io.BinaryDecoder) : 2
BinaryEncoder (org.apache.avro.io.BinaryEncoder) : 2
Configuration (org.apache.hadoop.conf.Configuration) : 2
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile) : 2
HoodieKey (org.apache.hudi.common.model.HoodieKey) : 2
HoodieRecordPayload (org.apache.hudi.common.model.HoodieRecordPayload) : 2
BoundedInMemoryExecutor (org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) : 2
HoodieMergeHandle (org.apache.hudi.io.HoodieMergeHandle) : 2