Search in sources :

Example 6 with RecordEnvelope

use of org.apache.gobblin.stream.RecordEnvelope in project incubator-gobblin by apache.

the class KafkaSimpleStreamingTest method testThreadedExtractor.

/**
 * Verifies that it is safe to call close on the extractor from a different thread while another thread is
 * blocked waiting for a record. A side thread asks the extractor for a record; the test thread then closes
 * the extractor, waits for the side thread to exit, and checks that any exception the side thread saw is one
 * of the expected ones ({@link WakeupException} or {@link ClosedChannelException}).
 *
 * The check is performed on the test thread on purpose: an assertion that fails inside the side thread would
 * only terminate that thread and the test would still be reported as passing.
 */
@Test(timeOut = 10000)
public void testThreadedExtractor() {
    final String topic = "testThreadedExtractor";
    final KafkaSimpleStreamingExtractor<String, byte[]> kSSE = getStreamingExtractor(topic);
    // Single-slot holder for the exception seen by the reader thread. It is written before the thread ends
    // and read only after join(), so the write is visible to the test thread.
    final Exception[] readerFailure = new Exception[1];
    Thread waitingThread = new Thread() {

        @Override
        public void run() {
            try {
                kSSE.readRecordEnvelope();
            } catch (Exception e) {
                // hand the exception over to the test thread instead of asserting here
                readerFailure[0] = e;
            }
        }
    };
    waitingThread.start();
    try {
        kSSE.close();
        waitingThread.join();
    } catch (Exception e) {
        // should never come here
        throw new Error(e);
    }
    Exception caught = readerFailure[0];
    if (caught != null) {
        Assert.assertTrue((caught instanceof WakeupException) || (caught instanceof ClosedChannelException), "Unexpected exception in reader thread: " + caught);
    }
}
Also used : ClosedChannelException(java.nio.channels.ClosedChannelException) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) WakeupException(org.apache.kafka.common.errors.WakeupException) ClosedChannelException(java.nio.channels.ClosedChannelException) WakeupException(org.apache.kafka.common.errors.WakeupException) IOException(java.io.IOException) DataRecordException(org.apache.gobblin.source.extractor.DataRecordException) KafkaSimpleStreamingExtractor(org.apache.gobblin.source.extractor.extract.kafka.KafkaSimpleStreamingExtractor) TopicPartition(org.apache.kafka.common.TopicPartition) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test)

Example 7 with RecordEnvelope

use of org.apache.gobblin.stream.RecordEnvelope in project incubator-gobblin by apache.

the class KafkaSimpleStreamingTest method testExtractor.

/**
 * End-to-end check of the streaming extractor against a test Kafka topic.
 * <ol>
 *   <li>Provisions the topic and writes a first record; an extractor is created and started with a mocked
 *       {@link WatermarkStorage} that reports no committed watermarks; the record read back must match.</li>
 *   <li>Writes a second record and reads it through the same extractor.</li>
 *   <li>Registers the second record's watermark as committed in the mocked storage, writes a third record,
 *       and creates a fresh extractor; the first record it returns must be the third one.</li>
 * </ol>
 * The producer is closed when the test finishes, whether it passes or fails.
 *
 * @throws IOException declared by the extractor calls used in this test
 * @throws InterruptedException declared for the blocking calls used in this test
 * @throws DataRecordException declared by the extractor read calls used in this test
 */
@Test(timeOut = 10000)
public void testExtractor() throws IOException, InterruptedException, DataRecordException {
    final String topic = "testSimpleStreamingExtractor";
    _kafkaTestHelper.provisionTopic(topic);
    Properties props = new Properties();
    props.put("bootstrap.servers", "localhost:" + _kafkaTestHelper.getKafkaServerPort());
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
    props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
    Producer<String, byte[]> producer = new KafkaProducer<>(props);
    try {
        final byte[] record_1 = { 0, 1, 3 };
        final byte[] record_2 = { 2, 4, 6 };
        final byte[] record_3 = { 5, 7, 9 };
        // Write a sample record to the topic
        producer.send(new ProducerRecord<String, byte[]>(topic, topic, record_1));
        producer.flush();
        KafkaSimpleStreamingExtractor<String, byte[]> kSSE = getStreamingExtractor(topic);
        // The map is handed to the mock by reference, so entries added later are seen by extractors
        // started afterwards.
        Map<String, CheckpointableWatermark> committedWatermarks = new HashMap<>();
        WatermarkStorage mockWatermarkStorage = mock(WatermarkStorage.class);
        when(mockWatermarkStorage.getCommittedWatermarks(any(Class.class), any(Iterable.class))).thenReturn(committedWatermarks);
        kSSE.start(mockWatermarkStorage);
        // read and verify the record matches the one we just wrote
        RecordEnvelope<byte[]> record = kSSE.readRecordEnvelope();
        Assert.assertEquals(record.getRecord(), record_1);
        // write a second record.
        producer.send(new ProducerRecord<String, byte[]>(topic, topic, record_2));
        producer.flush();
        // read the second record using the same extractor to verify it matches what is expected
        record = kSSE.readRecordEnvelope();
        Assert.assertEquals(record.getRecord(), record_2);
        // Commit the watermark
        committedWatermarks.put(record.getWatermark().getSource(), record.getWatermark());
        // write a third record.
        producer.send(new ProducerRecord<String, byte[]>(topic, topic, record_3));
        producer.flush();
        // recreate extractor to force a seek.
        kSSE = getStreamingExtractor(topic);
        kSSE.start(mockWatermarkStorage);
        record = kSSE.readRecordEnvelope();
        // check it matches the data written
        Assert.assertEquals(record.getRecord(), record_3);
    } finally {
        // release the producer's resources even when an assertion above fails
        producer.close();
    }
}
Also used : KafkaProducer(org.apache.kafka.clients.producer.KafkaProducer) WatermarkStorage(org.apache.gobblin.writer.WatermarkStorage) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) HashMap(java.util.HashMap) Properties(java.util.Properties) KafkaSimpleStreamingExtractor(org.apache.gobblin.source.extractor.extract.kafka.KafkaSimpleStreamingExtractor) TopicPartition(org.apache.kafka.common.TopicPartition) CheckpointableWatermark(org.apache.gobblin.source.extractor.CheckpointableWatermark) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test)

Example 8 with RecordEnvelope

use of org.apache.gobblin.stream.RecordEnvelope in project incubator-gobblin by apache.

the class Fork method processRecord.

/**
 * Handles a single record handed to this fork: converts it, runs the row-level policies on every converted
 * record, and writes those that pass.
 *
 * @param record the record to process; {@code null} and {@code SHUTDOWN_RECORD} are accepted and ignored.
 *               In streaming mode the record is cast to a {@link RecordEnvelope}.
 * @throws IOException declared for the writer calls
 * @throws DataConversionException declared for the converter call
 * @throws IllegalStateException if the fork state is already {@code FAILED}
 */
protected void processRecord(Object record) throws IOException, DataConversionException {
    // compareAndSet(FAILED, FAILED) is used as a check here: it can only succeed when the state already is
    // FAILED, and in that case it writes back the same value.
    boolean alreadyFailed = this.forkState.compareAndSet(ForkState.FAILED, ForkState.FAILED);
    if (alreadyFailed) {
        throw new IllegalStateException(String.format("Fork %d of task %s has failed and is no longer running", this.index, this.taskId));
    }

    if (record == null || record == SHUTDOWN_RECORD) {
        // A null record indicates a timeout on record acquisition; SHUTDOWN_RECORD is sent during shutdown.
        // Either way there is nothing to convert or write, so this method simply returns. Whether the
        // caller keeps looping (presumably based on the parent task being done) is not decided here.
        return;
    }

    if (!isStreamingMode()) {
        buildWriterIfNotPresent();
        // Convert the record, check its data quality, and write it out wrapped in a fresh envelope.
        for (Object converted : this.converter.convertRecord(this.convertedSchema, record, this.taskState)) {
            if (!this.rowLevelPolicyChecker.executePolicies(converted, this.rowLevelPolicyCheckingResult)) {
                continue;
            }
            this.writer.get().writeEnvelope(new RecordEnvelope<>(converted));
        }
        return;
    }

    // Streaming mode: the incoming object is the envelope itself; unpack the record from it.
    RecordEnvelope envelope = (RecordEnvelope) record;
    for (Object converted : this.converter.convertRecord(this.convertedSchema, envelope.getRecord(), this.taskState)) {
        if (!this.rowLevelPolicyChecker.executePolicies(converted, this.rowLevelPolicyCheckingResult)) {
            continue;
        }
        // each record passed down is derived from the incoming envelope via withRecord
        WatermarkAwareWriter watermarkWriter = (WatermarkAwareWriter) this.writer.get();
        watermarkWriter.writeEnvelope(envelope.withRecord(converted));
    }
    // ack this fork's processing done
    envelope.ack();
}
Also used : WatermarkAwareWriter(org.apache.gobblin.writer.WatermarkAwareWriter) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope)

Example 9 with RecordEnvelope

use of org.apache.gobblin.stream.RecordEnvelope in project incubator-gobblin by apache.

the class Converter method processStream.

/**
 * Apply conversions to the input {@link RecordStreamWithMetadata}.
 *
 * The converter is initialized with the work unit state, the output global metadata is built from the input
 * metadata plus the converted schema, and every entity of the input stream is then mapped as follows:
 * <ul>
 *   <li>{@link ControlMessage}: passed to the message handler and forwarded; a
 *       {@link MetadataUpdateControlMessage} additionally refreshes the output global metadata and is replaced
 *       by a new message carrying that refreshed metadata.</li>
 *   <li>{@link RecordEnvelope}: converted via {@code convertRecord}; zero results ack the input and emit
 *       nothing, one result is emitted via {@code withRecord}, several results are emitted through a
 *       {@code ForkRecordBuilder}.</li>
 *   <li>Anything else: {@link UnsupportedOperationException}.</li>
 * </ul>
 *
 * @param inputStream the stream of records and control messages to convert, with its global metadata
 * @param workUnitState state passed to {@code init}, {@code convertSchema} and {@code convertRecord}
 * @return the input stream object with its record stream replaced by the converted stream and its global
 *         metadata replaced by the output global metadata
 * @throws SchemaConversionException declared for the schema conversion of the initial metadata
 */
@Override
public RecordStreamWithMetadata<DO, SO> processStream(RecordStreamWithMetadata<DI, SI> inputStream, WorkUnitState workUnitState) throws SchemaConversionException {
    init(workUnitState);
    // Initial output metadata: input metadata with the schema swapped for the converted one (which may be null).
    this.outputGlobalMetadata = GlobalMetadata.<SI, SO>builderWithInput(inputStream.getGlobalMetadata(), Optional.fromNullable(convertSchema(inputStream.getGlobalMetadata().getSchema(), workUnitState))).build();
    Flowable<StreamEntity<DO>> outputStream = inputStream.getRecordStream().flatMap(in -> {
        if (in instanceof ControlMessage) {
            ControlMessage out = (ControlMessage) in;
            getMessageHandler().handleMessage((ControlMessage) in);
            // update the output schema with the new input schema from the MetadataUpdateControlMessage
            if (in instanceof MetadataUpdateControlMessage) {
                this.outputGlobalMetadata = GlobalMetadata.<SI, SO>builderWithInput(((MetadataUpdateControlMessage) in).getGlobalMetadata(), Optional.fromNullable(convertSchema((SI) ((MetadataUpdateControlMessage) in).getGlobalMetadata().getSchema(), workUnitState))).build();
                // downstream gets a message describing the converted metadata instead of the input one
                out = new MetadataUpdateControlMessage<SO, DO>(this.outputGlobalMetadata);
            }
            return Flowable.just(((ControlMessage<DO>) out));
        } else if (in instanceof RecordEnvelope) {
            RecordEnvelope<DI> recordEnvelope = (RecordEnvelope<DI>) in;
            // Despite its name this is an Iterator; it is consumed incrementally below.
            Iterator<DO> convertedIterable = convertRecord(this.outputGlobalMetadata.getSchema(), recordEnvelope.getRecord(), workUnitState).iterator();
            if (!convertedIterable.hasNext()) {
                // if the iterable is empty, ack the record, return an empty flowable
                in.ack();
                return Flowable.empty();
            }
            DO firstRecord = convertedIterable.next();
            if (!convertedIterable.hasNext()) {
                // if the iterable has only one element, use RecordEnvelope.withRecord, which is more efficient
                return Flowable.just(recordEnvelope.withRecord(firstRecord));
            } else {
                // if the iterable has multiple records, use a ForkRecordBuilder
                RecordEnvelope<DI>.ForkRecordBuilder<DO> forkRecordBuilder = recordEnvelope.forkRecordBuilder();
                // firstRecord was already pulled from the iterator, so it is emitted first and the rest of the
                // iterator is appended; the builder is closed once all child records have been emitted.
                return Flowable.just(firstRecord).concatWith(Flowable.fromIterable(() -> convertedIterable)).map(forkRecordBuilder::childRecord).doOnComplete(forkRecordBuilder::close);
            }
        } else {
            throw new UnsupportedOperationException();
        }
    // NOTE(review): the trailing 1 is presumably flatMap's maxConcurrency, keeping the inner streams
    // sequential — confirm against the RxJava Flowable.flatMap overloads.
    }, 1);
    // close this converter when the converted stream completes
    outputStream = outputStream.doOnComplete(this::close);
    return inputStream.withRecordStream(outputStream, this.outputGlobalMetadata);
}
Also used : RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) StreamEntity(org.apache.gobblin.stream.StreamEntity) MetadataUpdateControlMessage(org.apache.gobblin.stream.MetadataUpdateControlMessage) Iterator(java.util.Iterator) MetadataUpdateControlMessage(org.apache.gobblin.stream.MetadataUpdateControlMessage) ControlMessage(org.apache.gobblin.stream.ControlMessage)

Example 10 with RecordEnvelope

use of org.apache.gobblin.stream.RecordEnvelope in project incubator-gobblin by apache.

the class ForkerTest method test.

/**
 * Forks a single input stream into three branches and checks which branches receive each entity pushed
 * into the source. Judging by the assertions, the three bytes of a record act as a per-branch switch
 * (1 = delivered to that branch) — confirm against MyForkOperator. Control messages are expected on
 * every branch.
 */
@Test
public void test() throws Exception {
    // Source whose subscriber is driven by hand further down.
    MyFlowable<StreamEntity<byte[]>> source = new MyFlowable<>();
    RecordStreamWithMetadata<byte[], String> inputStream = new RecordStreamWithMetadata<>(source, GlobalMetadata.<String>builder().schema("schema").build());

    WorkUnitState state = new WorkUnitState();
    state.setProp(ConfigurationKeys.FORK_BRANCHES_KEY, "3");

    Forker underTest = new Forker();
    Forker.ForkedStream<byte[], String> forked = underTest.forkStream(inputStream, new MyForkOperator(), state);
    Assert.assertEquals(forked.getForkedStreams().size(), 3);

    // One queue per branch, collecting everything that branch emits.
    Queue<StreamEntity<byte[]>> branch0 = new LinkedList<>();
    Queue<StreamEntity<byte[]>> branch1 = new LinkedList<>();
    Queue<StreamEntity<byte[]>> branch2 = new LinkedList<>();
    forked.getForkedStreams().get(0).getRecordStream().subscribe(branch0::add);
    forked.getForkedStreams().get(1).getRecordStream().subscribe(branch1::add);
    forked.getForkedStreams().get(2).getRecordStream().subscribe(branch2::add);

    // { 1, 1, 1 }: every branch gets the record
    source._subscriber.onNext(new RecordEnvelope<>(new byte[] { 1, 1, 1 }));
    Assert.assertTrue(branch0.poll() instanceof RecordEnvelope);
    Assert.assertTrue(branch1.poll() instanceof RecordEnvelope);
    Assert.assertTrue(branch2.poll() instanceof RecordEnvelope);

    // { 1, 0, 0 }: only the first branch gets the record
    source._subscriber.onNext(new RecordEnvelope<>(new byte[] { 1, 0, 0 }));
    Assert.assertTrue(branch0.poll() instanceof RecordEnvelope);
    Assert.assertNull(branch1.poll());
    Assert.assertNull(branch2.poll());

    // { 0, 1, 1 }: only the second and third branches get the record
    source._subscriber.onNext(new RecordEnvelope<>(new byte[] { 0, 1, 1 }));
    Assert.assertNull(branch0.poll());
    Assert.assertTrue(branch1.poll() instanceof RecordEnvelope);
    Assert.assertTrue(branch2.poll() instanceof RecordEnvelope);

    // control messages reach every branch
    source._subscriber.onNext(new BasicTestControlMessage<byte[]>("control"));
    Assert.assertTrue(branch0.poll() instanceof BasicTestControlMessage);
    Assert.assertTrue(branch1.poll() instanceof BasicTestControlMessage);
    Assert.assertTrue(branch2.poll() instanceof BasicTestControlMessage);

    source._subscriber.onComplete();
}
Also used : RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) RecordStreamWithMetadata(org.apache.gobblin.records.RecordStreamWithMetadata) StreamEntity(org.apache.gobblin.stream.StreamEntity) LinkedList(java.util.LinkedList) BasicTestControlMessage(org.apache.gobblin.runtime.BasicTestControlMessage) Test(org.testng.annotations.Test)

Aggregations

RecordEnvelope (org.apache.gobblin.stream.RecordEnvelope)23 Test (org.testng.annotations.Test)13 State (org.apache.gobblin.configuration.State)7 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)7 IOException (java.io.IOException)6 ArrayList (java.util.ArrayList)5 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)4 RecordStreamWithMetadata (org.apache.gobblin.records.RecordStreamWithMetadata)4 LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)4 FinalState (org.apache.gobblin.util.FinalState)4 CheckpointableWatermark (org.apache.gobblin.source.extractor.CheckpointableWatermark)3 List (java.util.List)2 Properties (java.util.Properties)2 BasicAckableForTesting (org.apache.gobblin.ack.BasicAckableForTesting)2 DataConversionException (org.apache.gobblin.converter.DataConversionException)2 NonTransientException (org.apache.gobblin.exception.NonTransientException)2 TaskPublisher (org.apache.gobblin.publisher.TaskPublisher)2 RowLevelPolicyCheckResults (org.apache.gobblin.qualitychecker.row.RowLevelPolicyCheckResults)2 RowLevelPolicyChecker (org.apache.gobblin.qualitychecker.row.RowLevelPolicyChecker)2 TaskLevelPolicyCheckResults (org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults)2