Use of org.apache.gobblin.stream.RecordEnvelope in project incubator-gobblin by apache.
The class KafkaSimpleStreamingTest, method testThreadedExtractor.
/**
 * testThreadedExtractor verifies that it is safe to call close() from a different thread while the original
 * thread is blocked in poll(). We create a topic and then, in a side thread, wait for the extractor to return
 * a record (which it never does). The main thread calls close() on the extractor and verifies that the waiting
 * thread receives the expected exception and exits.
 */
@Test(timeOut = 10000)
public void testThreadedExtractor() {
  final String topic = "testThreadedExtractor";
  final KafkaSimpleStreamingExtractor<String, byte[]> kSSE = getStreamingExtractor(topic);
  Thread waitingThread = new Thread() {
    public void run() {
      try {
        // blocks in poll() until close() on the main thread wakes the consumer
        RecordEnvelope<byte[]> record = kSSE.readRecordEnvelope();
      } catch (Exception e) {
        Assert.assertTrue((e instanceof WakeupException) || (e instanceof ClosedChannelException));
      }
    }
  };
  waitingThread.start();
  try {
    kSSE.close();
    waitingThread.join();
  } catch (Exception e) {
    // close() and join() are not expected to throw here
    throw new Error(e);
  }
}
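What makes the cross-thread close safe is the raw Kafka consumer contract: wakeup() is the only KafkaConsumer method that may be called from another thread, and it forces a blocked poll() to throw WakeupException. A minimal sketch of that underlying behavior with the plain Kafka client (broker address, topic, and group id are placeholders):

// Minimal sketch: a poll() blocked in one thread is released by wakeup() from another.
Properties props = new Properties();
props.put("bootstrap.servers", "localhost:9092"); // placeholder broker
props.put("group.id", "wakeup-sketch"); // placeholder group
props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
props.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
final KafkaConsumer<String, byte[]> consumer = new KafkaConsumer<>(props);
consumer.subscribe(Collections.singletonList("someTopic")); // placeholder topic
Thread poller = new Thread(() -> {
  try {
    consumer.poll(10000); // blocks; nothing is ever produced (newer clients take a Duration)
  } catch (WakeupException expected) {
    // wakeup() from the other thread lands here
  } finally {
    consumer.close();
  }
});
poller.start();
consumer.wakeup(); // the one thread-safe KafkaConsumer method
poller.join();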
Use of org.apache.gobblin.stream.RecordEnvelope in project incubator-gobblin by apache.
The class KafkaSimpleStreamingTest, method testExtractor.
/**
 * testExtractor exercises the extractor end to end. First it creates a topic and sets up a source pointing
 * to it; work units are generated from the source (only a single work unit should be returned). It then writes
 * a record to the topic and reads it back through the extractor to verify the right record is returned. A second
 * record is written and read back through the same extractor to verify poll works as expected. Finally, the
 * commit API is tested by forcing a commit and then starting a new extractor to ensure we fetch data from after
 * the commit. The commit is also verified directly in Kafka.
 * @throws IOException
 * @throws InterruptedException
 * @throws DataRecordException
 */
@Test(timeOut = 10000)
public void testExtractor() throws IOException, InterruptedException, DataRecordException {
  final String topic = "testSimpleStreamingExtractor";
  _kafkaTestHelper.provisionTopic(topic);
  Properties props = new Properties();
  props.put("bootstrap.servers", "localhost:" + _kafkaTestHelper.getKafkaServerPort());
  props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
  props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
  Producer<String, byte[]> producer = new KafkaProducer<>(props);
  final byte[] record_1 = { 0, 1, 3 };
  final byte[] record_2 = { 2, 4, 6 };
  final byte[] record_3 = { 5, 7, 9 };
  // Write a sample record to the topic
  producer.send(new ProducerRecord<String, byte[]>(topic, topic, record_1));
  producer.flush();
  KafkaSimpleStreamingExtractor<String, byte[]> kSSE = getStreamingExtractor(topic);
  Map<String, CheckpointableWatermark> committedWatermarks = new HashMap<>();
  WatermarkStorage mockWatermarkStorage = mock(WatermarkStorage.class);
  when(mockWatermarkStorage.getCommittedWatermarks(any(Class.class), any(Iterable.class)))
      .thenReturn(committedWatermarks);
  kSSE.start(mockWatermarkStorage);
  // read the first record and verify it matches what we just wrote
  RecordEnvelope<byte[]> record = kSSE.readRecordEnvelope();
  Assert.assertEquals(record.getRecord(), record_1);
  // write a second record
  producer.send(new ProducerRecord<String, byte[]>(topic, topic, record_2));
  producer.flush();
  // read the second record through the same extractor to verify poll works as expected
  record = kSSE.readRecordEnvelope();
  Assert.assertEquals(record.getRecord(), record_2);
  // commit the watermark
  committedWatermarks.put(record.getWatermark().getSource(), record.getWatermark());
  // write a third record
  producer.send(new ProducerRecord<String, byte[]>(topic, topic, record_3));
  producer.flush();
  // recreate the extractor to force a seek to the committed watermark
  kSSE = getStreamingExtractor(topic);
  kSSE.start(mockWatermarkStorage);
  record = kSSE.readRecordEnvelope();
  // check that it matches the data written after the commit
  Assert.assertEquals(record.getRecord(), record_3);
}
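The javadoc mentions verifying the commit directly in Kafka; that verification is not part of this excerpt. One hedged way to do it is to ask a consumer in the same group for the committed offset of the partition. In this sketch the group id is a placeholder that would have to match the extractor's consumer group, and the expected value assumes the commit records the next offset to consume after record_2:

Properties checkProps = new Properties();
checkProps.put("bootstrap.servers", "localhost:" + _kafkaTestHelper.getKafkaServerPort());
checkProps.put("group.id", "gobblin-group"); // placeholder: must match the extractor's group
checkProps.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
checkProps.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
try (KafkaConsumer<String, byte[]> checker = new KafkaConsumer<>(checkProps)) {
  OffsetAndMetadata committed = checker.committed(new TopicPartition(topic, 0));
  // record_1 and record_2 were consumed before the commit, so the next offset to read is 2
  Assert.assertEquals(committed.offset(), 2L);
}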
Use of org.apache.gobblin.stream.RecordEnvelope in project incubator-gobblin by apache.
The class Fork, method processRecord.
protected void processRecord(Object record) throws IOException, DataConversionException {
  // atomically test whether this fork has already failed
  if (this.forkState.compareAndSet(ForkState.FAILED, ForkState.FAILED)) {
    throw new IllegalStateException(String.format("Fork %d of task %s has failed and is no longer running", this.index, this.taskId));
  }
  if (record == null || record == SHUTDOWN_RECORD) {
    /*
     * A null record indicates a timeout on record acquisition; SHUTDOWN_RECORD is sent during shutdown.
     * Keep looping unless the parent task has indicated that it is already done pulling records.
     */
    if (this.parentTaskDone) {
      return;
    }
  } else {
    if (isStreamingMode()) {
      // Unpack the record from its container
      RecordEnvelope recordEnvelope = (RecordEnvelope) record;
      // Convert the record, check its data quality, and write it out if quality checking passes.
      for (Object convertedRecord : this.converter.convertRecord(this.convertedSchema, recordEnvelope.getRecord(), this.taskState)) {
        if (this.rowLevelPolicyChecker.executePolicies(convertedRecord, this.rowLevelPolicyCheckingResult)) {
          // for each additional record we pass down, increment the acks needed
          ((WatermarkAwareWriter) this.writer.get()).writeEnvelope(recordEnvelope.withRecord(convertedRecord));
        }
      }
      // ack this fork's processing as done
      recordEnvelope.ack();
    } else {
      buildWriterIfNotPresent();
      // Convert the record, check its data quality, and write it out if quality checking passes.
      for (Object convertedRecord : this.converter.convertRecord(this.convertedSchema, record, this.taskState)) {
        if (this.rowLevelPolicyChecker.executePolicies(convertedRecord, this.rowLevelPolicyCheckingResult)) {
          this.writer.get().writeEnvelope(new RecordEnvelope<>(convertedRecord));
        }
      }
    }
  }
}
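The streaming branch above relies on reference-counted acknowledgment: when one RecordEnvelope fans out into several converted records, the source record must not be acked upstream until every derived record has been handled. A simplified, self-contained illustration of that bookkeeping (a sketch of the idea, not Gobblin's actual RecordEnvelope implementation):

import java.util.concurrent.atomic.AtomicInteger;

/** Simplified stand-in for derived-record ack accounting. */
class CountedAck {
  private final AtomicInteger pending;
  private final Runnable onFullyAcked;

  CountedAck(int children, Runnable onFullyAcked) {
    this.pending = new AtomicInteger(children); // one pending ack per derived record
    this.onFullyAcked = onFullyAcked;
  }

  /** Called once per derived record when the writer finishes with it. */
  void ackChild() {
    if (pending.decrementAndGet() == 0) {
      onFullyAcked.run(); // only now may the source record's watermark be checkpointed
    }
  }
}

Only when the count drains to zero is the watermark carried by the original envelope eligible to be committed.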
Use of org.apache.gobblin.stream.RecordEnvelope in project incubator-gobblin by apache.
The class Converter, method processStream.
/**
 * Apply conversions to the input {@link RecordStreamWithMetadata}.
 */
@Override
public RecordStreamWithMetadata<DO, SO> processStream(RecordStreamWithMetadata<DI, SI> inputStream, WorkUnitState workUnitState)
    throws SchemaConversionException {
  init(workUnitState);
  this.outputGlobalMetadata = GlobalMetadata.<SI, SO>builderWithInput(inputStream.getGlobalMetadata(),
      Optional.fromNullable(convertSchema(inputStream.getGlobalMetadata().getSchema(), workUnitState))).build();
  Flowable<StreamEntity<DO>> outputStream = inputStream.getRecordStream().flatMap(in -> {
    if (in instanceof ControlMessage) {
      ControlMessage out = (ControlMessage) in;
      getMessageHandler().handleMessage((ControlMessage) in);
      // update the output schema with the new input schema from the MetadataUpdateControlMessage
      if (in instanceof MetadataUpdateControlMessage) {
        this.outputGlobalMetadata = GlobalMetadata.<SI, SO>builderWithInput(((MetadataUpdateControlMessage) in).getGlobalMetadata(),
            Optional.fromNullable(convertSchema((SI) ((MetadataUpdateControlMessage) in).getGlobalMetadata().getSchema(), workUnitState))).build();
        out = new MetadataUpdateControlMessage<SO, DO>(this.outputGlobalMetadata);
      }
      return Flowable.just(((ControlMessage<DO>) out));
    } else if (in instanceof RecordEnvelope) {
      RecordEnvelope<DI> recordEnvelope = (RecordEnvelope<DI>) in;
      Iterator<DO> convertedIterable = convertRecord(this.outputGlobalMetadata.getSchema(), recordEnvelope.getRecord(), workUnitState).iterator();
      if (!convertedIterable.hasNext()) {
        // the iterable is empty: ack the record and return an empty flowable
        in.ack();
        return Flowable.empty();
      }
      DO firstRecord = convertedIterable.next();
      if (!convertedIterable.hasNext()) {
        // the iterable has exactly one element: use RecordEnvelope.withRecord, which is more efficient
        return Flowable.just(recordEnvelope.withRecord(firstRecord));
      } else {
        // the iterable has multiple records: use a ForkRecordBuilder
        RecordEnvelope<DI>.ForkRecordBuilder<DO> forkRecordBuilder = recordEnvelope.forkRecordBuilder();
        return Flowable.just(firstRecord).concatWith(Flowable.fromIterable(() -> convertedIterable))
            .map(forkRecordBuilder::childRecord).doOnComplete(forkRecordBuilder::close);
      }
    } else {
      throw new UnsupportedOperationException();
    }
  }, 1);
  outputStream = outputStream.doOnComplete(this::close);
  return inputStream.withRecordStream(outputStream, this.outputGlobalMetadata);
}
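The three branches in the RecordEnvelope case map directly onto what convertRecord returns: an empty Iterable (ack and drop), a single element (the withRecord fast path), or several elements (the ForkRecordBuilder path). A hypothetical convertRecord body that exercises all three, matching the (schema, record, workUnitState) shape used in the call above; the splitting logic is purely illustrative, not from Gobblin:

// Illustrative only: splits a comma-separated line into output records.
// - blank input -> empty Iterable -> the envelope is acked and Flowable.empty() is returned
// - one field   -> one element    -> RecordEnvelope.withRecord fast path
// - many fields -> many elements  -> ForkRecordBuilder path
public Iterable<String> convertRecord(String outputSchema, String inputRecord, WorkUnitState workUnit) {
  if (inputRecord.trim().isEmpty()) {
    return Collections.emptyList();
  }
  return Arrays.asList(inputRecord.split(","));
}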
Use of org.apache.gobblin.stream.RecordEnvelope in project incubator-gobblin by apache.
The class ForkerTest, method test.
@Test
public void test() throws Exception {
  Forker forker = new Forker();
  MyFlowable<StreamEntity<byte[]>> flowable = new MyFlowable<>();
  RecordStreamWithMetadata<byte[], String> stream =
      new RecordStreamWithMetadata<>(flowable, GlobalMetadata.<String>builder().schema("schema").build());
  WorkUnitState workUnitState = new WorkUnitState();
  workUnitState.setProp(ConfigurationKeys.FORK_BRANCHES_KEY, "3");
  Forker.ForkedStream<byte[], String> forkedStream = forker.forkStream(stream, new MyForkOperator(), workUnitState);
  Assert.assertEquals(forkedStream.getForkedStreams().size(), 3);
  Queue<StreamEntity<byte[]>> output0 = new LinkedList<>();
  forkedStream.getForkedStreams().get(0).getRecordStream().subscribe(output0::add);
  Queue<StreamEntity<byte[]>> output1 = new LinkedList<>();
  forkedStream.getForkedStreams().get(1).getRecordStream().subscribe(output1::add);
  Queue<StreamEntity<byte[]>> output2 = new LinkedList<>();
  forkedStream.getForkedStreams().get(2).getRecordStream().subscribe(output2::add);
  // {1, 1, 1}: every branch receives the record
  flowable._subscriber.onNext(new RecordEnvelope<>(new byte[] { 1, 1, 1 }));
  Assert.assertTrue(output0.poll() instanceof RecordEnvelope);
  Assert.assertTrue(output1.poll() instanceof RecordEnvelope);
  Assert.assertTrue(output2.poll() instanceof RecordEnvelope);
  // {1, 0, 0}: only branch 0 receives the record
  flowable._subscriber.onNext(new RecordEnvelope<>(new byte[] { 1, 0, 0 }));
  Assert.assertTrue(output0.poll() instanceof RecordEnvelope);
  Assert.assertNull(output1.poll());
  Assert.assertNull(output2.poll());
  // {0, 1, 1}: branches 1 and 2 receive the record, branch 0 does not
  flowable._subscriber.onNext(new RecordEnvelope<>(new byte[] { 0, 1, 1 }));
  Assert.assertNull(output0.poll());
  Assert.assertTrue(output1.poll() instanceof RecordEnvelope);
  Assert.assertTrue(output2.poll() instanceof RecordEnvelope);
  // control messages are broadcast to all branches
  flowable._subscriber.onNext(new BasicTestControlMessage<byte[]>("control"));
  Assert.assertTrue(output0.poll() instanceof BasicTestControlMessage);
  Assert.assertTrue(output1.poll() instanceof BasicTestControlMessage);
  Assert.assertTrue(output2.poll() instanceof BasicTestControlMessage);
  flowable._subscriber.onComplete();
}
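MyForkOperator itself is not shown, but the assertions pin down its routing rule: byte i of the record gates branch i, while control messages are broadcast to every branch. A sketch of a fork decision consistent with those assertions (Gobblin's ForkOperator interface details are elided; only the routing rule is shown):

import java.util.ArrayList;
import java.util.List;

// Routing rule implied by the test: a non-zero byte at index i sends the record to branch i.
static List<Boolean> forkDecision(byte[] record, int branches) {
  List<Boolean> decision = new ArrayList<>(branches);
  for (int i = 0; i < branches; i++) {
    decision.add(record[i] != 0);
  }
  return decision;
}

For example, forkDecision(new byte[] { 1, 0, 0 }, 3) yields [true, false, false], matching the second record's delivery to branch 0 only.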