Search in sources:

Example 1 with WatermarkStorage

use of org.apache.gobblin.writer.WatermarkStorage in project incubator-gobblin by apache.

the class KafkaSimpleStreamingTest method testExtractor.

/**
 * Checks that the streaming extractor returns the records written to a topic and that a freshly started
 * extractor resumes after the last committed watermark.
 *
 * <p>Steps performed:
 * <ol>
 *   <li>Provision a topic and publish a first record to it.</li>
 *   <li>Start an extractor against a mocked {@link WatermarkStorage} that initially reports no committed
 *       watermarks, and verify the first record is read back.</li>
 *   <li>Publish a second record and verify the same extractor returns it.</li>
 *   <li>Register the second record's watermark as committed in the map served by the mocked storage.</li>
 *   <li>Publish a third record, start a new extractor against the same storage and verify that the first
 *       record it returns is the third one.</li>
 * </ol>
 *
 * @throws IOException propagated from the extractor calls
 * @throws InterruptedException if the test thread is interrupted
 * @throws DataRecordException propagated from the extractor calls
 */
@Test(timeOut = 10000)
public void testExtractor() throws IOException, InterruptedException, DataRecordException {
    final String topic = "testSimpleStreamingExtractor";
    _kafkaTestHelper.provisionTopic(topic);
    Properties props = new Properties();
    props.put("bootstrap.servers", "localhost:" + _kafkaTestHelper.getKafkaServerPort());
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
    props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
    final byte[] record_1 = { 0, 1, 3 };
    final byte[] record_2 = { 2, 4, 6 };
    final byte[] record_3 = { 5, 7, 9 };
    // try-with-resources guarantees the producer is closed even when an assertion below fails
    try (Producer<String, byte[]> producer = new KafkaProducer<>(props)) {
        // Write a sample record to the topic
        producer.send(new ProducerRecord<String, byte[]>(topic, topic, record_1));
        producer.flush();
        KafkaSimpleStreamingExtractor<String, byte[]> kSSE = getStreamingExtractor(topic);
        // The mock always returns this same map instance, so the entry added further down is what the
        // second extractor receives when it asks for committed watermarks.
        Map<String, CheckpointableWatermark> committedWatermarks = new HashMap<>();
        WatermarkStorage mockWatermarkStorage = mock(WatermarkStorage.class);
        when(mockWatermarkStorage.getCommittedWatermarks(any(Class.class), any(Iterable.class))).thenReturn(committedWatermarks);
        kSSE.start(mockWatermarkStorage);
        // read and verify the record matches what we just wrote
        RecordEnvelope<byte[]> record = kSSE.readRecordEnvelope();
        Assert.assertEquals(record.getRecord(), record_1);
        // write a second record.
        producer.send(new ProducerRecord<String, byte[]>(topic, topic, record_2));
        producer.flush();
        // read the second record using the same extractor to verify it matches what is expected
        record = kSSE.readRecordEnvelope();
        Assert.assertEquals(record.getRecord(), record_2);
        // Commit the watermark
        committedWatermarks.put(record.getWatermark().getSource(), record.getWatermark());
        // write a third record.
        producer.send(new ProducerRecord<String, byte[]>(topic, topic, record_3));
        producer.flush();
        // recreate extractor to force a seek.
        kSSE = getStreamingExtractor(topic);
        kSSE.start(mockWatermarkStorage);
        record = kSSE.readRecordEnvelope();
        // check it matches the data written
        Assert.assertEquals(record.getRecord(), record_3);
    }
}
Also used : KafkaProducer(org.apache.kafka.clients.producer.KafkaProducer) WatermarkStorage(org.apache.gobblin.writer.WatermarkStorage) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) HashMap(java.util.HashMap) Properties(java.util.Properties) KafkaSimpleStreamingExtractor(org.apache.gobblin.source.extractor.extract.kafka.KafkaSimpleStreamingExtractor) TopicPartition(org.apache.kafka.common.TopicPartition) CheckpointableWatermark(org.apache.gobblin.source.extractor.CheckpointableWatermark) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test)

Example 2 with WatermarkStorage

use of org.apache.gobblin.writer.WatermarkStorage in project incubator-gobblin by apache.

the class TaskContinuousTest method testContinuousTaskOneRecord.

/**
 * Test that a streaming task will work correctly when the extractor only produces one record.
 *
 * <p>The task is run on a dedicated executor, polled once per second for two iterations while committed
 * watermarks are sanity-checked, then shut down. After shutdown the committed watermarks are validated
 * again and the writer-side collector is expected to hold exactly the single test record.
 *
 * @throws Exception if any step of the task lifecycle fails
 */
@Test
public void testContinuousTaskOneRecord() throws Exception {
    ArrayList<Object> recordCollector = new ArrayList<>(100);
    String testRecord = "hello";
    OneRecordExtractor oneRecordExtractor = new OneRecordExtractor(testRecord);
    TaskContext mockTaskContext = getMockTaskContext(recordCollector, oneRecordExtractor);
    // Create a mock TaskPublisher
    TaskPublisher mockTaskPublisher = mock(TaskPublisher.class);
    when(mockTaskPublisher.canPublish()).thenReturn(TaskPublisher.PublisherState.SUCCESS);
    when(mockTaskContext.getTaskPublisher(any(TaskState.class), any(TaskLevelPolicyCheckResults.class))).thenReturn(mockTaskPublisher);
    // Create a mock TaskStateTracker
    TaskStateTracker mockTaskStateTracker = mock(TaskStateTracker.class);
    // Create a TaskExecutor - a real TaskExecutor must be created so a Fork is run in a separate thread
    TaskExecutor taskExecutor = new TaskExecutor(new Properties());
    // Create the Task
    Task task = new Task(mockTaskContext, mockTaskStateTracker, taskExecutor, Optional.<CountDownLatch>absent());
    ScheduledExecutorService taskRunner = new ScheduledThreadPoolExecutor(1, ExecutorsUtils.newThreadFactory(Optional.of(log)));
    // Everything that uses the executor sits inside try/finally so the pool is shut down even when an
    // assertion fails or the task throws.
    try {
        taskRunner.execute(task);
        WatermarkStorage mockWatermarkStorage = mockTaskContext.getWatermarkStorage();
        Map<String, CheckpointableWatermark> externalWatermarkStorage;
        // Let the task run for 2 seconds, checking the committed watermarks once per second
        final int sleepIterations = 2;
        for (int currentIteration = 0; currentIteration < sleepIterations; currentIteration++) {
            Thread.sleep(1000);
            // NOTE(review): this lookup passes source "default" while the post-shutdown lookup passes "0";
            // confirm whether the storage returned by the task context ignores the requested sources.
            externalWatermarkStorage = mockWatermarkStorage.getCommittedWatermarks(CheckpointableWatermark.class, ImmutableList.of("default"));
            if (!externalWatermarkStorage.isEmpty()) {
                for (CheckpointableWatermark watermark : externalWatermarkStorage.values()) {
                    log.info("Observed committed watermark: {}", watermark);
                }
                log.info("Task progress: {}", task.getProgress());
                // Ensure that watermarks seem reasonable at each step
                Assert.assertTrue(oneRecordExtractor.validateWatermarks(false, externalWatermarkStorage));
            }
        }
        // Let's try to shutdown the task
        task.shutdown();
        log.info("Shutting down task now");
        boolean success = task.awaitShutdown(3000);
        Assert.assertTrue(success, "Task should shutdown in 3 seconds");
        log.info("Task done waiting to shutdown {}", success);
        externalWatermarkStorage = mockWatermarkStorage.getCommittedWatermarks(CheckpointableWatermark.class, ImmutableList.of("0"));
        // Ensure that committed watermarks match exactly the input rows because we shutdown in an orderly manner.
        Assert.assertTrue(oneRecordExtractor.validateWatermarks(true, externalWatermarkStorage));
        // Ensure that the record made it to the writer correctly
        Assert.assertEquals(recordCollector.size(), 1);
        Assert.assertEquals(recordCollector.get(0), testRecord);
        task.commit();
    } finally {
        // Shutdown the executor
        taskRunner.shutdown();
        taskRunner.awaitTermination(100, TimeUnit.MILLISECONDS);
    }
}
Also used : WatermarkStorage(org.apache.gobblin.writer.WatermarkStorage) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) TaskPublisher(org.apache.gobblin.publisher.TaskPublisher) ScheduledThreadPoolExecutor(java.util.concurrent.ScheduledThreadPoolExecutor) ArrayList(java.util.ArrayList) TaskLevelPolicyCheckResults(org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults) Properties(java.util.Properties) CheckpointableWatermark(org.apache.gobblin.source.extractor.CheckpointableWatermark) DefaultCheckpointableWatermark(org.apache.gobblin.source.extractor.DefaultCheckpointableWatermark) Test(org.testng.annotations.Test)

Example 3 with WatermarkStorage

use of org.apache.gobblin.writer.WatermarkStorage in project incubator-gobblin by apache.

the class TaskContinuousTest method getMockTaskContext.

/**
 * Builds a Mockito-backed {@link TaskContext} wired with the collaborators a streaming task asks for.
 *
 * @param recordCollector list handed to the {@code TestStreamingDataWriterBuilder} returned by the context
 * @param mockExtractor extractor returned for both the regular and the raw-source extractor lookups
 * @return the stubbed task context
 * @throws Exception if building any of the collaborators fails
 */
private TaskContext getMockTaskContext(ArrayList<Object> recordCollector, Extractor mockExtractor) throws Exception {
    TaskState streamingState = getStreamingTaskState();

    // Real (non-mock) collaborators: a row-level checker with an empty policy list and the in-test watermark storage
    RowLevelPolicyChecker rowChecker = new RowLevelPolicyChecker(Lists.newArrayList(), "stateId", FileSystem.getLocal(new Configuration()));
    WatermarkStorage watermarkStorage = new MockWatermarkStorage();

    // Publisher that always reports it can publish
    TaskPublisher publisher = mock(TaskPublisher.class);
    when(publisher.canPublish()).thenReturn(TaskPublisher.PublisherState.SUCCESS);

    TaskContext context = mock(TaskContext.class);

    // Task state and metrics
    when(context.getTaskState()).thenReturn(streamingState);
    when(context.getTaskMetrics()).thenReturn(TaskMetrics.get(streamingState));

    // Source side: the same extractor serves both lookups
    when(context.getExtractor()).thenReturn(mockExtractor);
    when(context.getRawSourceExtractor()).thenReturn(mockExtractor);
    when(context.getWatermarkStorage()).thenReturn(watermarkStorage);

    // Processing pipeline
    when(context.getForkOperator()).thenReturn(new IdentityForkOperator());
    when(context.getRowLevelPolicyChecker()).thenReturn(rowChecker);
    when(context.getRowLevelPolicyChecker(anyInt())).thenReturn(rowChecker);
    when(context.getTaskLevelPolicyChecker(any(TaskState.class), anyInt())).thenReturn(mock(TaskLevelPolicyChecker.class));

    // Sink side: writer builder feeding the caller's collector, and the publisher
    when(context.getDataWriterBuilder(anyInt(), anyInt())).thenReturn(new TestStreamingDataWriterBuilder(recordCollector));
    when(context.getTaskPublisher(any(TaskState.class), any(TaskLevelPolicyCheckResults.class))).thenReturn(publisher);

    return context;
}
Also used : IdentityForkOperator(org.apache.gobblin.fork.IdentityForkOperator) WatermarkStorage(org.apache.gobblin.writer.WatermarkStorage) TaskPublisher(org.apache.gobblin.publisher.TaskPublisher) Configuration(org.apache.hadoop.conf.Configuration) TaskLevelPolicyChecker(org.apache.gobblin.qualitychecker.task.TaskLevelPolicyChecker) RowLevelPolicyChecker(org.apache.gobblin.qualitychecker.row.RowLevelPolicyChecker) TaskLevelPolicyCheckResults(org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults)

Aggregations

WatermarkStorage (org.apache.gobblin.writer.WatermarkStorage) 3, Properties (java.util.Properties) 2, TaskPublisher (org.apache.gobblin.publisher.TaskPublisher) 2, TaskLevelPolicyCheckResults (org.apache.gobblin.qualitychecker.task.TaskLevelPolicyCheckResults) 2, CheckpointableWatermark (org.apache.gobblin.source.extractor.CheckpointableWatermark) 2, Test (org.testng.annotations.Test) 2, ArrayList (java.util.ArrayList) 1, HashMap (java.util.HashMap) 1, ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService) 1, ScheduledThreadPoolExecutor (java.util.concurrent.ScheduledThreadPoolExecutor) 1, IdentityForkOperator (org.apache.gobblin.fork.IdentityForkOperator) 1, RowLevelPolicyChecker (org.apache.gobblin.qualitychecker.row.RowLevelPolicyChecker) 1, TaskLevelPolicyChecker (org.apache.gobblin.qualitychecker.task.TaskLevelPolicyChecker) 1, DefaultCheckpointableWatermark (org.apache.gobblin.source.extractor.DefaultCheckpointableWatermark) 1, LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark) 1, KafkaSimpleStreamingExtractor (org.apache.gobblin.source.extractor.extract.kafka.KafkaSimpleStreamingExtractor) 1, RecordEnvelope (org.apache.gobblin.stream.RecordEnvelope) 1, Configuration (org.apache.hadoop.conf.Configuration) 1, KafkaProducer (org.apache.kafka.clients.producer.KafkaProducer) 1, TopicPartition (org.apache.kafka.common.TopicPartition) 1