Search in sources :

Example 11 with LongWatermark

use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.

the class KafkaSimpleStreamingTest method testThreadedExtractor.

/**
 * Verifies that it is safe to call {@code close()} on the extractor from one thread while another thread is
 * inside {@code readRecordEnvelope()}.
 *
 * <p>A side thread asks the extractor for a record; this test never writes anything to the topic. The test
 * thread then closes the extractor and joins the side thread. If the read throws, the side thread checks that
 * the exception is a {@link WakeupException} or a {@link ClosedChannelException}.
 *
 * <p>The {@code timeOut} on the annotation bounds the whole test, so a reader that never unblocks shows up as a
 * test timeout.
 */
@Test(timeOut = 10000)
public void testThreadedExtractor() {
    final String topic = "testThreadedExtractor";
    final KafkaSimpleStreamingExtractor<String, byte[]> kSSE = getStreamingExtractor(topic);
    Thread waitingThread = new Thread() {

        @Override
        public void run() {
            try {
                // The returned envelope is irrelevant here; only the way the call terminates matters.
                kSSE.readRecordEnvelope();
            } catch (Exception e) {
                // NOTE(review): an AssertionError raised here terminates only this side thread; it is not
                // propagated to the test thread, so by itself it cannot fail the test. A normal return from
                // readRecordEnvelope() is not checked either. Confirm whether that leniency is intended.
                Assert.assertTrue((e instanceof WakeupException) || (e instanceof ClosedChannelException));
            }
        }
    };
    waitingThread.start();
    try {
        kSSE.close();
        waitingThread.join();
    } catch (Exception e) {
        // should never come here
        if (e instanceof InterruptedException) {
            // Keep the interrupt visible to the test runner instead of silently clearing it.
            Thread.currentThread().interrupt();
        }
        throw new AssertionError("Closing the extractor or joining the reader thread failed", e);
    }
}
Also used : ClosedChannelException(java.nio.channels.ClosedChannelException) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) WakeupException(org.apache.kafka.common.errors.WakeupException) ClosedChannelException(java.nio.channels.ClosedChannelException) WakeupException(org.apache.kafka.common.errors.WakeupException) IOException(java.io.IOException) DataRecordException(org.apache.gobblin.source.extractor.DataRecordException) KafkaSimpleStreamingExtractor(org.apache.gobblin.source.extractor.extract.kafka.KafkaSimpleStreamingExtractor) TopicPartition(org.apache.kafka.common.TopicPartition) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test)

Example 12 with LongWatermark

use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.

the class KafkaSimpleStreamingTest method testExtractor.

/**
 * End-to-end check of the streaming extractor against the test Kafka instance.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Provision the topic and produce a first record.</li>
 *   <li>Start an extractor (obtained from {@code getStreamingExtractor}) with a mocked
 *       {@link WatermarkStorage} that returns an initially empty committed-watermark map, and verify the first
 *       record is read back.</li>
 *   <li>Produce a second record and verify the same extractor returns it.</li>
 *   <li>Simulate a commit by putting the second record's watermark into the map served by the mock.</li>
 *   <li>Produce a third record, start a fresh extractor against the same mock, and verify that the first
 *       record it returns is the third one, i.e. it resumed after the committed watermark.</li>
 * </ol>
 *
 * @throws IOException declared by the test; propagated from the extractor calls
 * @throws InterruptedException declared by the test; propagated from the calls it makes
 * @throws DataRecordException declared by the test; propagated from the extractor read calls
 */
@Test(timeOut = 10000)
public void testExtractor() throws IOException, InterruptedException, DataRecordException {
    final String topic = "testSimpleStreamingExtractor";
    _kafkaTestHelper.provisionTopic(topic);
    Properties props = new Properties();
    props.put("bootstrap.servers", "localhost:" + _kafkaTestHelper.getKafkaServerPort());
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
    props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
    final byte[] record_1 = { 0, 1, 3 };
    final byte[] record_2 = { 2, 4, 6 };
    final byte[] record_3 = { 5, 7, 9 };
    // try-with-resources guarantees the producer is closed even when an assertion below fails
    try (Producer<String, byte[]> producer = new KafkaProducer<>(props)) {
        // Write a sample record to the topic
        producer.send(new ProducerRecord<String, byte[]>(topic, topic, record_1));
        producer.flush();
        KafkaSimpleStreamingExtractor<String, byte[]> kSSE = getStreamingExtractor(topic);
        // The mock hands out this map by reference, so later puts are visible to extractors started afterwards
        Map<String, CheckpointableWatermark> committedWatermarks = new HashMap<>();
        WatermarkStorage mockWatermarkStorage = mock(WatermarkStorage.class);
        when(mockWatermarkStorage.getCommittedWatermarks(any(Class.class), any(Iterable.class))).thenReturn(committedWatermarks);
        kSSE.start(mockWatermarkStorage);
        // read and verify the record matches what we just wrote
        RecordEnvelope<byte[]> record = kSSE.readRecordEnvelope();
        Assert.assertEquals(record.getRecord(), record_1);
        // write a second record.
        producer.send(new ProducerRecord<String, byte[]>(topic, topic, record_2));
        producer.flush();
        // read the second record using the same extractor to verify it matches what's expected
        record = kSSE.readRecordEnvelope();
        Assert.assertEquals(record.getRecord(), record_2);
        // Commit the watermark
        committedWatermarks.put(record.getWatermark().getSource(), record.getWatermark());
        // write a third record.
        producer.send(new ProducerRecord<String, byte[]>(topic, topic, record_3));
        producer.flush();
        // recreate extractor to force a seek.
        kSSE = getStreamingExtractor(topic);
        kSSE.start(mockWatermarkStorage);
        record = kSSE.readRecordEnvelope();
        // check it matches the data written
        Assert.assertEquals(record.getRecord(), record_3);
    }
}
Also used : KafkaProducer(org.apache.kafka.clients.producer.KafkaProducer) WatermarkStorage(org.apache.gobblin.writer.WatermarkStorage) RecordEnvelope(org.apache.gobblin.stream.RecordEnvelope) HashMap(java.util.HashMap) Properties(java.util.Properties) KafkaSimpleStreamingExtractor(org.apache.gobblin.source.extractor.extract.kafka.KafkaSimpleStreamingExtractor) TopicPartition(org.apache.kafka.common.TopicPartition) CheckpointableWatermark(org.apache.gobblin.source.extractor.CheckpointableWatermark) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test)

Example 13 with LongWatermark

use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.

the class PartitionLevelWatermarkerTest method testNoPreviousWatermarkWorkunits.

/**
 * Feeds the watermarker two previous work unit states: one flagged with
 * {@code IS_WATERMARK_WORKUNIT_KEY} and carrying a {@code MultiKeyValueLongWatermark}, and one without the flag
 * carrying a plain {@code LongWatermark}. Expects exactly one previous-watermark entry, stored under the flagged
 * state's dataset URN.
 */
@Test
public void testNoPreviousWatermarkWorkunits() throws Exception {
    final ImmutableMap<String, Long> partitionWatermarks = ImmutableMap.of("2015", 100L);

    // Previous state that is explicitly marked as a watermark work unit
    WorkUnitState flaggedState = new WorkUnitState();
    flaggedState.setProp(ConfigurationKeys.DATASET_URN_KEY, "test_dataset_urn");
    flaggedState.setProp(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY, true);
    flaggedState.setActualHighWatermark(new MultiKeyValueLongWatermark(partitionWatermarks));

    // Previous state where the marker property is left unset
    WorkUnitState unflaggedState = new WorkUnitState();
    unflaggedState.setProp(ConfigurationKeys.DATASET_URN_KEY, "test_dataset_urn2");
    unflaggedState.setActualHighWatermark(new LongWatermark(101L));

    SourceState sourceState = new SourceState(new State(), Lists.newArrayList(flaggedState, unflaggedState));
    PartitionLevelWatermarker watermarker = new PartitionLevelWatermarker(sourceState);

    Assert.assertEquals(watermarker.getPreviousWatermarks().size(), 1);
    Assert.assertEquals(watermarker.getPreviousWatermarks().get("test_dataset_urn"), partitionWatermarks);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) SourceState(org.apache.gobblin.configuration.SourceState) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test)

Example 14 with LongWatermark

use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.

the class TableLevelWatermarkerTest method testPartitionWatermarks.

/**
 * Checks that the table itself and each of its partitions report the same previous high watermark, namely the
 * one recorded for the table in the previous work unit state.
 */
@Test
public void testPartitionWatermarks() throws Exception {
    final String tableName = "test_table";

    WorkUnitState priorState = new WorkUnitState();
    priorState.setProp(ConfigurationKeys.DATASET_URN_KEY, tableName);
    priorState.setActualHighWatermark(new LongWatermark(100L));

    SourceState sourceState = new SourceState(new State(), Lists.newArrayList(priorState));
    TableLevelWatermarker watermarker = new TableLevelWatermarker(sourceState);

    Table table = mockTable(tableName);
    LongWatermark expected = new LongWatermark(100L);

    // The table-level lookup comes first, then one lookup per partition value
    Assert.assertEquals(watermarker.getPreviousHighWatermark(table), expected);
    for (String partitionValue : new String[] { "2015", "2016" }) {
        Assert.assertEquals(watermarker.getPreviousHighWatermark(mockPartition(table, ImmutableList.of(partitionValue))), expected);
    }
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) Table(org.apache.hadoop.hive.ql.metadata.Table) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) SourceState(org.apache.gobblin.configuration.SourceState) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test)

Example 15 with LongWatermark

use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by apache.

the class BackfillHiveSourceTest method testWhitelist.

/**
 * Configures a {@code BackfillHiveSource} with a two-entry partition whitelist and checks that
 * {@code shouldCreateWorkunit} returns true for partitions whose complete name is one of the entries and false
 * for a partition whose complete name is not.
 */
@Test
public void testWhitelist() throws Exception {
    final String listedFirst = "service@logEvent@datepartition=2016-08-04-00";
    final String listedSecond = "service@logEvent@datepartition=2016-08-05-00";
    final String unlisted = "service@logEvent@datepartition=2016-08-06-00";

    SourceState sourceState = new SourceState();
    sourceState.setProp(BackfillHiveSource.BACKFILL_SOURCE_PARTITION_WHITELIST_KEY, listedFirst + "," + listedSecond);
    BackfillHiveSource source = new BackfillHiveSource();
    source.initBackfillHiveSource(sourceState);

    // Mocked partitions only need to answer getCompleteName()
    Partition acceptedFirst = Mockito.mock(Partition.class, Mockito.RETURNS_SMART_NULLS);
    Partition acceptedSecond = Mockito.mock(Partition.class, Mockito.RETURNS_SMART_NULLS);
    Partition rejected = Mockito.mock(Partition.class, Mockito.RETURNS_SMART_NULLS);
    Mockito.when(acceptedFirst.getCompleteName()).thenReturn(listedFirst);
    Mockito.when(acceptedSecond.getCompleteName()).thenReturn(listedSecond);
    Mockito.when(rejected.getCompleteName()).thenReturn(unlisted);

    Assert.assertTrue(source.shouldCreateWorkunit(acceptedFirst, new LongWatermark(0)));
    Assert.assertTrue(source.shouldCreateWorkunit(acceptedSecond, new LongWatermark(0)));
    Assert.assertFalse(source.shouldCreateWorkunit(rejected, new LongWatermark(0)));
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) SourceState(org.apache.gobblin.configuration.SourceState) BackfillHiveSource(org.apache.gobblin.data.management.conversion.hive.source.BackfillHiveSource) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test)

Aggregations

LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)35 Test (org.testng.annotations.Test)16 DefaultCheckpointableWatermark (org.apache.gobblin.source.extractor.DefaultCheckpointableWatermark)12 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)10 CheckpointableWatermark (org.apache.gobblin.source.extractor.CheckpointableWatermark)9 SourceState (org.apache.gobblin.configuration.SourceState)7 State (org.apache.gobblin.configuration.State)7 WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval)6 IOException (java.io.IOException)5 RecordEnvelope (org.apache.gobblin.stream.RecordEnvelope)5 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)4 Partition (org.apache.hadoop.hive.ql.metadata.Partition)4 Random (java.util.Random)3 TreeSet (java.util.TreeSet)3 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)3 ComparableWatermark (org.apache.gobblin.source.extractor.ComparableWatermark)3 Path (org.apache.hadoop.fs.Path)3 Benchmark (org.openjdk.jmh.annotations.Benchmark)3 Group (org.openjdk.jmh.annotations.Group)3 Config (com.typesafe.config.Config)2