Search in sources :

Example 11 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class KafkaSimpleStreamingTest method getWorkUnits.

/**
 * Configures a {@link SourceState} for the given topic and asks a
 * {@link KafkaSimpleStreamingSource} for its work units.
 *
 * <p>The broker address is {@code localhost} on the port reported by
 * {@code _kafkaTestHelper}; keys are deserialized as strings and values as byte arrays.
 *
 * @param topic value used for both the topic whitelist and the job name
 * @return the work units returned by {@code getWorkunits} for the configured state
 */
private List<WorkUnit> getWorkUnits(String topic) {
    final SourceState sourceState = new SourceState();

    // Broker connection.
    final String brokerAddress = "localhost:" + _kafkaTestHelper.getKafkaServerPort();
    sourceState.setProp(ConfigurationKeys.KAFKA_BROKERS, brokerAddress);

    // Topic selection and job identity both use the topic name.
    sourceState.setProp(KafkaSimpleStreamingSource.TOPIC_WHITELIST, topic);
    sourceState.setProp(ConfigurationKeys.JOB_NAME_KEY, topic);

    // Key/value deserializers matching the <String, byte[]> source below.
    final String keyDeserializer = "org.apache.kafka.common.serialization.StringDeserializer";
    final String valueDeserializer = "org.apache.kafka.common.serialization.ByteArrayDeserializer";
    sourceState.setProp(KafkaSimpleStreamingSource.TOPIC_KEY_DESERIALIZER, keyDeserializer);
    sourceState.setProp(KafkaSimpleStreamingSource.TOPIC_VALUE_DESERIALIZER, valueDeserializer);

    return new KafkaSimpleStreamingSource<String, byte[]>().getWorkunits(sourceState);
}
Also used : KafkaSimpleStreamingSource(org.apache.gobblin.source.extractor.extract.kafka.KafkaSimpleStreamingSource) SourceState(org.apache.gobblin.configuration.SourceState)

Example 12 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class Kafka09JsonIntegrationTest method testHappyPath.

/**
 * Writes one JSON record to a freshly provisioned topic, then checks that
 * {@link Kafka09JsonSource} yields exactly one work unit for that topic and that the
 * extractor built from it returns the configured schema and the written record.
 *
 * @throws IOException propagated from the writer or the extractor
 * @throws DataRecordException propagated from {@code readRecord}
 */
@Test
public void testHappyPath() throws IOException, DataRecordException {
    String topic = "testKafka09JsonSource";
    kafkaTestHelper.provisionTopic(topic);
    SourceState state = createSourceState(topic);

    // Configure the producer side: broker address and target topic.
    state.setProp(KAFKA_PRODUCER_CONFIG_PREFIX + "bootstrap.servers", "localhost:" + kafkaTestHelper.getKafkaServerPort());
    state.setProp(KAFKA_TOPIC, topic);
    Destination destination = Destination.of(Destination.DestinationType.KAFKA, state);
    Kafka09JsonObjectWriterBuilder writerBuilder = new Kafka09JsonObjectWriterBuilder();
    writerBuilder.writeTo(destination);

    // Produce a record. try-with-resources guarantees the writer is closed even when
    // write() or flush() throws, so a failing run does not leak the writer.
    final String json = "{\"number\":27}";
    try (DataWriter<JsonObject> writer = writerBuilder.build()) {
        JsonObject record = gson.fromJson(json, JsonObject.class);
        writer.write(record);
        writer.flush();
    }

    Kafka09JsonSource source = new Kafka09JsonSource();
    List<WorkUnit> workUnitList = source.getWorkunits(state);
    // Test the right value serializer is set
    Assert.assertEquals(state.getProp(Kafka09ConsumerClient.GOBBLIN_CONFIG_VALUE_DESERIALIZER_CLASS_KEY), Kafka09JsonSource.KafkaGsonDeserializer.class.getName());
    // Test there is only one non-empty work unit
    MultiWorkUnitUnpackingIterator iterator = new MultiWorkUnitUnpackingIterator(workUnitList.iterator());
    Assert.assertTrue(iterator.hasNext());
    WorkUnit workUnit = iterator.next();
    Assert.assertEquals(workUnit.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY), topic);
    Assert.assertFalse(iterator.hasNext());
    // Test extractor
    WorkUnitState workUnitState = new WorkUnitState(workUnit, state);
    final String jsonSchema = "[{\"columnName\":\"number\",\"comment\":\"\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"int\"}}]";
    workUnitState.setProp("source.kafka.json.schema", jsonSchema);
    Extractor<JsonArray, JsonObject> extractor = source.getExtractor(workUnitState);
    Assert.assertEquals(extractor.getSchema().toString(), jsonSchema);
    Assert.assertEquals(extractor.readRecord(null).toString(), json);
}
Also used : Destination(org.apache.gobblin.writer.Destination) SourceState(org.apache.gobblin.configuration.SourceState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) JsonObject(com.google.gson.JsonObject) JsonArray(com.google.gson.JsonArray) Kafka09JsonSource(org.apache.gobblin.source.extractor.extract.kafka.Kafka09JsonSource) MultiWorkUnitUnpackingIterator(org.apache.gobblin.runtime.util.MultiWorkUnitUnpackingIterator) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Kafka09JsonObjectWriterBuilder(org.apache.gobblin.kafka.writer.Kafka09JsonObjectWriterBuilder) Test(org.testng.annotations.Test)

Example 13 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class PartitionLevelWatermarkerTest method testNoPreviousWatermarkWorkunits.

/**
 * Builds a {@link SourceState} from two previous work unit states, only one of which
 * has {@code IS_WATERMARK_WORKUNIT_KEY} set, and asserts that
 * {@code getPreviousWatermarks()} exposes exactly one entry: the one for
 * {@code "test_dataset_urn"}.
 */
@Test
public void testNoPreviousWatermarkWorkunits() throws Exception {
    // Create one previous workunit with IS_WATERMARK_WORKUNIT_KEY set to true
    WorkUnitState previousWus = new WorkUnitState();
    previousWus.setProp(ConfigurationKeys.DATASET_URN_KEY, "test_dataset_urn");
    previousWus.setProp(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY, true);
    // Long literals use the upper-case L suffix; a lower-case l is easily misread as the digit 1.
    previousWus.setActualHighWatermark(new MultiKeyValueLongWatermark(ImmutableMap.of("2015", 100L)));
    // Create one previous workunit with IS_WATERMARK_WORKUNIT_KEY not set (false)
    WorkUnitState previousWus2 = new WorkUnitState();
    previousWus2.setProp(ConfigurationKeys.DATASET_URN_KEY, "test_dataset_urn2");
    previousWus2.setActualHighWatermark(new LongWatermark(101L));
    SourceState state = new SourceState(new State(), Lists.newArrayList(previousWus, previousWus2));
    PartitionLevelWatermarker watermarker = new PartitionLevelWatermarker(state);
    // Only the flagged work unit state is expected to show up.
    Assert.assertEquals(watermarker.getPreviousWatermarks().size(), 1);
    Assert.assertEquals(watermarker.getPreviousWatermarks().get("test_dataset_urn"), ImmutableMap.of("2015", 100L));
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark) Test(org.testng.annotations.Test)

Example 14 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class PartitionLevelWatermarkerTest method testStateStoreReadWrite.

/**
 * Round-trips a watermark through work unit state: a first watermarker processes one
 * table and one partition, the resulting work unit is wrapped in a {@link WorkUnitState}
 * and handed to a second watermarker as previous state, and the second watermarker's
 * {@code getPreviousWatermarks()} is asserted to hold the partition timestamp.
 */
@Test
public void testStateStoreReadWrite() throws Exception {
    String dbName = "testStateStoreReadWrite";
    LocalHiveMetastoreTestUtils.getInstance().dropDatabaseIfExists(dbName);
    PartitionLevelWatermarker watermarker0 = new PartitionLevelWatermarker(new SourceState());
    Table mockTable = localTestTable(dbName, "table1", true);
    // Upper-case L suffix: a lower-case l is easily misread as the digit 1.
    watermarker0.onTableProcessBegin(mockTable, 0L);
    long now = new DateTime().getMillis();
    watermarker0.onPartitionProcessBegin(localTestPartition(mockTable, ImmutableList.of("2016")), 0, now);
    // The list starts empty; get(0) below relies on onGetWorkunitsEnd adding a work unit to it.
    List<WorkUnit> workunits = Lists.newArrayList();
    watermarker0.onGetWorkunitsEnd(workunits);
    @SuppressWarnings("deprecation") WorkUnitState previousWus = new WorkUnitState(workunits.get(0));
    watermarker0.setActualHighWatermark(previousWus);
    // Feed the state written by the first watermarker into a second one.
    SourceState state = new SourceState(new State(), Lists.newArrayList(previousWus));
    PartitionLevelWatermarker watermarker = new PartitionLevelWatermarker(state);
    Assert.assertEquals(watermarker.getPreviousWatermarks().size(), 1);
    Assert.assertEquals(watermarker.getPreviousWatermarks().get(dbName + "@table1"), ImmutableMap.of("2016", now));
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) Table(org.apache.hadoop.hive.ql.metadata.Table) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) DateTime(org.joda.time.DateTime) Test(org.testng.annotations.Test)

Example 15 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class PartitionLevelWatermarkerTest method testDroppedPartitions.

/**
 * Starts from previous watermarks for partitions {@code "2015-01"} and {@code "2015-02"},
 * processes a partition {@code "2015"} whose parameters list both as replaced, and asserts
 * that the expected high watermarks for {@code "db@test_dataset_urn"} contain only
 * {@code "2015"}.
 */
@Test
public void testDroppedPartitions() throws Exception {
    WorkUnitState previousWus = new WorkUnitState();
    previousWus.setProp(ConfigurationKeys.DATASET_URN_KEY, "db@test_dataset_urn");
    previousWus.setProp(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY, true);
    // Long literals use the upper-case L suffix; a lower-case l is easily misread as the digit 1.
    previousWus.setActualHighWatermark(new MultiKeyValueLongWatermark(ImmutableMap.of("2015-01", 100L, "2015-02", 101L)));
    SourceState state = new SourceState(new State(), Lists.newArrayList(previousWus));
    PartitionLevelWatermarker watermarker = new PartitionLevelWatermarker(state);
    Table table = mockTable("test_dataset_urn");
    Mockito.when(table.getPartitionKeys()).thenReturn(ImmutableList.of(new FieldSchema("year", "string", "")));
    Partition partition2015 = mockPartition(table, ImmutableList.of("2015"));
    // partition 2015 replaces 2015-01 and 2015-02
    Mockito.when(partition2015.getParameters()).thenReturn(ImmutableMap.of(AbstractAvroToOrcConverter.REPLACED_PARTITIONS_HIVE_METASTORE_KEY, "2015-01|2015-02"));
    watermarker.onPartitionProcessBegin(partition2015, 0L, 0L);
    Assert.assertEquals(watermarker.getExpectedHighWatermarks().get("db@test_dataset_urn"), ImmutableMap.of("2015", 0L));
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) SourceState(org.apache.gobblin.configuration.SourceState) Table(org.apache.hadoop.hive.ql.metadata.Table) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) Test(org.testng.annotations.Test)

Aggregations

SourceState (org.apache.gobblin.configuration.SourceState)90 Test (org.testng.annotations.Test)76 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)44 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)38 State (org.apache.gobblin.configuration.State)30 WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState)11 Partition (org.apache.hadoop.hive.ql.metadata.Partition)8 Table (org.apache.hadoop.hive.ql.metadata.Table)8 IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder)7 LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)7 Extract (org.apache.gobblin.source.workunit.Extract)7 DateTime (org.joda.time.DateTime)7 Dataset (org.apache.gobblin.dataset.Dataset)6 PartitionableDataset (org.apache.gobblin.dataset.PartitionableDataset)6 MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit)6 WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream)6 IOException (java.io.IOException)5 Path (org.apache.hadoop.fs.Path)5 Gson (com.google.gson.Gson)4 JsonObject (com.google.gson.JsonObject)4