
Example 21 with WorkUnitState

Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

From the class WikipediaSource, method getWorkunits:

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    Map<String, Iterable<WorkUnitState>> previousWorkUnits = state.getPreviousWorkUnitStatesByDatasetUrns();
    List<String> titles = new LinkedList<>(Splitter.on(",").omitEmptyStrings().splitToList(state.getProp(WikipediaExtractor.SOURCE_PAGE_TITLES)));
    Map<String, LongWatermark> prevHighWatermarks = Maps.newHashMap();
    for (Map.Entry<String, Iterable<WorkUnitState>> entry : previousWorkUnits.entrySet()) {
        Iterable<LongWatermark> watermarks = Iterables.transform(entry.getValue(), new Function<WorkUnitState, LongWatermark>() {

            @Override
            public LongWatermark apply(WorkUnitState wus) {
                return wus.getActualHighWatermark(LongWatermark.class);
            }
        });
        watermarks = Iterables.filter(watermarks, Predicates.notNull());
        List<LongWatermark> watermarkList = Lists.newArrayList(watermarks);
        if (watermarkList.size() > 0) {
            prevHighWatermarks.put(entry.getKey(), Collections.max(watermarkList));
        }
    }
    Extract extract = createExtract(TableType.SNAPSHOT_ONLY, state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY), "WikipediaOutput");
    List<WorkUnit> workUnits = Lists.newArrayList();
    for (String title : titles) {
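        // Resume from the previous run's actual high watermark, or -1 if this title was never processed before.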
        LongWatermark prevWatermark = prevHighWatermarks.containsKey(title) ? prevHighWatermarks.get(title) : new LongWatermark(-1);
        prevHighWatermarks.remove(title);
        WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(prevWatermark, new LongWatermark(-1)));
        workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, title);
        workUnits.add(workUnit);
    }
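    // Titles seen in a previous run but absent from the current list: emit a work unit whose
    // interval starts and ends at the old high watermark, so the watermark is carried forward rather than lost.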
    for (Map.Entry<String, LongWatermark> nonProcessedDataset : prevHighWatermarks.entrySet()) {
        WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(nonProcessedDataset.getValue(), nonProcessedDataset.getValue()));
        workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, nonProcessedDataset.getKey());
        workUnits.add(workUnit);
    }
    return workUnits;
}
Also used : WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Extract(org.apache.gobblin.source.workunit.Extract) LinkedList(java.util.LinkedList) WatermarkInterval(org.apache.gobblin.source.extractor.WatermarkInterval) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Map(java.util.Map) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)
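
To close the loop on the watermark handoff above, here is a minimal hypothetical sketch of the extractor side, not taken from the project: it reads the low watermark off the work unit and records the actual high watermark it reached, which is what the next run's getWorkunits consumes through getActualHighWatermark. The fetchRevisionsNewerThan helper is invented purely for illustration.

private void sketchExtractorSide(WorkUnitState workUnitState) {
    // Resume point chosen by WikipediaSource.getWorkunits above (-1 on the first run).
    LongWatermark low = workUnitState.getWorkunit().getLowWatermark(LongWatermark.class);
    // Hypothetical fetch helper, invented for this sketch; not a Gobblin API.
    long lastRevisionId = fetchRevisionsNewerThan(low.getValue());
    // Record the high point actually reached so the next run resumes from here.
    workUnitState.setActualHighWatermark(new LongWatermark(lastRevisionId));
}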

Example 22 with WorkUnitState

Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

From the class HivePurgerPublisher, method publishData:

@Override
public void publishData(Collection<? extends WorkUnitState> states) {
    for (WorkUnitState state : states) {
        if (state.getWorkingState() == WorkUnitState.WorkingState.SUCCESSFUL) {
            state.setWorkingState(WorkUnitState.WorkingState.COMMITTED);
            submitEvent(state, ComplianceEvents.Purger.WORKUNIT_COMMITTED);
        } else {
            state.setWorkingState(WorkUnitState.WorkingState.FAILED);
            submitEvent(state, ComplianceEvents.Purger.WORKUNIT_FAILED);
        }
    }
}
Also used : WorkUnitState(org.apache.gobblin.configuration.WorkUnitState)
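
A hedged usage sketch of the method above, assuming a fully constructed HivePurgerPublisher named publisher (its event-submission wiring is omitted here). Note that any state that is not SUCCESSFUL, RUNNING included, ends up FAILED:

WorkUnitState successful = new WorkUnitState();
successful.setWorkingState(WorkUnitState.WorkingState.SUCCESSFUL);
WorkUnitState unfinished = new WorkUnitState();
unfinished.setWorkingState(WorkUnitState.WorkingState.RUNNING);
publisher.publishData(Arrays.asList(successful, unfinished));
// SUCCESSFUL is promoted to COMMITTED; everything else is marked FAILED.
Assert.assertEquals(successful.getWorkingState(), WorkUnitState.WorkingState.COMMITTED);
Assert.assertEquals(unfinished.getWorkingState(), WorkUnitState.WorkingState.FAILED);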

Example 23 with WorkUnitState

Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

From the class KafkaSimpleStreamingTest, method getStreamingExtractor:

private KafkaSimpleStreamingExtractor<String, byte[]> getStreamingExtractor(String topic) {
    _kafkaTestHelper.provisionTopic(topic);
    List<WorkUnit> lWu = getWorkUnits(topic);
    WorkUnit wU = lWu.get(0);
    WorkUnitState wSU = new WorkUnitState(wU, new State());
    wSU.setProp(ConfigurationKeys.KAFKA_BROKERS, "localhost:" + _kafkaTestHelper.getKafkaServerPort());
    wSU.setProp(KafkaSimpleStreamingSource.TOPIC_WHITELIST, topic);
    wSU.setProp(ConfigurationKeys.JOB_NAME_KEY, topic);
    wSU.setProp(KafkaSimpleStreamingSource.TOPIC_KEY_DESERIALIZER, "org.apache.kafka.common.serialization.StringDeserializer");
    wSU.setProp(KafkaSimpleStreamingSource.TOPIC_VALUE_DESERIALIZER, "org.apache.kafka.common.serialization.ByteArrayDeserializer");
    // Create an extractor
    return new KafkaSimpleStreamingExtractor<String, byte[]>(wSU);
}
Also used : KafkaSimpleStreamingExtractor(org.apache.gobblin.source.extractor.extract.kafka.KafkaSimpleStreamingExtractor) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) SourceState(org.apache.gobblin.configuration.SourceState) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
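
The setProp calls above work because of how WorkUnitState layers configuration. A minimal sketch of that layering, under the assumption, consistent with the constructor used above, that task-level properties shadow those on the underlying WorkUnit (example.key is an invented key):

State jobState = new State();
WorkUnit workUnit = WorkUnit.createEmpty();
workUnit.setProp("example.key", "workunit-level");
WorkUnitState wus = new WorkUnitState(workUnit, jobState);
// With no task-level override, the lookup falls through to the WorkUnit.
Assert.assertEquals(wus.getProp("example.key"), "workunit-level");
// A task-level setProp, as used for the Kafka settings above, takes precedence.
wus.setProp("example.key", "task-level");
Assert.assertEquals(wus.getProp("example.key"), "task-level");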

Example 24 with WorkUnitState

Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

From the class KafkaDeserializerExtractorTest, method testBuiltInStringDeserializer:

@Test
public void testBuiltInStringDeserializer() throws ReflectiveOperationException {
    WorkUnitState mockWorkUnitState = getMockWorkUnitState();
    mockWorkUnitState.setProp(KafkaDeserializerExtractor.KAFKA_DESERIALIZER_TYPE, KafkaDeserializerExtractor.Deserializers.STRING.name());
    KafkaDeserializerExtractor kafkaDecoderExtractor = new KafkaDeserializerExtractor(mockWorkUnitState);
    Assert.assertEquals(kafkaDecoderExtractor.getKafkaDeserializer().getClass(), KafkaDeserializerExtractor.Deserializers.STRING.getDeserializerClass());
    Assert.assertEquals(kafkaDecoderExtractor.getKafkaSchemaRegistry().getClass(), KafkaDeserializerExtractor.Deserializers.STRING.getSchemaRegistryClass());
}
Also used : WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Test(org.testng.annotations.Test)
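
Beyond the built-in enum names, the deserializer type property is assumed to also accept a fully qualified Kafka Deserializer class name; in that case a schema registry implementation has to be configured explicitly. A hedged sketch, where the registry constant and the SimpleKafkaSchemaRegistry class are assumptions based on related Gobblin tests, and getMockWorkUnitState is the helper used elsewhere in this test class:

WorkUnitState mockWorkUnitState = getMockWorkUnitState();
// Fully qualified class name instead of a Deserializers enum value (assumed supported).
mockWorkUnitState.setProp(KafkaDeserializerExtractor.KAFKA_DESERIALIZER_TYPE, "org.apache.kafka.common.serialization.ByteArrayDeserializer");
// With a custom deserializer, the schema registry class is assumed to need explicit configuration.
mockWorkUnitState.setProp(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_CLASS, SimpleKafkaSchemaRegistry.class.getName());
KafkaDeserializerExtractor kafkaDecoderExtractor = new KafkaDeserializerExtractor(mockWorkUnitState);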

Example 25 with WorkUnitState

Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.

From the class KafkaDeserializerExtractorTest, method testConfluentAvroDeserializerForSchemaEvolution:

@Test
public void testConfluentAvroDeserializerForSchemaEvolution() throws IOException, RestClientException, SchemaRegistryException {
    WorkUnitState mockWorkUnitState = getMockWorkUnitState();
    mockWorkUnitState.setProp("schema.registry.url", TEST_URL);
    Schema schemaV1 = SchemaBuilder.record(TEST_RECORD_NAME).namespace(TEST_NAMESPACE).fields().name(TEST_FIELD_NAME).type().stringType().noDefault().endRecord();
    Schema schemaV2 = SchemaBuilder.record(TEST_RECORD_NAME).namespace(TEST_NAMESPACE).fields().name(TEST_FIELD_NAME).type().stringType().noDefault().optionalString(TEST_FIELD_NAME2).endRecord();
    GenericRecord testGenericRecord = new GenericRecordBuilder(schemaV1).set(TEST_FIELD_NAME, "testValue").build();
    SchemaRegistryClient mockSchemaRegistryClient = mock(SchemaRegistryClient.class);
    when(mockSchemaRegistryClient.getByID(any(Integer.class))).thenReturn(schemaV1);
    Serializer<Object> kafkaEncoder = new KafkaAvroSerializer(mockSchemaRegistryClient);
    Deserializer<Object> kafkaDecoder = new KafkaAvroDeserializer(mockSchemaRegistryClient);
    ByteBuffer testGenericRecordByteBuffer = ByteBuffer.wrap(kafkaEncoder.serialize(TEST_TOPIC_NAME, testGenericRecord));
    KafkaSchemaRegistry<Integer, Schema> mockKafkaSchemaRegistry = mock(KafkaSchemaRegistry.class);
    when(mockKafkaSchemaRegistry.getLatestSchemaByTopic(TEST_TOPIC_NAME)).thenReturn(schemaV2);
    KafkaDeserializerExtractor kafkaDecoderExtractor = new KafkaDeserializerExtractor(mockWorkUnitState, Optional.fromNullable(Deserializers.CONFLUENT_AVRO), kafkaDecoder, mockKafkaSchemaRegistry);
    when(kafkaDecoderExtractor.getSchema()).thenReturn(schemaV2);
    ByteArrayBasedKafkaRecord mockMessageAndOffset = getMockMessageAndOffset(testGenericRecordByteBuffer);
    GenericRecord received = (GenericRecord) kafkaDecoderExtractor.decodeRecord(mockMessageAndOffset);
    Assert.assertEquals(received.toString(), "{\"testField\": \"testValue\", \"testField2\": null}");
}
Also used : WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Schema(org.apache.avro.Schema) KafkaAvroSerializer(io.confluent.kafka.serializers.KafkaAvroSerializer) KafkaAvroDeserializer(io.confluent.kafka.serializers.KafkaAvroDeserializer) ByteBuffer(java.nio.ByteBuffer) ByteArrayBasedKafkaRecord(org.apache.gobblin.kafka.client.ByteArrayBasedKafkaRecord) GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder) GenericRecord(org.apache.avro.generic.GenericRecord) SchemaRegistryClient(io.confluent.kafka.schemaregistry.client.SchemaRegistryClient) Test(org.testng.annotations.Test)
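
The null testField2 in the assertion is standard Avro schema resolution: the record was written with schemaV1 and decoded with schemaV2 as the reader schema, so the added optional field takes its null default. A self-contained, Kafka-free sketch of the same mechanism, assuming the usual org.apache.avro.io and org.apache.avro.generic imports plus java.io.ByteArrayOutputStream:

GenericDatumWriter<GenericRecord> writer = new GenericDatumWriter<>(schemaV1);
ByteArrayOutputStream out = new ByteArrayOutputStream();
BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
writer.write(testGenericRecord, encoder);
encoder.flush();
// Decode with schemaV1 as the writer schema and schemaV2 as the reader schema.
GenericDatumReader<GenericRecord> reader = new GenericDatumReader<>(schemaV1, schemaV2);
BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(out.toByteArray(), null);
GenericRecord evolved = reader.read(null, decoder);
// evolved.toString() -> {"testField": "testValue", "testField2": null}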

Aggregations

Types co-occurring with WorkUnitState across the indexed examples, with usage counts:

WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 222
Test (org.testng.annotations.Test): 143
State (org.apache.gobblin.configuration.State): 48
SourceState (org.apache.gobblin.configuration.SourceState): 39
WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 39
Schema (org.apache.avro.Schema): 29
Path (org.apache.hadoop.fs.Path): 26
GenericRecord (org.apache.avro.generic.GenericRecord): 19
JsonObject (com.google.gson.JsonObject): 17
ArrayList (java.util.ArrayList): 16
File (java.io.File): 14
TaskState (org.apache.hadoop.mapreduce.v2.api.records.TaskState): 12
List (java.util.List): 11
Configuration (org.apache.hadoop.conf.Configuration): 11
IOException (java.io.IOException): 10
LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark): 10
Extract (org.apache.gobblin.source.workunit.Extract): 10
FileSystem (org.apache.hadoop.fs.FileSystem): 10
Closer (com.google.common.io.Closer): 8
JsonParser (com.google.gson.JsonParser): 8