Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.
The class WikipediaSource, method getWorkunits.
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  Map<String, Iterable<WorkUnitState>> previousWorkUnits = state.getPreviousWorkUnitStatesByDatasetUrns();
  List<String> titles = new LinkedList<>(
      Splitter.on(",").omitEmptyStrings().splitToList(state.getProp(WikipediaExtractor.SOURCE_PAGE_TITLES)));
  // Find the highest actual watermark recorded for each dataset (page title) in the previous run.
  Map<String, LongWatermark> prevHighWatermarks = Maps.newHashMap();
  for (Map.Entry<String, Iterable<WorkUnitState>> entry : previousWorkUnits.entrySet()) {
    Iterable<LongWatermark> watermarks =
        Iterables.transform(entry.getValue(), new Function<WorkUnitState, LongWatermark>() {
          @Override
          public LongWatermark apply(WorkUnitState wus) {
            return wus.getActualHighWatermark(LongWatermark.class);
          }
        });
    watermarks = Iterables.filter(watermarks, Predicates.notNull());
    List<LongWatermark> watermarkList = Lists.newArrayList(watermarks);
    if (watermarkList.size() > 0) {
      prevHighWatermarks.put(entry.getKey(), Collections.max(watermarkList));
    }
  }
  Extract extract = createExtract(TableType.SNAPSHOT_ONLY,
      state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY), "WikipediaOutput");
  List<WorkUnit> workUnits = Lists.newArrayList();
  // One work unit per requested title, resuming from the previous high watermark if one exists.
  for (String title : titles) {
    LongWatermark prevWatermark =
        prevHighWatermarks.containsKey(title) ? prevHighWatermarks.get(title) : new LongWatermark(-1);
    prevHighWatermarks.remove(title);
    WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(prevWatermark, new LongWatermark(-1)));
    workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, title);
    workUnits.add(workUnit);
  }
  // Carry forward the watermarks of datasets seen before but not requested this run,
  // so their state is not lost.
  for (Map.Entry<String, LongWatermark> nonProcessedDataset : prevHighWatermarks.entrySet()) {
    WorkUnit workUnit = WorkUnit.create(extract,
        new WatermarkInterval(nonProcessedDataset.getValue(), nonProcessedDataset.getValue()));
    workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, nonProcessedDataset.getKey());
    workUnits.add(workUnit);
  }
  return workUnits;
}
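The value read back through getActualHighWatermark above is whatever the extractor recorded at the end of the previous run. A minimal sketch of that producing side, assuming a workUnitState in scope and a hypothetical lastRevisionId tracked while pulling records:
// Hypothetical extractor-side bookkeeping: after all records for a title are pulled,
// record the highest revision id seen so the next run can resume from it.
long lastRevisionId = 123456789L; // illustrative value only
workUnitState.setActualHighWatermark(new LongWatermark(lastRevisionId));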
Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.
The class HivePurgerPublisher, method publishData.
@Override
public void publishData(Collection<? extends WorkUnitState> states) {
  for (WorkUnitState state : states) {
    if (state.getWorkingState() == WorkUnitState.WorkingState.SUCCESSFUL) {
      // Promote successful work units to COMMITTED and report the commit event.
      state.setWorkingState(WorkUnitState.WorkingState.COMMITTED);
      submitEvent(state, ComplianceEvents.Purger.WORKUNIT_COMMITTED);
    } else {
      // Anything not successful is marked FAILED and reported as such.
      state.setWorkingState(WorkUnitState.WorkingState.FAILED);
      submitEvent(state, ComplianceEvents.Purger.WORKUNIT_FAILED);
    }
  }
}
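The same WorkingState check can be reused wherever task outcomes need inspecting after publishing; a minimal sketch that only tallies the states set above:
// Tally committed vs. failed work units after publishData has run.
int committed = 0;
int failed = 0;
for (WorkUnitState state : states) {
  if (state.getWorkingState() == WorkUnitState.WorkingState.COMMITTED) {
    committed++;
  } else if (state.getWorkingState() == WorkUnitState.WorkingState.FAILED) {
    failed++;
  }
}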
Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.
The class KafkaSimpleStreamingTest, method getStreamingExtractor.
private KafkaSimpleStreamingExtractor<String, byte[]> getStreamingExtractor(String topic) {
  _kafkaTestHelper.provisionTopic(topic);
  List<WorkUnit> lWu = getWorkUnits(topic);
  WorkUnit wU = lWu.get(0);
  // Wrap the first work unit in a WorkUnitState and point it at the embedded test broker.
  WorkUnitState wSU = new WorkUnitState(wU, new State());
  wSU.setProp(ConfigurationKeys.KAFKA_BROKERS, "localhost:" + _kafkaTestHelper.getKafkaServerPort());
  wSU.setProp(KafkaSimpleStreamingSource.TOPIC_WHITELIST, topic);
  wSU.setProp(ConfigurationKeys.JOB_NAME_KEY, topic);
  wSU.setProp(KafkaSimpleStreamingSource.TOPIC_KEY_DESERIALIZER,
      "org.apache.kafka.common.serialization.StringDeserializer");
  wSU.setProp(KafkaSimpleStreamingSource.TOPIC_VALUE_DESERIALIZER,
      "org.apache.kafka.common.serialization.ByteArrayDeserializer");
  // Create an extractor
  return new KafkaSimpleStreamingExtractor<String, byte[]>(wSU);
}
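A test would then call the helper roughly as follows (the topic name is illustrative):
// Hypothetical call site: provision a topic and obtain an extractor bound to it.
KafkaSimpleStreamingExtractor<String, byte[]> extractor = getStreamingExtractor("testTopic");
// The extractor reads from the embedded broker with String keys and byte[] values.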
Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.
The class KafkaDeserializerExtractorTest, method testBuiltInStringDeserializer.
@Test
public void testBuiltInStringDeserializer() throws ReflectiveOperationException {
  WorkUnitState mockWorkUnitState = getMockWorkUnitState();
  mockWorkUnitState.setProp(KafkaDeserializerExtractor.KAFKA_DESERIALIZER_TYPE,
      KafkaDeserializerExtractor.Deserializers.STRING.name());
  KafkaDeserializerExtractor kafkaDecoderExtractor = new KafkaDeserializerExtractor(mockWorkUnitState);
  // Selecting the STRING alias should wire up both the deserializer and its schema registry.
  Assert.assertEquals(kafkaDecoderExtractor.getKafkaDeserializer().getClass(),
      KafkaDeserializerExtractor.Deserializers.STRING.getDeserializerClass());
  Assert.assertEquals(kafkaDecoderExtractor.getKafkaSchemaRegistry().getClass(),
      KafkaDeserializerExtractor.Deserializers.STRING.getSchemaRegistryClass());
}
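The getMockWorkUnitState helper is not shown on this page; a plausible minimal stand-in backed by an empty WorkUnit could look like this (a sketch, not necessarily the project's actual helper):
private WorkUnitState getMockWorkUnitState() {
  // An empty WorkUnit plus an empty job State; each test then sets only the
  // properties it needs via setProp.
  return new WorkUnitState(WorkUnit.createEmpty(), new State());
}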
Use of org.apache.gobblin.configuration.WorkUnitState in project incubator-gobblin by apache.
The class KafkaDeserializerExtractorTest, method testConfluentAvroDeserializerForSchemaEvolution.
@Test
public void testConfluentAvroDeserializerForSchemaEvolution() throws IOException, RestClientException, SchemaRegistryException {
  WorkUnitState mockWorkUnitState = getMockWorkUnitState();
  mockWorkUnitState.setProp("schema.registry.url", TEST_URL);
  // schemaV2 evolves schemaV1 by adding an optional (nullable, default null) string field.
  Schema schemaV1 = SchemaBuilder.record(TEST_RECORD_NAME).namespace(TEST_NAMESPACE).fields()
      .name(TEST_FIELD_NAME).type().stringType().noDefault()
      .endRecord();
  Schema schemaV2 = SchemaBuilder.record(TEST_RECORD_NAME).namespace(TEST_NAMESPACE).fields()
      .name(TEST_FIELD_NAME).type().stringType().noDefault()
      .optionalString(TEST_FIELD_NAME2)
      .endRecord();
  GenericRecord testGenericRecord = new GenericRecordBuilder(schemaV1).set(TEST_FIELD_NAME, "testValue").build();
  // The serializer writes the record with schemaV1...
  SchemaRegistryClient mockSchemaRegistryClient = mock(SchemaRegistryClient.class);
  when(mockSchemaRegistryClient.getByID(any(Integer.class))).thenReturn(schemaV1);
  Serializer<Object> kafkaEncoder = new KafkaAvroSerializer(mockSchemaRegistryClient);
  Deserializer<Object> kafkaDecoder = new KafkaAvroDeserializer(mockSchemaRegistryClient);
  ByteBuffer testGenericRecordByteBuffer = ByteBuffer.wrap(kafkaEncoder.serialize(TEST_TOPIC_NAME, testGenericRecord));
  // ...while the extractor resolves records against the latest schema, schemaV2.
  KafkaSchemaRegistry<Integer, Schema> mockKafkaSchemaRegistry = mock(KafkaSchemaRegistry.class);
  when(mockKafkaSchemaRegistry.getLatestSchemaByTopic(TEST_TOPIC_NAME)).thenReturn(schemaV2);
  KafkaDeserializerExtractor kafkaDecoderExtractor = new KafkaDeserializerExtractor(mockWorkUnitState,
      Optional.fromNullable(Deserializers.CONFLUENT_AVRO), kafkaDecoder, mockKafkaSchemaRegistry);
  when(kafkaDecoderExtractor.getSchema()).thenReturn(schemaV2);
  ByteArrayBasedKafkaRecord mockMessageAndOffset = getMockMessageAndOffset(testGenericRecordByteBuffer);
  GenericRecord received = (GenericRecord) kafkaDecoderExtractor.decodeRecord(mockMessageAndOffset);
  // The v1 record is readable under schemaV2; the new optional field defaults to null.
  Assert.assertEquals(received.toString(), "{\"testField\": \"testValue\", \"testField2\": null}");
}
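The assertion holds because Avro schema resolution fills the newly added optional field with its null default when a record written with schemaV1 is read as schemaV2; the decoded record is equivalent to the following (illustrative only):
// What decodeRecord effectively yields: the v1 value plus the new field's null default.
GenericRecord expected = new GenericRecordBuilder(schemaV2)
    .set(TEST_FIELD_NAME, "testValue")
    .build(); // TEST_FIELD_NAME2 is never set, so it takes its schema default: null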