Use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by Apache.
The class SequentialTestSource, method getExtractor:
@Override
public Extractor<String, Object> getExtractor(WorkUnitState state) throws IOException {
  Config config = ConfigFactory.parseProperties(state.getProperties());
  configureIfNeeded(config);
  final LongWatermark lowWatermark = state.getWorkunit().getLowWatermark(LongWatermark.class);
  final WorkUnitState workUnitState = state;
  final int index = state.getPropAsInt(WORK_UNIT_INDEX);
  final TestBatchExtractor extractor =
      new TestBatchExtractor(index, lowWatermark, numRecordsPerExtract, sleepTimePerRecord, workUnitState);
  if (!streaming) {
    return extractor;
  } else {
    return (Extractor) new TestStreamingExtractor(extractor);
  }
}
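The low watermark that getExtractor reads is carried on the WorkUnit itself. As a minimal sketch of the other side of that handshake, a source might attach the watermark range like this (WORK_UNIT_INDEX is the key from the snippet above; the concrete values are illustrative assumptions):

// Sketch: attach a LongWatermark range to a WorkUnit so an extractor can
// later recover it via getLowWatermark(LongWatermark.class).
WorkUnit workUnit = WorkUnit.createEmpty();
workUnit.setWatermarkInterval(new WatermarkInterval(new LongWatermark(0L), new LongWatermark(100L)));
workUnit.setProp(WORK_UNIT_INDEX, 0); // index consumed by getExtractor above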
Use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by Apache.
The class HiveSource, method createWorkunitForNonPartitionedTable:
protected void createWorkunitForNonPartitionedTable(HiveDataset hiveDataset) throws IOException {
  // Create workunits for tables
  try {
    long tableProcessTime = new DateTime().getMillis();
    long updateTime = this.updateProvider.getUpdateTime(hiveDataset.getTable());
    this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
    LongWatermark lowWatermark = this.watermarker.getPreviousHighWatermark(hiveDataset.getTable());
    if (!shouldCreateWorkUnit(hiveDataset.getTable().getPath())) {
      log.info(String.format(
          "Not creating workunit for table %s as partition path %s contains data path tokens to ignore %s",
          hiveDataset.getTable().getCompleteName(), hiveDataset.getTable().getPath(),
          this.ignoreDataPathIdentifierList));
      return;
    }
    if (shouldCreateWorkunit(hiveDataset.getTable(), lowWatermark)) {
      log.info(String.format(
          "Creating workunit for table %s as updateTime %s or createTime %s is greater than low watermark %s",
          hiveDataset.getTable().getCompleteName(), updateTime,
          hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
      HiveWorkUnit hiveWorkUnit = workUnitForTable(hiveDataset);
      LongWatermark expectedDatasetHighWatermark =
          this.watermarker.getExpectedHighWatermark(hiveDataset.getTable(), tableProcessTime);
      hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedDatasetHighWatermark));
      EventWorkunitUtils.setTableSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), updateTime,
          lowWatermark.getValue(), this.beginGetWorkunitsTime);
      if (hiveDataset instanceof ConvertibleHiveDataset) {
        setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
        log.info("Added lineage event for dataset " + hiveDataset.getUrn());
      }
      this.workunits.add(hiveWorkUnit);
      log.debug(String.format("Workunit added for table: %s", hiveWorkUnit));
    } else {
      log.info(String.format(
          "Not creating workunit for table %s as updateTime %s and createTime %s are not greater than low watermark %s",
          hiveDataset.getTable().getCompleteName(), updateTime,
          hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
    }
  } catch (UpdateNotFoundException e) {
    log.error(String.format("Not creating workunit for %s as update time was not found. %s",
        hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
  } catch (SchemaNotFoundException e) {
    log.error(String.format("Not creating workunit for %s as schema was not found. %s",
        hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
  }
}
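The pivotal check is shouldCreateWorkunit(table, lowWatermark). Its body is not shown here, but judging from the surrounding log messages it reduces to a timestamp comparison along these lines (a hedged reconstruction, not the actual HiveSource code; millisecond timestamps are assumed):

// Hedged reconstruction: create a workunit only when the table was created
// or updated after the previously committed low watermark.
private boolean shouldCreateWorkunit(long createTime, long updateTime, LongWatermark lowWatermark) {
  return updateTime > lowWatermark.getValue() || createTime > lowWatermark.getValue();
}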
Use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by Apache.
The class PartitionedWriterTest, method testWatermarkComputation:
public void testWatermarkComputation(Long committed, Long unacknowledged, Long expected) throws IOException {
  State state = new State();
  state.setProp(ConfigurationKeys.WRITER_PARTITIONER_CLASS, TestPartitioner.class.getCanonicalName());

  String defaultSource = "default";
  WatermarkAwareWriter mockDataWriter = mock(WatermarkAwareWriter.class);
  when(mockDataWriter.isWatermarkCapable()).thenReturn(true);
  when(mockDataWriter.getCommittableWatermark()).thenReturn(Collections.singletonMap(defaultSource,
      new DefaultCheckpointableWatermark(defaultSource, new LongWatermark(committed))));
  when(mockDataWriter.getUnacknowledgedWatermark()).thenReturn(Collections.singletonMap(defaultSource,
      new DefaultCheckpointableWatermark(defaultSource, new LongWatermark(unacknowledged))));

  PartitionAwareDataWriterBuilder builder = mock(PartitionAwareDataWriterBuilder.class);
  when(builder.validatePartitionSchema(any(Schema.class))).thenReturn(true);
  when(builder.forPartition(any(GenericRecord.class))).thenReturn(builder);
  when(builder.withWriterId(any(String.class))).thenReturn(builder);
  when(builder.build()).thenReturn(mockDataWriter);

  PartitionedDataWriter<String, String> writer = new PartitionedDataWriter<>(builder, state);
  RecordEnvelope<String> recordEnvelope = new RecordEnvelope<>("0");
  recordEnvelope.addCallBack(
      new AcknowledgableWatermark(new DefaultCheckpointableWatermark(defaultSource, new LongWatermark(0))));
  writer.writeEnvelope(recordEnvelope);

  Map<String, CheckpointableWatermark> watermark = writer.getCommittableWatermark();
  if (expected == null) {
    Assert.assertTrue(watermark.isEmpty(), "Expected watermark to be absent");
  } else {
    Assert.assertEquals(watermark.size(), 1);
    Assert.assertEquals((long) expected,
        ((LongWatermark) watermark.values().iterator().next().getWatermark()).getValue());
  }
}
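Since this test method is parameterized by (committed, unacknowledged, expected), it is meant to be driven with several triples. Plausible invocations, consistent with the rule that a watermark is committable only up to the lowest unacknowledged record (the exact triples are illustrative):

// Illustrative calls; expected == null means no committable watermark yet.
testWatermarkComputation(0L, null, 0L);     // everything acknowledged
testWatermarkComputation(null, 0L, null);   // record 0 still unacknowledged
testWatermarkComputation(null, null, null); // nothing seen at all
testWatermarkComputation(0L, 1L, 0L);       // record 1 in flight, commit stops at 0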
Use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by Apache.
The class EventWorkunitUtils, method setIsFirstPublishMetadata:
/**
 * Sets metadata to indicate whether this is the first time this table or partition is being published.
 * @param wus the {@link WorkUnitState} on which to set the first-publish flag
 */
public static void setIsFirstPublishMetadata(WorkUnitState wus) {
  if (!wus.getPropAsBoolean(IS_WATERMARK_WORKUNIT_KEY)) {
    LongWatermark previousWatermark = wus.getWorkunit().getLowWatermark(LongWatermark.class);
    wus.setProp(SlaEventKeys.IS_FIRST_PUBLISH, (null == previousWatermark || previousWatermark.getValue() == 0));
  }
}
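To make the rule concrete: a work unit with no prior low watermark, or one still at zero, is flagged as a first publish. A minimal usage sketch, assuming a freshly created work unit and the constants from the snippet above:

// Sketch: a fresh work unit with no prior watermark should be
// marked as a first publish.
WorkUnitState wus = new WorkUnitState(WorkUnit.createEmpty());
EventWorkunitUtils.setIsFirstPublishMetadata(wus);
// expected: wus.getProp(SlaEventKeys.IS_FIRST_PUBLISH) is "true"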
Use of org.apache.gobblin.source.extractor.extract.LongWatermark in project incubator-gobblin by Apache.
The class WikipediaSource, method getWorkunits:
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  Map<String, Iterable<WorkUnitState>> previousWorkUnits = state.getPreviousWorkUnitStatesByDatasetUrns();
  List<String> titles = new LinkedList<>(
      Splitter.on(",").omitEmptyStrings().splitToList(state.getProp(WikipediaExtractor.SOURCE_PAGE_TITLES)));

  // Find the highest watermark recorded for each dataset (page title) in the previous run.
  Map<String, LongWatermark> prevHighWatermarks = Maps.newHashMap();
  for (Map.Entry<String, Iterable<WorkUnitState>> entry : previousWorkUnits.entrySet()) {
    Iterable<LongWatermark> watermarks =
        Iterables.transform(entry.getValue(), new Function<WorkUnitState, LongWatermark>() {
          @Override
          public LongWatermark apply(WorkUnitState wus) {
            return wus.getActualHighWatermark(LongWatermark.class);
          }
        });
    watermarks = Iterables.filter(watermarks, Predicates.notNull());
    List<LongWatermark> watermarkList = Lists.newArrayList(watermarks);
    if (!watermarkList.isEmpty()) {
      prevHighWatermarks.put(entry.getKey(), Collections.max(watermarkList));
    }
  }

  Extract extract = createExtract(TableType.SNAPSHOT_ONLY,
      state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY), "WikipediaOutput");
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (String title : titles) {
    LongWatermark prevWatermark =
        prevHighWatermarks.containsKey(title) ? prevHighWatermarks.get(title) : new LongWatermark(-1);
    prevHighWatermarks.remove(title);
    WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(prevWatermark, new LongWatermark(-1)));
    workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, title);
    workUnits.add(workUnit);
  }

  // Carry watermarks forward for datasets seen in the previous run but no longer configured,
  // by emitting workunits whose low and high watermarks are identical.
  for (Map.Entry<String, LongWatermark> nonProcessedDataset : prevHighWatermarks.entrySet()) {
    WorkUnit workUnit = WorkUnit.create(extract,
        new WatermarkInterval(nonProcessedDataset.getValue(), nonProcessedDataset.getValue()));
    workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, nonProcessedDataset.getKey());
    workUnits.add(workUnit);
  }
  return workUnits;
}
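The Collections.max call in the first loop works because LongWatermark has a natural ordering by its long value, so the largest previous watermark wins. In isolation (the values are arbitrary):

// LongWatermark's natural ordering is by its long value, which is what
// lets Collections.max pick the highest watermark from the previous run.
List<LongWatermark> previous =
    Lists.newArrayList(new LongWatermark(3L), new LongWatermark(7L), new LongWatermark(5L));
LongWatermark highest = Collections.max(previous); // highest.getValue() == 7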