Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by Apache.
The class SequentialTestSource, method initialWorkUnits:
private List<WorkUnit> initialWorkUnits() {
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (int i = 0; i < num_parallelism; i++) {
    WorkUnit workUnit = WorkUnit.create(newExtract(Extract.TableType.APPEND_ONLY, namespace, table));
    LongWatermark lowWatermark = new LongWatermark(i * numRecordsPerExtract + 1);
    LongWatermark expectedHighWatermark = new LongWatermark((i + 1) * numRecordsPerExtract);
    workUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedHighWatermark));
    workUnit.setProp(WORK_UNIT_INDEX, i);
    workUnits.add(workUnit);
  }
  return workUnits;
}
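The work units above carve the record space into contiguous, non-overlapping ranges: unit i covers records i * numRecordsPerExtract + 1 through (i + 1) * numRecordsPerExtract. A minimal, self-contained sketch of that same partitioning, assuming only the WatermarkInterval and LongWatermark constructors plus the getLowWatermark()/getExpectedHighWatermark() accessors; numParallelism and recordsPerExtract are illustrative local values, not the class's fields:

import java.util.List;

import org.apache.gobblin.source.extractor.WatermarkInterval;
import org.apache.gobblin.source.extractor.extract.LongWatermark;

import com.google.common.collect.Lists;

public class ContiguousIntervalSketch {
  public static void main(String[] args) {
    int numParallelism = 3;         // illustrative stand-in for the source's parallelism setting
    long recordsPerExtract = 100L;  // illustrative stand-in for numRecordsPerExtract

    List<WatermarkInterval> intervals = Lists.newArrayList();
    for (int i = 0; i < numParallelism; i++) {
      // Same arithmetic as initialWorkUnits(): low = i * n + 1, expected high = (i + 1) * n
      intervals.add(new WatermarkInterval(
          new LongWatermark(i * recordsPerExtract + 1),
          new LongWatermark((i + 1) * recordsPerExtract)));
    }

    // The intervals tile [1, numParallelism * recordsPerExtract] with no gaps or overlaps.
    for (WatermarkInterval interval : intervals) {
      LongWatermark low = (LongWatermark) interval.getLowWatermark();
      LongWatermark high = (LongWatermark) interval.getExpectedHighWatermark();
      System.out.println(low.getValue() + " .. " + high.getValue());
    }
  }
}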
Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by Apache.
The class HiveSource, method createWorkunitForNonPartitionedTable:
protected void createWorkunitForNonPartitionedTable(HiveDataset hiveDataset) throws IOException {
  // Create workunits for tables
  try {
    long tableProcessTime = new DateTime().getMillis();
    long updateTime = this.updateProvider.getUpdateTime(hiveDataset.getTable());
    this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
    LongWatermark lowWatermark = this.watermarker.getPreviousHighWatermark(hiveDataset.getTable());
    if (!shouldCreateWorkUnit(hiveDataset.getTable().getPath())) {
      log.info(String.format("Not creating workunit for table %s as partition path %s contains data path tokens to ignore %s", hiveDataset.getTable().getCompleteName(), hiveDataset.getTable().getPath(), this.ignoreDataPathIdentifierList));
      return;
    }
    if (shouldCreateWorkunit(hiveDataset.getTable(), lowWatermark)) {
      log.info(String.format("Creating workunit for table %s as updateTime %s or createTime %s is greater than low watermark %s", hiveDataset.getTable().getCompleteName(), updateTime, hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
      HiveWorkUnit hiveWorkUnit = workUnitForTable(hiveDataset);
      LongWatermark expectedDatasetHighWatermark = this.watermarker.getExpectedHighWatermark(hiveDataset.getTable(), tableProcessTime);
      hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedDatasetHighWatermark));
      EventWorkunitUtils.setTableSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
      if (hiveDataset instanceof ConvertibleHiveDataset) {
        setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
        log.info("Added lineage event for dataset " + hiveDataset.getUrn());
      }
      this.workunits.add(hiveWorkUnit);
      log.debug(String.format("Workunit added for table: %s", hiveWorkUnit));
    } else {
      log.info(String.format("Not creating workunit for table %s as updateTime %s and createTime %s are not greater than low watermark %s", hiveDataset.getTable().getCompleteName(), updateTime, hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
    }
  } catch (UpdateNotFoundException e) {
    log.error(String.format("Not creating workunit for %s as update time was not found. %s", hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
  } catch (SchemaNotFoundException e) {
    log.error(String.format("Not creating workunit for %s as schema was not found. %s", hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
  }
}
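The log messages above describe the decision rule: a workunit is created when the table's update time or its Hive create time is newer than the previously committed high watermark. A hedged sketch of that comparison with a hypothetical helper; the real HiveSource.shouldCreateWorkunit(...) may normalize time units and apply additional checks:

import org.apache.gobblin.source.extractor.extract.LongWatermark;

public class TableUpdateCheckSketch {
  // Hypothetical helper mirroring the rule in the log messages: create a workunit when
  // either the update time or the create time is newer than the previous high watermark.
  static boolean shouldCreateWorkunit(long updateTimeMillis, long createTimeMillis, LongWatermark lowWatermark) {
    return Math.max(updateTimeMillis, createTimeMillis) > lowWatermark.getValue();
  }

  public static void main(String[] args) {
    LongWatermark lowWatermark = new LongWatermark(1_700_000_000_000L); // illustrative previous high watermark
    System.out.println(shouldCreateWorkunit(1_700_000_100_000L, 1_600_000_000_000L, lowWatermark)); // true
    System.out.println(shouldCreateWorkunit(1_600_000_000_000L, 1_500_000_000_000L, lowWatermark)); // false
  }
}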
Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by Apache.
The class WikipediaSource, method getWorkunits:
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  Map<String, Iterable<WorkUnitState>> previousWorkUnits = state.getPreviousWorkUnitStatesByDatasetUrns();
  List<String> titles = new LinkedList<>(Splitter.on(",").omitEmptyStrings().splitToList(state.getProp(WikipediaExtractor.SOURCE_PAGE_TITLES)));
  Map<String, LongWatermark> prevHighWatermarks = Maps.newHashMap();
  for (Map.Entry<String, Iterable<WorkUnitState>> entry : previousWorkUnits.entrySet()) {
    Iterable<LongWatermark> watermarks = Iterables.transform(entry.getValue(), new Function<WorkUnitState, LongWatermark>() {
      @Override
      public LongWatermark apply(WorkUnitState wus) {
        return wus.getActualHighWatermark(LongWatermark.class);
      }
    });
    watermarks = Iterables.filter(watermarks, Predicates.notNull());
    List<LongWatermark> watermarkList = Lists.newArrayList(watermarks);
    if (watermarkList.size() > 0) {
      prevHighWatermarks.put(entry.getKey(), Collections.max(watermarkList));
    }
  }
  Extract extract = createExtract(TableType.SNAPSHOT_ONLY, state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY), "WikipediaOutput");
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (String title : titles) {
    LongWatermark prevWatermark = prevHighWatermarks.containsKey(title) ? prevHighWatermarks.get(title) : new LongWatermark(-1);
    prevHighWatermarks.remove(title);
    WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(prevWatermark, new LongWatermark(-1)));
    workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, title);
    workUnits.add(workUnit);
  }
  for (Map.Entry<String, LongWatermark> nonProcessedDataset : prevHighWatermarks.entrySet()) {
    WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(nonProcessedDataset.getValue(), nonProcessedDataset.getValue()));
    workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, nonProcessedDataset.getKey());
    workUnits.add(workUnit);
  }
  return workUnits;
}
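The Guava transform/filter/max chain above computes, per dataset URN, the largest non-null actual high watermark from the previous run. An equivalent sketch of that computation using Java 8 streams (an illustrative rewrite, not the project's code):

import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.stream.StreamSupport;

import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.source.extractor.extract.LongWatermark;

public class PreviousWatermarkSketch {
  // Stream-based equivalent of the Guava transform/filter/max pipeline in getWorkunits():
  // keep, for every dataset URN, the largest non-null actual high watermark from the prior run.
  static Map<String, LongWatermark> maxPreviousHighWatermarks(Map<String, Iterable<WorkUnitState>> previousWorkUnits) {
    Map<String, LongWatermark> prevHighWatermarks = new HashMap<>();
    for (Map.Entry<String, Iterable<WorkUnitState>> entry : previousWorkUnits.entrySet()) {
      StreamSupport.stream(entry.getValue().spliterator(), false)
          .map(wus -> wus.getActualHighWatermark(LongWatermark.class))
          .filter(Objects::nonNull)
          .max(LongWatermark::compareTo)
          .ifPresent(max -> prevHighWatermarks.put(entry.getKey(), max));
    }
    return prevHighWatermarks;
  }
}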
Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by Apache.
The class HiveSource, method createWorkunitsForPartitionedTable:
protected void createWorkunitsForPartitionedTable(HiveDataset hiveDataset, AutoReturnableObject<IMetaStoreClient> client) throws IOException {
  boolean setLineageInfo = false;
  long tableProcessTime = new DateTime().getMillis();
  this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
  Optional<String> partitionFilter = Optional.absent();
  // If the table is date partitioned, use the partition name to filter partitions older than lookback
  if (hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.PARTITION_COLUMN) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.DATETIME_FORMAT) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.LOOKBACK)) {
    partitionFilter = Optional.of(new LookbackPartitionFilterGenerator(hiveDataset.getProperties()).getFilter(hiveDataset));
    log.info(String.format("Getting partitions for %s using partition filter %s", hiveDataset.getTable().getCompleteName(), partitionFilter.get()));
  }
  List<Partition> sourcePartitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), partitionFilter);
  for (Partition sourcePartition : sourcePartitions) {
    if (isOlderThanLookback(sourcePartition)) {
      continue;
    }
    LongWatermark lowWatermark = watermarker.getPreviousHighWatermark(sourcePartition);
    try {
      if (!shouldCreateWorkUnit(new Path(sourcePartition.getLocation()))) {
        log.info(String.format("Not creating workunit for partition %s as partition path %s contains data path tokens to ignore %s", sourcePartition.getCompleteName(), sourcePartition.getLocation(), this.ignoreDataPathIdentifierList));
        continue;
      }
      long updateTime = this.updateProvider.getUpdateTime(sourcePartition);
      if (shouldCreateWorkunit(sourcePartition, lowWatermark)) {
        log.debug(String.format("Processing partition: %s", sourcePartition));
        long partitionProcessTime = new DateTime().getMillis();
        this.watermarker.onPartitionProcessBegin(sourcePartition, partitionProcessTime, updateTime);
        LongWatermark expectedPartitionHighWatermark = this.watermarker.getExpectedHighWatermark(sourcePartition, tableProcessTime, partitionProcessTime);
        HiveWorkUnit hiveWorkUnit = workUnitForPartition(hiveDataset, sourcePartition);
        hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedPartitionHighWatermark));
        EventWorkunitUtils.setPartitionSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), sourcePartition, updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
        if (hiveDataset instanceof ConvertibleHiveDataset && !setLineageInfo) {
          setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
          log.info("Added lineage event for dataset " + hiveDataset.getUrn());
          // Add lineage information only once per hive table
          setLineageInfo = true;
        }
        workunits.add(hiveWorkUnit);
        log.info(String.format("Creating workunit for partition %s as updateTime %s is greater than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
      } else {
        // If watermark tracking at a partition level is necessary, create a dummy workunit for this partition here.
        log.info(String.format("Not creating workunit for partition %s as updateTime %s is less than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
      }
    } catch (UpdateNotFoundException e) {
      log.error(String.format("Not creating workunit for %s as update time was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
    } catch (SchemaNotFoundException e) {
      log.error(String.format("Not creating workunit for %s as schema was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
    } catch (UncheckedExecutionException e) {
      log.error(String.format("Not creating workunit for %s because an unchecked exception occurred. %s", sourcePartition.getCompleteName(), e.getMessage()));
    }
  }
}
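The lookback-based partition filter above is enabled only when all three LookbackPartitionFilterGenerator keys are present on the dataset properties. A small sketch of that gating check against plain java.util.Properties; the string key names and values below are illustrative stand-ins, the real constants are defined by LookbackPartitionFilterGenerator:

import java.util.Properties;

public class PartitionFilterGateSketch {
  // Illustrative stand-ins for LookbackPartitionFilterGenerator.PARTITION_COLUMN,
  // DATETIME_FORMAT and LOOKBACK; the real key names are defined by that class.
  static final String PARTITION_COLUMN = "partition.column";
  static final String DATETIME_FORMAT = "partition.datetime.format";
  static final String LOOKBACK = "partition.lookback";

  // Mirrors the guard in createWorkunitsForPartitionedTable(): the lookback-based
  // partition filter is applied only when all three keys are present on the dataset.
  static boolean lookbackFilterEnabled(Properties datasetProperties) {
    return datasetProperties.containsKey(PARTITION_COLUMN)
        && datasetProperties.containsKey(DATETIME_FORMAT)
        && datasetProperties.containsKey(LOOKBACK);
  }

  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty(PARTITION_COLUMN, "datepartition");
    props.setProperty(DATETIME_FORMAT, "yyyy-MM-dd");
    System.out.println(lookbackFilterEnabled(props)); // false until LOOKBACK is also set
    props.setProperty(LOOKBACK, "3d");
    System.out.println(lookbackFilterEnabled(props)); // true
  }
}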
Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by Apache.
The class KafkaWorkUnitPacker, method getWatermarkIntervalFromWorkUnit:
@SuppressWarnings("deprecation")
protected static WatermarkInterval getWatermarkIntervalFromWorkUnit(WorkUnit workUnit) {
  if (workUnit instanceof MultiWorkUnit) {
    return getWatermarkIntervalFromMultiWorkUnit((MultiWorkUnit) workUnit);
  }
  List<Long> lowWatermarkValues = Lists.newArrayList(workUnit.getLowWaterMark());
  List<Long> expectedHighWatermarkValues = Lists.newArrayList(workUnit.getHighWaterMark());
  return new WatermarkInterval(new MultiLongWatermark(lowWatermarkValues), new MultiLongWatermark(expectedHighWatermarkValues));
}
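For a plain WorkUnit, the method wraps the single low and expected-high values into one-element MultiLongWatermarks so that single and multi work units can be handled uniformly during packing. A hedged sketch of folding several such work units into one combined interval (illustrative, not the packer's code; the import path for MultiLongWatermark is assumed from the Kafka extract package):

import java.util.List;

import org.apache.gobblin.source.extractor.WatermarkInterval;
import org.apache.gobblin.source.extractor.extract.kafka.MultiLongWatermark;
import org.apache.gobblin.source.workunit.WorkUnit;

import com.google.common.collect.Lists;

public class PackedWatermarkSketch {
  // Illustrative folding of several single-partition work units into one WatermarkInterval
  // backed by MultiLongWatermarks, in the spirit of the packer above. Relies on the same
  // deprecated long-valued accessors the snippet uses.
  @SuppressWarnings("deprecation")
  static WatermarkInterval combine(List<WorkUnit> workUnits) {
    List<Long> lowWatermarkValues = Lists.newArrayList();
    List<Long> expectedHighWatermarkValues = Lists.newArrayList();
    for (WorkUnit workUnit : workUnits) {
      lowWatermarkValues.add(workUnit.getLowWaterMark());
      expectedHighWatermarkValues.add(workUnit.getHighWaterMark());
    }
    return new WatermarkInterval(
        new MultiLongWatermark(lowWatermarkValues),
        new MultiLongWatermark(expectedHighWatermarkValues));
  }
}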