Use of org.apache.gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator in the project incubator-gobblin by Apache.
Class HiveSource, method createWorkunitsForPartitionedTable.
/**
 * Creates work units for the partitions of a partitioned Hive table and adds them to
 * {@code workunits}.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>Notifies the watermarker that processing of the table has begun.</li>
 *   <li>If the dataset properties contain all of
 *       {@link LookbackPartitionFilterGenerator#PARTITION_COLUMN},
 *       {@link LookbackPartitionFilterGenerator#DATETIME_FORMAT} and
 *       {@link LookbackPartitionFilterGenerator#LOOKBACK}, builds a partition filter from them;
 *       otherwise no filter is used.</li>
 *   <li>Fetches the partitions through {@code HiveUtils.getPartitions} and, for each one that is
 *       not skipped by {@code isOlderThanLookback} or by the path-based
 *       {@code shouldCreateWorkUnit} check, looks up its update time and asks
 *       {@code shouldCreateWorkunit(partition, lowWatermark)} whether a work unit is needed.</li>
 *   <li>For accepted partitions, builds the work unit, sets its watermark interval and SLA event
 *       metadata, and (once per table, for {@code ConvertibleHiveDataset} only) attaches
 *       lineage information.</li>
 * </ol>
 *
 * <p>An {@code UpdateNotFoundException}, {@code SchemaNotFoundException} or
 * {@code UncheckedExecutionException} raised while handling a single partition is logged
 * (including the exception itself, so that the stack trace and cause are preserved) and the loop
 * moves on to the next partition.
 *
 * @param hiveDataset the dataset whose table partitions are examined
 * @param client holder of the metastore client used to list partitions
 * @throws IOException declared by this method; the throwing call is not identifiable from this
 *     method alone (presumably partition listing or work unit creation)
 */
protected void createWorkunitsForPartitionedTable(HiveDataset hiveDataset, AutoReturnableObject<IMetaStoreClient> client) throws IOException {
  // Tracks whether lineage has already been attached for this table.
  boolean setLineageInfo = false;
  long tableProcessTime = new DateTime().getMillis();
  this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);

  Optional<String> partitionFilter = Optional.absent();
  // If the table is date partitioned, use the partition name to filter partitions older than lookback
  if (hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.PARTITION_COLUMN)
      && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.DATETIME_FORMAT)
      && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.LOOKBACK)) {
    partitionFilter = Optional.of(new LookbackPartitionFilterGenerator(hiveDataset.getProperties()).getFilter(hiveDataset));
    log.info(String.format("Getting partitions for %s using partition filter %s", hiveDataset.getTable().getCompleteName(), partitionFilter.get()));
  }

  List<Partition> sourcePartitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), partitionFilter);
  for (Partition sourcePartition : sourcePartitions) {
    if (isOlderThanLookback(sourcePartition)) {
      continue;
    }
    LongWatermark lowWatermark = watermarker.getPreviousHighWatermark(sourcePartition);
    try {
      // Path-based exclusion: skip partitions whose location is rejected by shouldCreateWorkUnit(Path).
      if (!shouldCreateWorkUnit(new Path(sourcePartition.getLocation()))) {
        log.info(String.format("Not creating workunit for partition %s as partition path %s contains data path tokens to ignore %s", sourcePartition.getCompleteName(), sourcePartition.getLocation(), this.ignoreDataPathIdentifierList));
        continue;
      }
      long updateTime = this.updateProvider.getUpdateTime(sourcePartition);
      if (shouldCreateWorkunit(sourcePartition, lowWatermark)) {
        log.debug(String.format("Processing partition: %s", sourcePartition));
        long partitionProcessTime = new DateTime().getMillis();
        this.watermarker.onPartitionProcessBegin(sourcePartition, partitionProcessTime, updateTime);
        LongWatermark expectedPartitionHighWatermark = this.watermarker.getExpectedHighWatermark(sourcePartition, tableProcessTime, partitionProcessTime);
        HiveWorkUnit hiveWorkUnit = workUnitForPartition(hiveDataset, sourcePartition);
        hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedPartitionHighWatermark));
        EventWorkunitUtils.setPartitionSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), sourcePartition, updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
        if (hiveDataset instanceof ConvertibleHiveDataset && !setLineageInfo) {
          setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
          log.info("Added lineage event for dataset " + hiveDataset.getUrn());
          // Add lineage information only once per hive table
          setLineageInfo = true;
        }
        workunits.add(hiveWorkUnit);
        log.info(String.format("Creating workunit for partition %s as updateTime %s is greater than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
      } else {
        // If watermark tracking at a partition level is necessary, create a dummy workunit for this partition here.
        log.info(String.format("Not creating workunit for partition %s as updateTime %s is lesser than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
      }
    } catch (UpdateNotFoundException e) {
      // The exception is passed to the logger as well so the stack trace and cause are not lost.
      log.error(String.format("Not creating workunit for %s as update time was not found. %s", sourcePartition.getCompleteName(), e.getMessage()), e);
    } catch (SchemaNotFoundException e) {
      log.error(String.format("Not creating workunit for %s as schema was not found. %s", sourcePartition.getCompleteName(), e.getMessage()), e);
    } catch (UncheckedExecutionException e) {
      // The real failure is the wrapped cause; logging the throwable keeps it visible.
      log.error(String.format("Not creating workunit for %s because an unchecked exception occurred. %s", sourcePartition.getCompleteName(), e.getMessage()), e);
    }
  }
}
Aggregations