Search in sources :

Example 1 with LookbackPartitionFilterGenerator

use of org.apache.gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator in project incubator-gobblin by apache.

the class HiveSource method createWorkunitsForPartitionedTable.

protected void createWorkunitsForPartitionedTable(HiveDataset hiveDataset, AutoReturnableObject<IMetaStoreClient> client) throws IOException {
    boolean setLineageInfo = false;
    long tableProcessTime = new DateTime().getMillis();
    this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
    Optional<String> partitionFilter = Optional.absent();
    // If the table is date partitioned, use the partition name to filter partitions older than lookback
    if (hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.PARTITION_COLUMN) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.DATETIME_FORMAT) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.LOOKBACK)) {
        partitionFilter = Optional.of(new LookbackPartitionFilterGenerator(hiveDataset.getProperties()).getFilter(hiveDataset));
        log.info(String.format("Getting partitions for %s using partition filter %s", hiveDataset.getTable().getCompleteName(), partitionFilter.get()));
    }
    List<Partition> sourcePartitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), partitionFilter);
    for (Partition sourcePartition : sourcePartitions) {
        if (isOlderThanLookback(sourcePartition)) {
            continue;
        }
        LongWatermark lowWatermark = watermarker.getPreviousHighWatermark(sourcePartition);
        try {
            if (!shouldCreateWorkUnit(new Path(sourcePartition.getLocation()))) {
                log.info(String.format("Not creating workunit for partition %s as partition path %s contains data path tokens to ignore %s", sourcePartition.getCompleteName(), sourcePartition.getLocation(), this.ignoreDataPathIdentifierList));
                continue;
            }
            long updateTime = this.updateProvider.getUpdateTime(sourcePartition);
            if (shouldCreateWorkunit(sourcePartition, lowWatermark)) {
                log.debug(String.format("Processing partition: %s", sourcePartition));
                long partitionProcessTime = new DateTime().getMillis();
                this.watermarker.onPartitionProcessBegin(sourcePartition, partitionProcessTime, updateTime);
                LongWatermark expectedPartitionHighWatermark = this.watermarker.getExpectedHighWatermark(sourcePartition, tableProcessTime, partitionProcessTime);
                HiveWorkUnit hiveWorkUnit = workUnitForPartition(hiveDataset, sourcePartition);
                hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedPartitionHighWatermark));
                EventWorkunitUtils.setPartitionSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), sourcePartition, updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
                if (hiveDataset instanceof ConvertibleHiveDataset && !setLineageInfo) {
                    setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
                    log.info("Added lineage event for dataset " + hiveDataset.getUrn());
                    // Add lineage information only once per hive table
                    setLineageInfo = true;
                }
                workunits.add(hiveWorkUnit);
                log.info(String.format("Creating workunit for partition %s as updateTime %s is greater than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
            } else {
                // If watermark tracking at a partition level is necessary, create a dummy workunit for this partition here.
                log.info(String.format("Not creating workunit for partition %s as updateTime %s is lesser than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
            }
        } catch (UpdateNotFoundException e) {
            log.error(String.format("Not creating workunit for %s as update time was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
        } catch (SchemaNotFoundException e) {
            log.error(String.format("Not creating workunit for %s as schema was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
        } catch (UncheckedExecutionException e) {
            log.error(String.format("Not creating workunit for %s because an unchecked exception occurred. %s", sourcePartition.getCompleteName(), e.getMessage()));
        }
    }
}
Also used : LookbackPartitionFilterGenerator(org.apache.gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator) Path(org.apache.hadoop.fs.Path) UpdateNotFoundException(org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException) Partition(org.apache.hadoop.hive.ql.metadata.Partition) UncheckedExecutionException(com.google.common.util.concurrent.UncheckedExecutionException) ConvertibleHiveDataset(org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset) DateTime(org.joda.time.DateTime) WatermarkInterval(org.apache.gobblin.source.extractor.WatermarkInterval) SchemaNotFoundException(org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)

Aggregations

UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException)1 SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException)1 ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset)1 UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException)1 LookbackPartitionFilterGenerator (org.apache.gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator)1 WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval)1 LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)1 Path (org.apache.hadoop.fs.Path)1 Partition (org.apache.hadoop.hive.ql.metadata.Partition)1 DateTime (org.joda.time.DateTime)1