
Example 1 with WatermarkInterval

Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by apache.

From the class SequentialTestSource, method initialWorkUnits:

private List<WorkUnit> initialWorkUnits() {
    List<WorkUnit> workUnits = Lists.newArrayList();
    for (int i = 0; i < num_parallelism; i++) {
        WorkUnit workUnit = WorkUnit.create(newExtract(Extract.TableType.APPEND_ONLY, namespace, table));
        // Work unit i covers the contiguous record range
        // [i * numRecordsPerExtract + 1, (i + 1) * numRecordsPerExtract].
        LongWatermark lowWatermark = new LongWatermark(i * numRecordsPerExtract + 1);
        LongWatermark expectedHighWatermark = new LongWatermark((i + 1) * numRecordsPerExtract);
        workUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedHighWatermark));
        workUnit.setProp(WORK_UNIT_INDEX, i);
        workUnits.add(workUnit);
    }
    return workUnits;
}
Also used: WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
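
The interval set above travels with the work unit and is read back at run time. A minimal sketch, assuming only the API visible on this page (the deprecated long-valued getLowWaterMark()/getHighWaterMark() getters are the same ones used in Example 5 below):

import org.apache.gobblin.source.workunit.WorkUnit;

public class WatermarkIntervalDemo {

    // Prints the record range a work unit from initialWorkUnits() is expected
    // to pull; the deprecated getters return the long values of the
    // LongWatermarks stored via setWatermarkInterval(...).
    @SuppressWarnings("deprecation")
    static void printExpectedRange(WorkUnit workUnit) {
        long low = workUnit.getLowWaterMark();
        long high = workUnit.getHighWaterMark();
        System.out.println("Work unit covers records [" + low + ", " + high + "]");
    }
}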

Example 2 with WatermarkInterval

Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by apache.

From the class HiveSource, method createWorkunitForNonPartitionedTable:

protected void createWorkunitForNonPartitionedTable(HiveDataset hiveDataset) throws IOException {
    // Create workunits for tables
    try {
        long tableProcessTime = new DateTime().getMillis();
        long updateTime = this.updateProvider.getUpdateTime(hiveDataset.getTable());
        this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
        LongWatermark lowWatermark = this.watermarker.getPreviousHighWatermark(hiveDataset.getTable());
        if (!shouldCreateWorkUnit(hiveDataset.getTable().getPath())) {
            log.info(String.format("Not creating workunit for table %s as partition path %s contains data path tokens to ignore %s", hiveDataset.getTable().getCompleteName(), hiveDataset.getTable().getPath(), this.ignoreDataPathIdentifierList));
            return;
        }
        if (shouldCreateWorkunit(hiveDataset.getTable(), lowWatermark)) {
            log.info(String.format("Creating workunit for table %s as updateTime %s or createTime %s is greater than low watermark %s", hiveDataset.getTable().getCompleteName(), updateTime, hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
            HiveWorkUnit hiveWorkUnit = workUnitForTable(hiveDataset);
            LongWatermark expectedDatasetHighWatermark = this.watermarker.getExpectedHighWatermark(hiveDataset.getTable(), tableProcessTime);
            hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedDatasetHighWatermark));
            EventWorkunitUtils.setTableSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
            if (hiveDataset instanceof ConvertibleHiveDataset) {
                setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
                log.info("Added lineage event for dataset " + hiveDataset.getUrn());
            }
            this.workunits.add(hiveWorkUnit);
            log.debug(String.format("Workunit added for table: %s", hiveWorkUnit));
        } else {
            log.info(String.format("Not creating workunit for table %s as updateTime %s and createTime %s is not greater than low watermark %s", hiveDataset.getTable().getCompleteName(), updateTime, hiveDataset.getTable().getTTable().getCreateTime(), lowWatermark.getValue()));
        }
    } catch (UpdateNotFoundException e) {
        log.error(String.format("Not Creating workunit for %s as update time was not found. %s", hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
    } catch (SchemaNotFoundException e) {
        log.error(String.format("Not Creating workunit for %s as schema was not found. %s", hiveDataset.getTable().getCompleteName(), e.getMessage()), e);
    }
}
Also used: WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval), UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException), ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset), SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException), DateTime (org.joda.time.DateTime), LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
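
Both HiveSource methods shown on this page pair the previous run's high watermark with an expected high watermark computed for the current run. A standalone sketch of that pairing with illustrative values; the getLowWatermark()/getExpectedHighWatermark() accessor names are an assumption mirroring the constructor arguments, while getValue() is the same getter HiveSource calls on its low watermark above:

import org.apache.gobblin.source.extractor.WatermarkInterval;
import org.apache.gobblin.source.extractor.extract.LongWatermark;

public class IntervalSketch {
    public static void main(String[] args) {
        // Previous run committed up to 1000; this run is expected to reach 2000.
        LongWatermark previousHigh = new LongWatermark(1000L);
        LongWatermark expectedHigh = new LongWatermark(2000L);
        WatermarkInterval interval = new WatermarkInterval(previousHigh, expectedHigh);
        // Assumed accessors, named after the constructor arguments.
        System.out.println(((LongWatermark) interval.getLowWatermark()).getValue());          // 1000
        System.out.println(((LongWatermark) interval.getExpectedHighWatermark()).getValue()); // 2000
    }
}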

Example 3 with WatermarkInterval

Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by apache.

From the class WikipediaSource, method getWorkunits:

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    Map<String, Iterable<WorkUnitState>> previousWorkUnits = state.getPreviousWorkUnitStatesByDatasetUrns();
    List<String> titles = new LinkedList<>(Splitter.on(",").omitEmptyStrings().splitToList(state.getProp(WikipediaExtractor.SOURCE_PAGE_TITLES)));
    Map<String, LongWatermark> prevHighWatermarks = Maps.newHashMap();
    for (Map.Entry<String, Iterable<WorkUnitState>> entry : previousWorkUnits.entrySet()) {
        Iterable<LongWatermark> watermarks = Iterables.transform(entry.getValue(), new Function<WorkUnitState, LongWatermark>() {

            @Override
            public LongWatermark apply(WorkUnitState wus) {
                return wus.getActualHighWatermark(LongWatermark.class);
            }
        });
        watermarks = Iterables.filter(watermarks, Predicates.notNull());
        List<LongWatermark> watermarkList = Lists.newArrayList(watermarks);
        if (!watermarkList.isEmpty()) {
            prevHighWatermarks.put(entry.getKey(), Collections.max(watermarkList));
        }
    }
    Extract extract = createExtract(TableType.SNAPSHOT_ONLY, state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY), "WikipediaOutput");
    List<WorkUnit> workUnits = Lists.newArrayList();
    for (String title : titles) {
        LongWatermark prevWatermark = prevHighWatermarks.containsKey(title) ? prevHighWatermarks.get(title) : new LongWatermark(-1);
        prevHighWatermarks.remove(title);
        // The expected high watermark is set to -1 because the actual high
        // watermark is only known after the pull completes.
        WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(prevWatermark, new LongWatermark(-1)));
        workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, title);
        workUnits.add(workUnit);
    }
    for (Map.Entry<String, LongWatermark> nonProcessedDataset : prevHighWatermarks.entrySet()) {
        WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(nonProcessedDataset.getValue(), nonProcessedDataset.getValue()));
        workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, nonProcessedDataset.getKey());
        workUnits.add(workUnit);
    }
    return workUnits;
}
Also used: WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), Extract (org.apache.gobblin.source.workunit.Extract), LinkedList (java.util.LinkedList), WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), Map (java.util.Map), LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
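
The anonymous Guava Function and Predicates.notNull() filter above predate Java 8. The same per-dataset aggregation can be written with streams; an illustrative rewrite, not from the Gobblin sources, relying on the same natural ordering of LongWatermark that Collections.max already uses in the original:

import java.util.Comparator;
import java.util.Map;
import java.util.Objects;
import java.util.stream.StreamSupport;
import com.google.common.collect.Maps;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.source.extractor.extract.LongWatermark;

public class PrevWatermarks {

    // For each dataset URN, keep the maximum actual high watermark reported by
    // any previous work unit; datasets with no non-null watermark are skipped,
    // mirroring the emptiness guard in the original loop.
    static Map<String, LongWatermark> maxByUrn(Map<String, Iterable<WorkUnitState>> previousWorkUnits) {
        Map<String, LongWatermark> prevHighWatermarks = Maps.newHashMap();
        for (Map.Entry<String, Iterable<WorkUnitState>> entry : previousWorkUnits.entrySet()) {
            StreamSupport.stream(entry.getValue().spliterator(), false)
                .map(wus -> wus.getActualHighWatermark(LongWatermark.class))
                .filter(Objects::nonNull)
                .max(Comparator.naturalOrder())
                .ifPresent(max -> prevHighWatermarks.put(entry.getKey(), max));
        }
        return prevHighWatermarks;
    }
}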

Example 4 with WatermarkInterval

Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by apache.

From the class HiveSource, method createWorkunitsForPartitionedTable:

protected void createWorkunitsForPartitionedTable(HiveDataset hiveDataset, AutoReturnableObject<IMetaStoreClient> client) throws IOException {
    boolean setLineageInfo = false;
    long tableProcessTime = new DateTime().getMillis();
    this.watermarker.onTableProcessBegin(hiveDataset.getTable(), tableProcessTime);
    Optional<String> partitionFilter = Optional.absent();
    // If the table is date partitioned, use the partition name to filter partitions older than lookback
    if (hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.PARTITION_COLUMN) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.DATETIME_FORMAT) && hiveDataset.getProperties().containsKey(LookbackPartitionFilterGenerator.LOOKBACK)) {
        partitionFilter = Optional.of(new LookbackPartitionFilterGenerator(hiveDataset.getProperties()).getFilter(hiveDataset));
        log.info(String.format("Getting partitions for %s using partition filter %s", hiveDataset.getTable().getCompleteName(), partitionFilter.get()));
    }
    List<Partition> sourcePartitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), partitionFilter);
    for (Partition sourcePartition : sourcePartitions) {
        if (isOlderThanLookback(sourcePartition)) {
            continue;
        }
        LongWatermark lowWatermark = watermarker.getPreviousHighWatermark(sourcePartition);
        try {
            if (!shouldCreateWorkUnit(new Path(sourcePartition.getLocation()))) {
                log.info(String.format("Not creating workunit for partition %s as partition path %s contains data path tokens to ignore %s", sourcePartition.getCompleteName(), sourcePartition.getLocation(), this.ignoreDataPathIdentifierList));
                continue;
            }
            long updateTime = this.updateProvider.getUpdateTime(sourcePartition);
            if (shouldCreateWorkunit(sourcePartition, lowWatermark)) {
                log.debug(String.format("Processing partition: %s", sourcePartition));
                long partitionProcessTime = new DateTime().getMillis();
                this.watermarker.onPartitionProcessBegin(sourcePartition, partitionProcessTime, updateTime);
                LongWatermark expectedPartitionHighWatermark = this.watermarker.getExpectedHighWatermark(sourcePartition, tableProcessTime, partitionProcessTime);
                HiveWorkUnit hiveWorkUnit = workUnitForPartition(hiveDataset, sourcePartition);
                hiveWorkUnit.setWatermarkInterval(new WatermarkInterval(lowWatermark, expectedPartitionHighWatermark));
                EventWorkunitUtils.setPartitionSlaEventMetadata(hiveWorkUnit, hiveDataset.getTable(), sourcePartition, updateTime, lowWatermark.getValue(), this.beginGetWorkunitsTime);
                if (hiveDataset instanceof ConvertibleHiveDataset && !setLineageInfo) {
                    setLineageInfo((ConvertibleHiveDataset) hiveDataset, hiveWorkUnit, this.sharedJobBroker);
                    log.info("Added lineage event for dataset " + hiveDataset.getUrn());
                    // Add lineage information only once per hive table
                    setLineageInfo = true;
                }
                workunits.add(hiveWorkUnit);
                log.info(String.format("Creating workunit for partition %s as updateTime %s is greater than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
            } else {
                // If watermark tracking at a partition level is necessary, create a dummy workunit for this partition here.
                log.info(String.format("Not creating workunit for partition %s as updateTime %s is lesser than low watermark %s", sourcePartition.getCompleteName(), updateTime, lowWatermark.getValue()));
            }
        } catch (UpdateNotFoundException e) {
            log.error(String.format("Not creating workunit for %s as update time was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
        } catch (SchemaNotFoundException e) {
            log.error(String.format("Not creating workunit for %s as schema was not found. %s", sourcePartition.getCompleteName(), e.getMessage()));
        } catch (UncheckedExecutionException e) {
            log.error(String.format("Not creating workunit for %s because an unchecked exception occurred. %s", sourcePartition.getCompleteName(), e.getMessage()));
        }
    }
}
Also used: LookbackPartitionFilterGenerator (org.apache.gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator), Path (org.apache.hadoop.fs.Path), UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException), Partition (org.apache.hadoop.hive.ql.metadata.Partition), UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException), ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset), DateTime (org.joda.time.DateTime), WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval), SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException), LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
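
The lookback filter only engages when all three generator properties are present on the dataset. A hypothetical configuration sketch: the keys are the constants referenced in the code above, but the value strings here are made-up placeholders, since their expected formats are defined by LookbackPartitionFilterGenerator itself:

import java.util.Properties;
import org.apache.gobblin.data.management.copy.hive.filter.LookbackPartitionFilterGenerator;

public class LookbackConfigSketch {
    static Properties lookbackFilterProps() {
        Properties props = new Properties();
        // Partition column holding the date string (placeholder value).
        props.setProperty(LookbackPartitionFilterGenerator.PARTITION_COLUMN, "datepartition");
        // Pattern for parsing that column's values (placeholder value).
        props.setProperty(LookbackPartitionFilterGenerator.DATETIME_FORMAT, "yyyy-MM-dd");
        // How far back to keep partitions; older ones are filtered out at the
        // metastore before isOlderThanLookback() re-checks them (placeholder value).
        props.setProperty(LookbackPartitionFilterGenerator.LOOKBACK, "3d");
        return props;
    }
}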

Example 5 with WatermarkInterval

Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by apache.

From the class KafkaWorkUnitPacker, method getWatermarkIntervalFromWorkUnit:

@SuppressWarnings("deprecation")
protected static WatermarkInterval getWatermarkIntervalFromWorkUnit(WorkUnit workUnit) {
    if (workUnit instanceof MultiWorkUnit) {
        return getWatermarkIntervalFromMultiWorkUnit((MultiWorkUnit) workUnit);
    }
    List<Long> lowWatermarkValues = Lists.newArrayList(workUnit.getLowWaterMark());
    List<Long> expectedHighWatermarkValues = Lists.newArrayList(workUnit.getHighWaterMark());
    return new WatermarkInterval(new MultiLongWatermark(lowWatermarkValues), new MultiLongWatermark(expectedHighWatermarkValues));
}
Also used: WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval), MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit), MultiLongWatermark (org.apache.gobblin.source.extractor.extract.kafka.MultiLongWatermark)
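
The MultiWorkUnit branch delegates to getWatermarkIntervalFromMultiWorkUnit, which this page does not show. A sketch of a plausible counterpart, under the assumption that it simply collects one low and one expected-high value per packed work unit:

import java.util.List;
import com.google.common.collect.Lists;
import org.apache.gobblin.source.extractor.WatermarkInterval;
import org.apache.gobblin.source.extractor.extract.kafka.MultiLongWatermark;
import org.apache.gobblin.source.workunit.MultiWorkUnit;
import org.apache.gobblin.source.workunit.WorkUnit;

public class MultiWorkUnitWatermarks {

    // Collects the (deprecated) long-valued low/high watermarks of every packed
    // work unit into parallel lists, mirroring the single-work-unit branch above.
    @SuppressWarnings("deprecation")
    static WatermarkInterval getWatermarkIntervalFromMultiWorkUnit(MultiWorkUnit multiWorkUnit) {
        List<Long> lowWatermarkValues = Lists.newArrayList();
        List<Long> expectedHighWatermarkValues = Lists.newArrayList();
        for (WorkUnit workUnit : multiWorkUnit.getWorkUnits()) {
            lowWatermarkValues.add(workUnit.getLowWaterMark());
            expectedHighWatermarkValues.add(workUnit.getHighWaterMark());
        }
        return new WatermarkInterval(new MultiLongWatermark(lowWatermarkValues),
            new MultiLongWatermark(expectedHighWatermarkValues));
    }
}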

Aggregations

Types co-occurring with WatermarkInterval across the indexed examples, with usage counts:

WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 11
WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 8
LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark): 6
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 4
MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 3
Map (java.util.Map): 2
State (org.apache.gobblin.configuration.State): 2
SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException): 2
ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset): 2
UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException): 2
MultiLongWatermark (org.apache.gobblin.source.extractor.extract.kafka.MultiLongWatermark): 2
Extract (org.apache.gobblin.source.workunit.Extract): 2
DateTime (org.joda.time.DateTime): 2
Predicate (com.google.common.base.Predicate): 1
ImmutableMap (com.google.common.collect.ImmutableMap): 1
UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException): 1
IOException (java.io.IOException): 1
LinkedList (java.util.LinkedList): 1
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 1
Nonnull (javax.annotation.Nonnull): 1