Search in sources :

Example 6 with WatermarkInterval

use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by apache.

the class KafkaWorkUnitPacker method getWatermarkIntervalFromMultiWorkUnit.

/**
 * Builds one {@link WatermarkInterval} that spans every {@link WorkUnit} contained in a {@link MultiWorkUnit}.
 *
 * <p>The low and high watermark of each member work unit are collected, in iteration order, into two
 * parallel lists which are then wrapped in {@link MultiLongWatermark}s.
 *
 * @param multiWorkUnit the multi work unit whose members are read
 * @return an interval whose low watermark holds all members' low watermarks and whose expected high
 *         watermark holds all members' high watermarks
 */
@SuppressWarnings("deprecation")
protected static WatermarkInterval getWatermarkIntervalFromMultiWorkUnit(MultiWorkUnit multiWorkUnit) {
    List<Long> lows = Lists.newArrayList();
    List<Long> highs = Lists.newArrayList();
    for (WorkUnit member : multiWorkUnit.getWorkUnits()) {
        // Index i of each list belongs to the i-th member work unit.
        lows.add(member.getLowWaterMark());
        highs.add(member.getHighWaterMark());
    }
    MultiLongWatermark lowWatermark = new MultiLongWatermark(lows);
    MultiLongWatermark expectedHighWatermark = new MultiLongWatermark(highs);
    return new WatermarkInterval(lowWatermark, expectedHighWatermark);
}
Also used : WatermarkInterval(org.apache.gobblin.source.extractor.WatermarkInterval) MultiLongWatermark(org.apache.gobblin.source.extractor.extract.kafka.MultiLongWatermark) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 7 with WatermarkInterval

use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by apache.

the class KafkaWorkUnitPacker method squeezeMultiWorkUnit.

/**
 * Merges all {@link WorkUnit}s of a {@link MultiWorkUnit} into a single {@link WorkUnit}.
 *
 * <p>The first member work unit is reused as the carrier: its single-partition properties are removed
 * and replaced by the combined watermark interval, per-partition offset bookkeeping, and the combined
 * partition information.
 *
 * @param multiWorkUnit the multi work unit to squeeze; must yield at least one partition
 * @return the first member work unit, updated in place to represent every partition
 */
protected WorkUnit squeezeMultiWorkUnit(MultiWorkUnit multiWorkUnit) {
    WatermarkInterval combinedInterval = getWatermarkIntervalFromMultiWorkUnit(multiWorkUnit);
    List<KafkaPartition> allPartitions = getPartitionsFromMultiWorkUnit(multiWorkUnit);
    Preconditions.checkArgument(!allPartitions.isEmpty(), "There must be at least one partition in the multiWorkUnit");

    // Squeeze all partitions from the multiWorkUnit into one of the work units; any one would do,
    // so the first is used.
    WorkUnit squeezed = multiWorkUnit.getWorkUnits().get(0);

    // Swap the single-partition watermark keys for the combined interval.
    squeezed.removeProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY);
    squeezed.removeProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY);
    squeezed.setWatermarkInterval(combinedInterval);

    // Carry over the offset fetch epoch time and previous latest offset of every member, keyed by the
    // member's position. These are used to compute the load factor: the gobblin consumption rate
    // relative to the kafka production rate. The kafka rate is computed as
    // (current latest offset - previous latest offset)/(current epoch time - previous epoch time).
    int position = 0;
    for (WorkUnit member : multiWorkUnit.getWorkUnits()) {
        String previousFetchTimeKey =
            KafkaUtils.getPartitionPropName(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME, position);
        squeezed.setProp(previousFetchTimeKey, member.getProp(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME));

        String fetchTimeKey = KafkaUtils.getPartitionPropName(KafkaSource.OFFSET_FETCH_EPOCH_TIME, position);
        squeezed.setProp(fetchTimeKey, member.getProp(KafkaSource.OFFSET_FETCH_EPOCH_TIME));

        String previousLatestOffsetKey =
            KafkaUtils.getPartitionPropName(KafkaSource.PREVIOUS_LATEST_OFFSET, position);
        squeezed.setProp(previousLatestOffsetKey, member.getProp(KafkaSource.PREVIOUS_LATEST_OFFSET));

        position++;
    }

    // The un-indexed keys are dropped only after the loop, because the carrier is itself the first
    // member read above.
    squeezed.removeProp(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME);
    squeezed.removeProp(KafkaSource.OFFSET_FETCH_EPOCH_TIME);
    squeezed.removeProp(KafkaSource.PREVIOUS_LATEST_OFFSET);

    // Remove the original single-partition information.
    squeezed.removeProp(KafkaSource.PARTITION_ID);
    squeezed.removeProp(KafkaSource.LEADER_ID);
    squeezed.removeProp(KafkaSource.LEADER_HOSTANDPORT);

    // Add the combined partitions information.
    populateMultiPartitionWorkUnit(allPartitions, squeezed);
    LOG.info(String.format("Created MultiWorkUnit for partitions %s", allPartitions));
    return squeezed;
}
Also used : WatermarkInterval(org.apache.gobblin.source.extractor.WatermarkInterval) KafkaPartition(org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 8 with WatermarkInterval

use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by apache.

the class PartitionLevelWatermarker method onGetWorkunitsEnd.

/**
 * Appends watermark workunits to <code>workunits</code>. A watermark workunit is a dummy workunit that is
 * skipped by extractor/converter/writer. It stores a map of watermarks with one entry per partition,
 * the partition watermark being the value.
 * <ul>
 * <li>One NoOp watermark workunit is added for each partitioned {@link Table}; non-partitioned tables are skipped.</li>
 * <li>The workunit has an identifier property {@link #IS_WATERMARK_WORKUNIT_KEY} set to true.</li>
 * <li>Watermarks for all {@link Partition}s that belong to this {@link Table} are added as a {@link Map}.</li>
 * <li>A maximum of {@link #maxPartitionsPerDataset} are persisted. Watermarks are ordered by most recently
 * modified {@link Partition}s.</li>
 * </ul>
 * {@inheritDoc}
 * @see org.apache.gobblin.data.management.conversion.hive.watermarker.HiveSourceWatermarker#onGetWorkunitsEnd(java.util.List)
 */
@Override
public void onGetWorkunitsEnd(List<WorkUnit> workunits) {
    try (AutoReturnableObject<IMetaStoreClient> client = this.pool.getClient()) {
        for (Map.Entry<String, Map<String, Long>> entry : this.expectedHighWatermarks.entrySet()) {
            String tableKey = entry.getKey();
            Map<String, Long> partitionWatermarks = entry.getValue();

            // tableKey is the complete table name in the format db@table
            IMetaStoreClient metaStoreClient = client.get();
            String[] dbAndTable = tableKey.split("@");
            org.apache.hadoop.hive.ql.metadata.Table hiveTable =
                new org.apache.hadoop.hive.ql.metadata.Table(metaStoreClient.getTable(dbAndTable[0], dbAndTable[1]));
            if (!HiveUtils.isPartitioned(hiveTable)) {
                continue;
            }

            // Only watermarks of partitions updated at or after leastWatermarkToPersistInState are kept
            Predicate<Map.Entry<String, Long>> recentEnough = new Predicate<Map.Entry<String, Long>>() {

                @Override
                public boolean apply(@Nonnull Map.Entry<String, Long> input) {
                    return input.getValue() >= PartitionLevelWatermarker.this.leastWatermarkToPersistInState;
                }
            };
            Map<String, Long> expectedPartitionWatermarks =
                ImmutableMap.copyOf(Maps.filterEntries(partitionWatermarks, recentEnough));

            // Dummy workunit that tracks all the partition watermarks for this table
            WorkUnit watermarkWorkunit = WorkUnit.createEmpty();
            watermarkWorkunit.setProp(IS_WATERMARK_WORKUNIT_KEY, true);
            watermarkWorkunit.setProp(ConfigurationKeys.DATASET_URN_KEY, tableKey);

            MultiKeyValueLongWatermark previous = new MultiKeyValueLongWatermark(this.previousWatermarks.get(tableKey));
            MultiKeyValueLongWatermark expected = new MultiKeyValueLongWatermark(expectedPartitionWatermarks);
            watermarkWorkunit.setWatermarkInterval(new WatermarkInterval(previous, expected));

            workunits.add(watermarkWorkunit);
        }
    } catch (IOException | TException e) {
        Throwables.propagate(e);
    }
}
Also used : TException(org.apache.thrift.TException) Table(org.apache.hadoop.hive.ql.metadata.Table) Nonnull(javax.annotation.Nonnull) IOException(java.io.IOException) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) Predicate(com.google.common.base.Predicate) WatermarkInterval(org.apache.gobblin.source.extractor.WatermarkInterval) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap)

Example 9 with WatermarkInterval

use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by apache.

the class KafkaDeserializerExtractorTest method getMockWorkUnitState.

/**
 * Creates a {@link WorkUnitState} fixture backed by an empty {@link WorkUnit} whose watermark interval
 * consists of two empty {@link MultiLongWatermark}s, then sets the Kafka topic, partition id, broker
 * and schema registry URL properties on it.
 *
 * @return the populated work unit state
 */
private WorkUnitState getMockWorkUnitState() {
    WorkUnit emptyWorkUnit = WorkUnit.createEmpty();
    MultiLongWatermark emptyLow = new MultiLongWatermark(new ArrayList<Long>());
    MultiLongWatermark emptyExpectedHigh = new MultiLongWatermark(new ArrayList<Long>());
    emptyWorkUnit.setWatermarkInterval(new WatermarkInterval(emptyLow, emptyExpectedHigh));

    WorkUnitState state = new WorkUnitState(emptyWorkUnit, new State());
    state.setProp(KafkaSource.TOPIC_NAME, TEST_TOPIC_NAME);
    state.setProp(KafkaSource.PARTITION_ID, "1");
    state.setProp(ConfigurationKeys.KAFKA_BROKERS, "localhost:8080");
    state.setProp(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_URL, TEST_URL);
    return state;
}
Also used : WatermarkInterval(org.apache.gobblin.source.extractor.WatermarkInterval) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 10 with WatermarkInterval

use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by apache.

the class GoogleWebmasterExtractorTest method getWorkUnitState1.

/**
 * Creates a {@link WorkUnitState} fixture for an append-only extract named "namespace"/"table" whose
 * watermark interval runs from 20160101235959 to 20160102235959.
 *
 * @return a work unit state wrapping the work unit and a fresh, empty {@link State}
 */
public static WorkUnitState getWorkUnitState1() {
    Extract extract = new Extract(Extract.TableType.APPEND_ONLY, "namespace", "table");
    WorkUnit workUnit = new WorkUnit(extract);

    LongWatermark low = new LongWatermark(20160101235959L);
    LongWatermark expectedHigh = new LongWatermark(20160102235959L);
    workUnit.setWatermarkInterval(new WatermarkInterval(low, expectedHigh));

    return new WorkUnitState(workUnit, new State());
}
Also used : WatermarkInterval(org.apache.gobblin.source.extractor.WatermarkInterval) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Extract(org.apache.gobblin.source.workunit.Extract) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) LongWatermark(org.apache.gobblin.source.extractor.extract.LongWatermark)

Aggregations

WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval)11 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)8 LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)6 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)4 MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit)3 Map (java.util.Map)2 State (org.apache.gobblin.configuration.State)2 SchemaNotFoundException (org.apache.gobblin.data.management.conversion.hive.avro.SchemaNotFoundException)2 ConvertibleHiveDataset (org.apache.gobblin.data.management.conversion.hive.dataset.ConvertibleHiveDataset)2 UpdateNotFoundException (org.apache.gobblin.data.management.conversion.hive.provider.UpdateNotFoundException)2 MultiLongWatermark (org.apache.gobblin.source.extractor.extract.kafka.MultiLongWatermark)2 Extract (org.apache.gobblin.source.workunit.Extract)2 DateTime (org.joda.time.DateTime)2 Predicate (com.google.common.base.Predicate)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 UncheckedExecutionException (com.google.common.util.concurrent.UncheckedExecutionException)1 IOException (java.io.IOException)1 LinkedList (java.util.LinkedList)1 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)1 Nonnull (javax.annotation.Nonnull)1