Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by Apache.
Class KafkaWorkUnitPacker, method getWatermarkIntervalFromMultiWorkUnit.
/**
 * Builds a single {@link WatermarkInterval} that covers every {@link WorkUnit} in a {@link MultiWorkUnit}.
 *
 * <p>The first watermark of the result is a {@link MultiLongWatermark} holding each work unit's low
 * watermark; the second is a {@link MultiLongWatermark} holding each work unit's high watermark. Both
 * lists follow the iteration order of {@code multiWorkUnit.getWorkUnits()}, so entry {@code i} of each
 * list belongs to the {@code i}-th work unit.
 *
 * @param multiWorkUnit container whose work units supply the watermark values
 * @return an interval with one low and one expected-high entry per contained work unit
 */
@SuppressWarnings("deprecation")
protected static WatermarkInterval getWatermarkIntervalFromMultiWorkUnit(MultiWorkUnit multiWorkUnit) {
  List<Long> lows = Lists.newArrayList();
  List<Long> expectedHighs = Lists.newArrayList();

  for (WorkUnit member : multiWorkUnit.getWorkUnits()) {
    // NOTE(review): these accessors are presumably the deprecated calls the annotation silences.
    lows.add(member.getLowWaterMark());
    expectedHighs.add(member.getHighWaterMark());
  }

  MultiLongWatermark lowWatermark = new MultiLongWatermark(lows);
  MultiLongWatermark expectedHighWatermark = new MultiLongWatermark(expectedHighs);
  return new WatermarkInterval(lowWatermark, expectedHighWatermark);
}
Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by Apache.
Class KafkaWorkUnitPacker, method squeezeMultiWorkUnit.
/**
 * Collapses all {@link WorkUnit}s of a {@link MultiWorkUnit} into one {@link WorkUnit}.
 *
 * <p>The first contained work unit is reused as the result and is mutated in place:
 * <ul>
 *   <li>its low/high watermark properties are removed and replaced by a combined
 *       {@link WatermarkInterval} covering every contained work unit;</li>
 *   <li>the offset-fetch epoch times and previous latest offset of every contained work unit are
 *       copied onto it under per-index property names, and the un-indexed originals are removed;</li>
 *   <li>its single-partition properties (partition id, leader id, leader host and port) are removed
 *       and the combined partition information is written instead.</li>
 * </ul>
 *
 * @param multiWorkUnit the work units to merge; must yield at least one partition
 * @return the first work unit of {@code multiWorkUnit}, updated to represent all of its partitions
 * @throws IllegalArgumentException if no partitions are found in {@code multiWorkUnit}
 */
protected WorkUnit squeezeMultiWorkUnit(MultiWorkUnit multiWorkUnit) {
  WatermarkInterval combinedInterval = getWatermarkIntervalFromMultiWorkUnit(multiWorkUnit);
  List<KafkaPartition> allPartitions = getPartitionsFromMultiWorkUnit(multiWorkUnit);
  Preconditions.checkArgument(!allPartitions.isEmpty(), "There must be at least one partition in the multiWorkUnit");

  // Squeeze all partitions into one of the work units; any of them would do, so take the first.
  List<WorkUnit> members = multiWorkUnit.getWorkUnits();
  WorkUnit squeezed = members.get(0);

  // Swap the single-work-unit watermark properties for the combined interval.
  squeezed.removeProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY);
  squeezed.removeProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY);
  squeezed.setWatermarkInterval(combinedInterval);

  // Carry over each member's offset-fetch epoch times and previous latest offset under indexed names.
  // They feed the load factor: gobblin consumption rate relative to the kafka production rate, where
  // the kafka rate is (current latest offset - previous latest offset)/(current epoch time - previous epoch time).
  for (int position = 0; position < members.size(); position++) {
    WorkUnit member = members.get(position);
    squeezed.setProp(
        KafkaUtils.getPartitionPropName(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME, position),
        member.getProp(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME));
    squeezed.setProp(
        KafkaUtils.getPartitionPropName(KafkaSource.OFFSET_FETCH_EPOCH_TIME, position),
        member.getProp(KafkaSource.OFFSET_FETCH_EPOCH_TIME));
    squeezed.setProp(
        KafkaUtils.getPartitionPropName(KafkaSource.PREVIOUS_LATEST_OFFSET, position),
        member.getProp(KafkaSource.PREVIOUS_LATEST_OFFSET));
  }

  // The un-indexed values must go only after the loop: the target is itself the first member read above.
  squeezed.removeProp(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME);
  squeezed.removeProp(KafkaSource.OFFSET_FETCH_EPOCH_TIME);
  squeezed.removeProp(KafkaSource.PREVIOUS_LATEST_OFFSET);

  // Drop the target's original single-partition information ...
  squeezed.removeProp(KafkaSource.PARTITION_ID);
  squeezed.removeProp(KafkaSource.LEADER_ID);
  squeezed.removeProp(KafkaSource.LEADER_HOSTANDPORT);

  // ... and record the combined partitions in its place.
  populateMultiPartitionWorkUnit(allPartitions, squeezed);

  LOG.info(String.format("Created MultiWorkUnit for partitions %s", allPartitions));
  return squeezed;
}
Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by Apache.
Class PartitionLevelWatermarker, method onGetWorkunitsEnd.
/**
 * Adds watermark workunits to <code>workunits</code>. A watermark workunit is a dummy workunit that
 * carries no data of its own; it stores a map of watermarks with one entry per partition and the
 * partition watermark as value.
 * <ul>
 *   <li>One watermark workunit is added for each partitioned table that has an entry in the expected
 *       high watermarks; tables that are not partitioned are skipped.</li>
 *   <li>The workunit has the identifier property {@link #IS_WATERMARK_WORKUNIT_KEY} set to true and
 *       its dataset URN set to the table key.</li>
 *   <li>The low watermark of the workunit is built from the previous watermarks recorded for the
 *       table; the expected high watermark is built from the table's partition watermarks.</li>
 *   <li>Only partition watermarks greater than or equal to <code>leastWatermarkToPersistInState</code>
 *       are kept in the expected high watermark.</li>
 * </ul>
 * NOTE(review): the watermark workunit is presumably skipped by extractor/converter/writer, and the
 * persistence threshold is presumably derived from a per-dataset partition limit; neither is visible
 * in this method, so confirm against the rest of the class.
 *
 * {@inheritDoc}
 * @param workunits the list that the watermark workunits are appended to
 * @throws RuntimeException wrapping any {@link IOException} or {@link TException} raised while
 *         talking to the metastore
 * @see org.apache.gobblin.data.management.conversion.hive.watermarker.HiveSourceWatermarker#onGetWorkunitsEnd(java.util.List)
 */
@Override
public void onGetWorkunitsEnd(List<WorkUnit> workunits) {
  try (AutoReturnableObject<IMetaStoreClient> client = this.pool.getClient()) {
    for (Map.Entry<String, Map<String, Long>> tableWatermark : this.expectedHighWatermarks.entrySet()) {
      String tableKey = tableWatermark.getKey();
      Map<String, Long> partitionWatermarks = tableWatermark.getValue();

      // tableKey is the complete table name in the format db@table; split it once and reuse both parts.
      String[] dbAndTable = tableKey.split("@");
      if (!HiveUtils.isPartitioned(
          new org.apache.hadoop.hive.ql.metadata.Table(client.get().getTable(dbAndTable[0], dbAndTable[1])))) {
        continue;
      }

      // We only keep watermarks for partitions that were updated at or after leastWatermarkToPersistInState
      Map<String, Long> expectedPartitionWatermarks =
          ImmutableMap.copyOf(Maps.filterEntries(partitionWatermarks, new Predicate<Map.Entry<String, Long>>() {
            @Override
            public boolean apply(@Nonnull Map.Entry<String, Long> input) {
              return input.getValue() >= PartitionLevelWatermarker.this.leastWatermarkToPersistInState;
            }
          }));

      // Create dummy workunit to track all the partition watermarks for this table
      WorkUnit watermarkWorkunit = WorkUnit.createEmpty();
      watermarkWorkunit.setProp(IS_WATERMARK_WORKUNIT_KEY, true);
      watermarkWorkunit.setProp(ConfigurationKeys.DATASET_URN_KEY, tableKey);
      watermarkWorkunit.setWatermarkInterval(new WatermarkInterval(
          new MultiKeyValueLongWatermark(this.previousWatermarks.get(tableKey)),
          new MultiKeyValueLongWatermark(expectedPartitionWatermarks)));

      workunits.add(watermarkWorkunit);
    }
  } catch (IOException | TException e) {
    // Both caught types are checked exceptions, so this is exactly what the deprecated
    // Throwables.propagate(e) did, with the throw made explicit for readers and the compiler.
    throw new RuntimeException(e);
  }
}
Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by Apache.
Class KafkaDeserializerExtractorTest, method getMockWorkUnitState.
/**
 * Builds the {@link WorkUnitState} fixture used by these tests.
 *
 * <p>The underlying work unit is empty apart from a {@link WatermarkInterval} whose low and expected
 * high watermarks are both {@link MultiLongWatermark}s over empty lists. The state additionally
 * carries the test topic name, partition id {@code "1"}, a broker at {@code localhost:8080} and the
 * test schema registry URL.
 *
 * @return a freshly created work unit state; every call returns a new instance
 */
private WorkUnitState getMockWorkUnitState() {
  WorkUnit emptyWorkUnit = WorkUnit.createEmpty();

  // Two distinct empty lists, one backing each end of the interval.
  MultiLongWatermark emptyLow = new MultiLongWatermark(new ArrayList<Long>());
  MultiLongWatermark emptyExpectedHigh = new MultiLongWatermark(new ArrayList<Long>());
  emptyWorkUnit.setWatermarkInterval(new WatermarkInterval(emptyLow, emptyExpectedHigh));

  WorkUnitState state = new WorkUnitState(emptyWorkUnit, new State());
  state.setProp(KafkaSource.TOPIC_NAME, TEST_TOPIC_NAME);
  state.setProp(KafkaSource.PARTITION_ID, "1");
  state.setProp(ConfigurationKeys.KAFKA_BROKERS, "localhost:8080");
  state.setProp(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_URL, TEST_URL);
  return state;
}
Use of org.apache.gobblin.source.extractor.WatermarkInterval in project incubator-gobblin by Apache.
Class GoogleWebmasterExtractorTest, method getWorkUnitState1.
/**
 * Builds a {@link WorkUnitState} fixture for an append-only extract of {@code namespace.table}.
 *
 * <p>The work unit's {@link WatermarkInterval} runs from {@code 20160101235959} to
 * {@code 20160102235959}. NOTE(review): these values look like {@code yyyyMMddHHmmss} timestamps one
 * day apart; confirm against the extractor under test.
 *
 * @return a new work unit state wrapping the work unit and an empty job {@link State}
 */
public static WorkUnitState getWorkUnitState1() {
  Extract appendOnlyExtract = new Extract(Extract.TableType.APPEND_ONLY, "namespace", "table");
  WorkUnit workUnit = new WorkUnit(appendOnlyExtract);

  LongWatermark low = new LongWatermark(20160101235959L);
  LongWatermark expectedHigh = new LongWatermark(20160102235959L);
  workUnit.setWatermarkInterval(new WatermarkInterval(low, expectedHigh));

  State jobState = new State();
  return new WorkUnitState(workUnit, jobState);
}
Aggregations