use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class KafkaBiLevelWorkUnitPacker method pack.
@Override
public List<WorkUnit> pack(Map<String, List<WorkUnit>> workUnitsByTopic, int numContainers) {
  double totalEstDataSize = setWorkUnitEstSizes(workUnitsByTopic);
  double avgGroupSize = totalEstDataSize / numContainers / getPreGroupingSizeFactor(this.state);
  List<MultiWorkUnit> mwuGroups = Lists.newArrayList();
  for (List<WorkUnit> workUnitsForTopic : workUnitsByTopic.values()) {
    double estimatedDataSizeForTopic = calcTotalEstSizeForTopic(workUnitsForTopic);
    if (estimatedDataSizeForTopic < avgGroupSize) {
      // If the total estimated size of a topic is smaller than group size, put all partitions of this
      // topic in a single group.
      MultiWorkUnit mwuGroup = MultiWorkUnit.createEmpty();
      addWorkUnitsToMultiWorkUnit(workUnitsForTopic, mwuGroup);
      mwuGroups.add(mwuGroup);
    } else {
      // Use best-fit-decreasing to group workunits for a topic into multiple groups.
      mwuGroups.addAll(bestFitDecreasingBinPacking(workUnitsForTopic, avgGroupSize));
    }
  }
  List<WorkUnit> groups = squeezeMultiWorkUnits(mwuGroups);
  return worstFitDecreasingBinPacking(groups, numContainers);
}
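The pre-grouping step above calls bestFitDecreasingBinPacking, whose body is not shown. As a mental model, a best-fit-decreasing packer sorts workunits by estimated size, largest first, and places each one into the fullest group that still has room, opening a new group when nothing fits. Below is a minimal sketch of that idea, reusing the helper methods visible in these snippets (getWorkUnitEstSize, addWorkUnitToMultiWorkUnit) and Guava's Lists; it is an illustration, not the project's implementation.

// Illustrative sketch of best-fit-decreasing; not the project's exact code.
private List<MultiWorkUnit> bestFitDecreasingSketch(List<WorkUnit> workUnitsForTopic, double maxGroupSize) {
  // Sort workunits by estimated size, largest first.
  List<WorkUnit> sorted = Lists.newArrayList(workUnitsForTopic);
  sorted.sort((a, b) -> Double.compare(getWorkUnitEstSize(b), getWorkUnitEstSize(a)));
  List<MultiWorkUnit> groups = Lists.newArrayList();
  List<Double> remaining = Lists.newArrayList();
  for (WorkUnit workUnit : sorted) {
    double size = getWorkUnitEstSize(workUnit);
    // Best fit: the fullest group that can still hold this workunit.
    int best = -1;
    for (int i = 0; i < groups.size(); i++) {
      if (remaining.get(i) >= size && (best < 0 || remaining.get(i) < remaining.get(best))) {
        best = i;
      }
    }
    if (best < 0) {
      // Nothing fits; open a new group.
      MultiWorkUnit group = MultiWorkUnit.createEmpty();
      addWorkUnitToMultiWorkUnit(workUnit, group);
      groups.add(group);
      remaining.add(maxGroupSize - size);
    } else {
      addWorkUnitToMultiWorkUnit(workUnit, groups.get(best));
      remaining.set(best, remaining.get(best) - size);
    }
  }
  return groups;
}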
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class KafkaSingleLevelWorkUnitPacker method pack.
@Override
public List<WorkUnit> pack(Map<String, List<WorkUnit>> workUnitsByTopic, int numContainers) {
  setWorkUnitEstSizes(workUnitsByTopic);
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (List<WorkUnit> workUnitsForTopic : workUnitsByTopic.values()) {
    // For each topic, merge all empty workunits into a single workunit, so that a single
    // empty task will be created instead of many.
    MultiWorkUnit zeroSizeWorkUnit = MultiWorkUnit.createEmpty();
    for (WorkUnit workUnit : workUnitsForTopic) {
      if (DoubleMath.fuzzyEquals(getWorkUnitEstSize(workUnit), 0.0, EPS)) {
        addWorkUnitToMultiWorkUnit(workUnit, zeroSizeWorkUnit);
      } else {
        workUnit.setWatermarkInterval(getWatermarkIntervalFromWorkUnit(workUnit));
        workUnits.add(workUnit);
      }
    }
    if (!zeroSizeWorkUnit.getWorkUnits().isEmpty()) {
      workUnits.add(squeezeMultiWorkUnit(zeroSizeWorkUnit));
    }
  }
  return worstFitDecreasingBinPacking(workUnits, numContainers);
}
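Both packers finish with worstFitDecreasingBinPacking, which balances the final workunits across a fixed number of containers: sort by estimated size descending, then always place the next workunit into the least-loaded container. A minimal sketch under the same assumptions as the previous sketch (illustrative only; the project's method differs in details):

// Illustrative sketch of worst-fit-decreasing over a fixed number of bins.
private List<WorkUnit> worstFitDecreasingSketch(List<WorkUnit> workUnits, int numContainers) {
  // Sort workunits by estimated size, largest first.
  List<WorkUnit> sorted = Lists.newArrayList(workUnits);
  sorted.sort((a, b) -> Double.compare(getWorkUnitEstSize(b), getWorkUnitEstSize(a)));
  MultiWorkUnit[] bins = new MultiWorkUnit[numContainers];
  double[] loads = new double[numContainers];
  for (int i = 0; i < numContainers; i++) {
    bins[i] = MultiWorkUnit.createEmpty();
  }
  for (WorkUnit workUnit : sorted) {
    // Worst fit: always place the next workunit into the least-loaded bin.
    int emptiest = 0;
    for (int i = 1; i < numContainers; i++) {
      if (loads[i] < loads[emptiest]) {
        emptiest = i;
      }
    }
    addWorkUnitToMultiWorkUnit(workUnit, bins[emptiest]);
    loads[emptiest] += getWorkUnitEstSize(workUnit);
  }
  return Lists.<WorkUnit>newArrayList(bins);
}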
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class KafkaSource method createEmptyWorkUnitsForSkippedPartitions.
private void createEmptyWorkUnitsForSkippedPartitions(Map<String, List<WorkUnit>> workUnits,
    Map<String, State> topicSpecificStateMap, SourceState state) {
  // In case the previous offsets have not been set yet.
  getAllPreviousOffsetState(state);
  // For each partition that has a previous offset, create an empty workunit for it if
  // it is not in this.partitionsToBeProcessed.
  for (Map.Entry<KafkaPartition, Long> entry : this.previousOffsets.entrySet()) {
    KafkaPartition partition = entry.getKey();
    if (!this.partitionsToBeProcessed.contains(partition)) {
      String topicName = partition.getTopicName();
      if (!this.isDatasetStateEnabled.get() || this.topicsToProcess.contains(topicName)) {
        long previousOffset = entry.getValue();
        WorkUnit emptyWorkUnit = createEmptyWorkUnit(partition, previousOffset,
            this.previousOffsetFetchEpochTimes.get(partition),
            Optional.fromNullable(topicSpecificStateMap.get(partition.getTopicName())));
        if (workUnits.containsKey(topicName)) {
          workUnits.get(topicName).add(emptyWorkUnit);
        } else {
          workUnits.put(topicName, Lists.newArrayList(emptyWorkUnit));
        }
      }
    }
  }
}
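The effect of createEmptyWorkUnit (not shown here) is a workunit whose low and high watermarks coincide at the previous offset, so the next run pulls no records but still carries the partition's offset state forward. A hypothetical sketch of just that idea (emptyWorkUnitSketch is an invented name, and the real method also propagates fetch epoch times and topic-specific state):

// Hypothetical sketch: a workunit that pulls nothing because its low and
// high watermarks are equal, while still recording the partition's offset state.
private WorkUnit emptyWorkUnitSketch(KafkaPartition partition, long previousOffset) {
  WorkUnit workUnit = WorkUnit.createEmpty();
  workUnit.setProp(TOPIC_NAME, partition.getTopicName());
  workUnit.setProp(PARTITION_ID, partition.getId());
  workUnit.setProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY, previousOffset);
  workUnit.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, previousOffset);
  return workUnit;
}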
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class KafkaSource method getWorkUnitForTopicPartition.
private WorkUnit getWorkUnitForTopicPartition(KafkaPartition partition, Offsets offsets,
    Optional<State> topicSpecificState) {
  // Default to job level configurations
  Extract.TableType currentTableType = tableType;
  String currentExtractNamespace = extractNamespace;
  String currentExtractTableName = partition.getTopicName();
  boolean isCurrentFullExtract = isFullExtract;
  // Update to topic specific configurations if any
  if (topicSpecificState.isPresent()) {
    State topicState = topicSpecificState.get();
    if (topicState.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)) {
      currentTableType = Extract.TableType.valueOf(topicState.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY));
    }
    currentExtractNamespace = topicState.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, extractNamespace);
    currentExtractTableName = topicState.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, partition.getTopicName());
    isCurrentFullExtract = topicState.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY, isFullExtract);
  }
  Extract extract = this.createExtract(currentTableType, currentExtractNamespace, currentExtractTableName);
  if (isCurrentFullExtract) {
    extract.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, true);
  }
  WorkUnit workUnit = WorkUnit.create(extract);
  workUnit.setProp(TOPIC_NAME, partition.getTopicName());
  addDatasetUrnOptionally(workUnit);
  workUnit.setProp(PARTITION_ID, partition.getId());
  workUnit.setProp(LEADER_ID, partition.getLeader().getId());
  workUnit.setProp(LEADER_HOSTANDPORT, partition.getLeader().getHostAndPort().toString());
  workUnit.setProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY, offsets.getStartOffset());
  workUnit.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, offsets.getLatestOffset());
  workUnit.setProp(PREVIOUS_OFFSET_FETCH_EPOCH_TIME, offsets.getPreviousOffsetFetchEpochTime());
  workUnit.setProp(OFFSET_FETCH_EPOCH_TIME, offsets.getOffsetFetchEpochTime());
  workUnit.setProp(PREVIOUS_LATEST_OFFSET, offsets.getPreviousLatestOffset());
  // Add lineage info
  DatasetDescriptor source = new DatasetDescriptor(DatasetConstants.PLATFORM_KAFKA, partition.getTopicName());
  source.addMetadata(DatasetConstants.BROKERS, kafkaBrokers);
  if (this.lineageInfo.isPresent()) {
    this.lineageInfo.get().setSource(source, workUnit);
  }
  LOG.info(String.format("Created workunit for partition %s: lowWatermark=%d, highWatermark=%d, range=%d",
      partition, offsets.getStartOffset(), offsets.getLatestOffset(),
      offsets.getLatestOffset() - offsets.getStartOffset()));
  return workUnit;
}
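Downstream, these properties can be read back from the workunit with the usual State accessors; for example (a hypothetical consumer, using the key names exactly as set above):

// Hypothetical consumer of the properties set above.
long lowWatermark = workUnit.getPropAsLong(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY);
long highWatermark = workUnit.getPropAsLong(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY);
String topicName = workUnit.getProp(TOPIC_NAME);
int partitionId = workUnit.getPropAsInt(PARTITION_ID);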
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class KafkaSource method getWorkUnitsForTopic.
/*
 * This method needs to be thread-safe, since it is called from a Runnable.
 */
private List<WorkUnit> getWorkUnitsForTopic(KafkaTopic topic, SourceState state, Optional<State> topicSpecificState) {
  Timer.Context context = this.metricContext.timer("isTopicQualifiedTimer").time();
  boolean topicQualified = isTopicQualified(topic);
  context.close();
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (KafkaPartition partition : topic.getPartitions()) {
    WorkUnit workUnit = getWorkUnitForTopicPartition(partition, state, topicSpecificState);
    this.partitionsToBeProcessed.add(partition);
    if (workUnit != null) {
      // For disqualified topics, set each workunit's high watermark to be the same
      // as the low watermark, so that it will be skipped.
      if (!topicQualified) {
        skipWorkUnit(workUnit);
      }
      workUnits.add(workUnit);
    }
  }
  return workUnits;
}
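skipWorkUnit itself is not shown, but per the comment above it only needs to collapse the pull range. A sketch, assuming it does exactly and only that:

// Sketch, assuming skipWorkUnit just collapses the offset range so nothing is pulled.
private static void skipWorkUnitSketch(WorkUnit workUnit) {
  workUnit.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY,
      workUnit.getProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY));
}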