use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class KafkaWorkUnitPacker method setWorkUnitEstSizes.
protected double setWorkUnitEstSizes(Map<String, List<WorkUnit>> workUnitsByTopic) {
  double totalEstDataSize = 0;
  for (List<WorkUnit> workUnitsForTopic : workUnitsByTopic.values()) {
    for (WorkUnit workUnit : workUnitsForTopic) {
      setWorkUnitEstSize(workUnit);
      totalEstDataSize += getWorkUnitEstSize(workUnit);
    }
  }
  return totalEstDataSize;
}
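This helper both caches an estimated size on each work unit and returns the grand total. A minimal sketch of how a packer subclass might use that total, assuming a hypothetical estimateAvgLoad helper that is not part of KafkaWorkUnitPacker:
protected double estimateAvgLoad(Map<String, List<WorkUnit>> workUnitsByTopic, int numContainers) {
  // Spread the total estimate over the requested number of containers to get
  // a target load per MultiWorkUnit; this helper and numContainers are illustrative
  double totalEstDataSize = setWorkUnitEstSizes(workUnitsByTopic);
  return totalEstDataSize / numContainers;
}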
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class KafkaWorkUnitPacker method squeezeMultiWorkUnit.
/**
 * Combines all {@link WorkUnit}s in the {@link MultiWorkUnit} into a single {@link WorkUnit}.
 */
protected WorkUnit squeezeMultiWorkUnit(MultiWorkUnit multiWorkUnit) {
  WatermarkInterval interval = getWatermarkIntervalFromMultiWorkUnit(multiWorkUnit);
  List<KafkaPartition> partitions = getPartitionsFromMultiWorkUnit(multiWorkUnit);
  Preconditions.checkArgument(!partitions.isEmpty(), "There must be at least one partition in the multiWorkUnit");
  // Squeeze all partitions from the multiWorkUnit into one of the work units (any one will do)
  WorkUnit workUnit = multiWorkUnit.getWorkUnits().get(0);
  // Update the watermark interval
  workUnit.removeProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY);
  workUnit.removeProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY);
  workUnit.setWatermarkInterval(interval);
  // Update the offset fetch epoch times and previous latest offsets. These are used to compute the
  // load factor: the Gobblin consumption rate relative to the Kafka production rate. The Kafka rate
  // is computed as (current latest offset - previous latest offset) / (current epoch time - previous epoch time).
  int index = 0;
  for (WorkUnit wu : multiWorkUnit.getWorkUnits()) {
    workUnit.setProp(KafkaUtils.getPartitionPropName(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME, index),
        wu.getProp(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME));
    workUnit.setProp(KafkaUtils.getPartitionPropName(KafkaSource.OFFSET_FETCH_EPOCH_TIME, index),
        wu.getProp(KafkaSource.OFFSET_FETCH_EPOCH_TIME));
    workUnit.setProp(KafkaUtils.getPartitionPropName(KafkaSource.PREVIOUS_LATEST_OFFSET, index),
        wu.getProp(KafkaSource.PREVIOUS_LATEST_OFFSET));
    index++;
  }
  workUnit.removeProp(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME);
  workUnit.removeProp(KafkaSource.OFFSET_FETCH_EPOCH_TIME);
  workUnit.removeProp(KafkaSource.PREVIOUS_LATEST_OFFSET);
  // Remove the original single-partition information
  workUnit.removeProp(KafkaSource.PARTITION_ID);
  workUnit.removeProp(KafkaSource.LEADER_ID);
  workUnit.removeProp(KafkaSource.LEADER_HOSTANDPORT);
  // Add the combined partition information
  populateMultiPartitionWorkUnit(partitions, workUnit);
  LOG.info(String.format("Created MultiWorkUnit for partitions %s", partitions));
  return workUnit;
}
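A hedged usage sketch: after bin packing produces MultiWorkUnits, each one can be squeezed into a single WorkUnit so downstream code deals with a flat list. The variable names here are illustrative, not from the Gobblin source:
// Assuming `mwuGroups` is the List<MultiWorkUnit> produced by a packing step
List<WorkUnit> squeezed = Lists.newArrayList();
for (MultiWorkUnit mwu : mwuGroups) {
  squeezed.add(squeezeMultiWorkUnit(mwu));
}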
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class KafkaWorkUnitPacker method worstFitDecreasingBinPacking.
/**
 * Packs a list of {@link WorkUnit}s into a smaller number of {@link MultiWorkUnit}s,
 * using the worst-fit-decreasing algorithm: work units are considered in decreasing
 * order of estimated load, and each is assigned to the {@link MultiWorkUnit} with the
 * smallest current load.
 */
protected List<WorkUnit> worstFitDecreasingBinPacking(List<WorkUnit> groups, int numOfMultiWorkUnits) {
  // Sort work unit groups by estimated data size, descending
  Collections.sort(groups, LOAD_DESC_COMPARATOR);
  MinMaxPriorityQueue<MultiWorkUnit> pQueue =
      MinMaxPriorityQueue.orderedBy(LOAD_ASC_COMPARATOR).expectedSize(numOfMultiWorkUnits).create();
  for (int i = 0; i < numOfMultiWorkUnits; i++) {
    MultiWorkUnit multiWorkUnit = MultiWorkUnit.createEmpty();
    setWorkUnitEstSize(multiWorkUnit, 0);
    pQueue.add(multiWorkUnit);
  }
  for (WorkUnit group : groups) {
    MultiWorkUnit lightestMultiWorkUnit = pQueue.poll();
    addWorkUnitToMultiWorkUnit(group, lightestMultiWorkUnit);
    pQueue.add(lightestMultiWorkUnit);
  }
  logMultiWorkUnitInfo(pQueue);
  double minLoad = getWorkUnitEstLoad(pQueue.peekFirst());
  double maxLoad = getWorkUnitEstLoad(pQueue.peekLast());
  LOG.info(String.format("Min load of multiWorkUnit = %f; Max load of multiWorkUnit = %f; Diff = %f%%",
      minLoad, maxLoad, (maxLoad - minLoad) / maxLoad * 100.0));
  this.state.setProp(MIN_MULTIWORKUNIT_LOAD, minLoad);
  this.state.setProp(MAX_MULTIWORKUNIT_LOAD, maxLoad);
  List<WorkUnit> multiWorkUnits = Lists.newArrayList();
  multiWorkUnits.addAll(pQueue);
  return multiWorkUnits;
}
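To make the algorithm concrete outside of Gobblin's classes, here is a self-contained sketch of worst-fit-decreasing bin packing on plain numbers; the item sizes and bin count are made up:
import java.util.Arrays;
import java.util.Comparator;
import java.util.PriorityQueue;

public class WorstFitDecreasingDemo {
  public static void main(String[] args) {
    double[] items = { 9, 7, 6, 5, 3, 2 }; // estimated loads, made up
    int numBins = 3;
    // Min-heap ordered by current bin load, mirroring LOAD_ASC_COMPARATOR
    PriorityQueue<double[]> bins = new PriorityQueue<>(Comparator.comparingDouble((double[] b) -> b[0]));
    for (int i = 0; i < numBins; i++) {
      bins.add(new double[] { 0 });
    }
    // Sort ascending, then iterate in reverse to process items by decreasing size
    Arrays.sort(items);
    for (int i = items.length - 1; i >= 0; i--) {
      double[] lightest = bins.poll(); // the bin with the smallest load so far
      lightest[0] += items[i];
      bins.add(lightest);
    }
    // Final loads are 11, 10, 11 (9+2, 7+3, 6+5), in some heap order
    bins.forEach(bin -> System.out.println("bin load = " + bin[0]));
  }
}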
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class KafkaSource method getWorkunits.
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  this.metricContext = Instrumented.getMetricContext(state, KafkaSource.class);
  this.lineageInfo = LineageInfo.getLineageInfo(state.getBroker());
  Map<String, List<WorkUnit>> workUnits = Maps.newConcurrentMap();
  if (state.getPropAsBoolean(KafkaSource.GOBBLIN_KAFKA_EXTRACT_ALLOW_TABLE_TYPE_NAMESPACE_CUSTOMIZATION)) {
    String tableTypeStr = state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, KafkaSource.DEFAULT_TABLE_TYPE.toString());
    tableType = Extract.TableType.valueOf(tableTypeStr);
    extractNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, KafkaSource.DEFAULT_NAMESPACE_NAME);
  } else {
    // For backward compatibility, ignore the table type and namespace configuration keys, as the previous implementation did
    tableType = KafkaSource.DEFAULT_TABLE_TYPE;
    extractNamespace = KafkaSource.DEFAULT_NAMESPACE_NAME;
  }
  isFullExtract = state.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY);
  kafkaBrokers = state.getProp(ConfigurationKeys.KAFKA_BROKERS, "");
  this.shouldEnableDatasetStateStore = state.getPropAsBoolean(GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE,
      DEFAULT_GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE);
  try {
    Config config = ConfigUtils.propertiesToConfig(state.getProperties());
    GobblinKafkaConsumerClientFactory kafkaConsumerClientFactory = kafkaConsumerClientResolver
        .resolveClass(state.getProp(GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS, DEFAULT_GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS))
        .newInstance();
    this.kafkaConsumerClient.set(kafkaConsumerClientFactory.create(config));
    List<KafkaTopic> topics = getFilteredTopics(state);
    this.topicsToProcess = topics.stream().map(KafkaTopic::getName).collect(toSet());
    for (String topic : this.topicsToProcess) {
      LOG.info("Discovered topic " + topic);
    }
    Map<String, State> topicSpecificStateMap =
        DatasetUtils.getDatasetSpecificProps(Iterables.transform(topics, new Function<KafkaTopic, String>() {

          @Override
          public String apply(KafkaTopic topic) {
            return topic.getName();
          }
        }), state);
    int numOfThreads = state.getPropAsInt(ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_THREADS,
        ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_DEFAULT_THREAD_COUNT);
    ExecutorService threadPool = Executors.newFixedThreadPool(numOfThreads, ExecutorsUtils.newThreadFactory(Optional.of(LOG)));
    if (state.getPropAsBoolean(ConfigurationKeys.KAFKA_SOURCE_SHARE_CONSUMER_CLIENT,
        ConfigurationKeys.DEFAULT_KAFKA_SOURCE_SHARE_CONSUMER_CLIENT)) {
      this.sharedKafkaConsumerClient = this.kafkaConsumerClient.get();
    } else {
      // Preallocate one client per thread
      for (int i = 0; i < numOfThreads; i++) {
        kafkaConsumerClientPool.offer(kafkaConsumerClientFactory.create(config));
      }
    }
    Stopwatch createWorkUnitStopwatch = Stopwatch.createStarted();
    for (KafkaTopic topic : topics) {
      threadPool.submit(new WorkUnitCreator(topic, state, Optional.fromNullable(topicSpecificStateMap.get(topic.getName())), workUnits));
    }
    ExecutorsUtils.shutdownExecutorService(threadPool, Optional.of(LOG), 1L, TimeUnit.HOURS);
    LOG.info(String.format("Created workunits for %d topics in %d seconds", workUnits.size(),
        createWorkUnitStopwatch.elapsed(TimeUnit.SECONDS)));
    // Create empty WorkUnits for skipped partitions (i.e., partitions that have previous offsets
    // but aren't processed).
    createEmptyWorkUnitsForSkippedPartitions(workUnits, topicSpecificStateMap, state);
    int numOfMultiWorkunits = state.getPropAsInt(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY, ConfigurationKeys.DEFAULT_MR_JOB_MAX_MAPPERS);
    List<WorkUnit> workUnitList = KafkaWorkUnitPacker.getInstance(this, state).pack(workUnits, numOfMultiWorkunits);
    addTopicSpecificPropsToWorkUnits(workUnitList, topicSpecificStateMap);
    setLimiterReportKeyListToWorkUnits(workUnitList, getLimiterExtractorReportKeys());
    return workUnitList;
  } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
    throw new RuntimeException(e);
  } finally {
    try {
      if (this.kafkaConsumerClient.get() != null) {
        this.kafkaConsumerClient.get().close();
      }
      // Clean up clients from the pool
      for (GobblinKafkaConsumerClient client : kafkaConsumerClientPool) {
        client.close();
      }
    } catch (IOException e) {
      // Preserve the cause so close failures remain diagnosable
      throw new RuntimeException("Exception closing kafkaConsumerClient", e);
    }
  }
}
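The per-topic work unit creation above fans out to a thread pool and collects results in a concurrent map. A hypothetical stand-alone sketch of that fan-out/collect pattern (WorkUnitCreator itself is a KafkaSource inner class; the topic names and the "work" below are made up):
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

public class TopicFanOutDemo {
  public static void main(String[] args) throws InterruptedException {
    Map<String, List<String>> workUnitsByTopic = new ConcurrentHashMap<>();
    ExecutorService pool = Executors.newFixedThreadPool(4);
    for (String topic : Arrays.asList("events", "metrics", "logs")) {
      // Each task computes the work units for one topic and records them
      // under the topic name, as WorkUnitCreator does with the shared map
      pool.submit(() -> workUnitsByTopic.put(topic, Collections.singletonList("workunit-for-" + topic)));
    }
    // Drain the pool before packing, mirroring shutdownExecutorService above
    pool.shutdown();
    pool.awaitTermination(1, TimeUnit.MINUTES);
    System.out.println(workUnitsByTopic);
  }
}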
use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
the class TestAvroSource method getWorkunits.
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  WorkUnit workUnit = WorkUnit.createEmpty();
  workUnit.addAll(state);
  return Collections.singletonList(workUnit);
}
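Because the single work unit copies every property from the source state, a test can assert the round trip directly. A hypothetical check with a made-up property, assuming TestNG's Assert:
SourceState state = new SourceState();
state.setProp("writer.output.format", "AVRO"); // made-up property for illustration
WorkUnit workUnit = new TestAvroSource().getWorkunits(state).get(0);
// The work unit inherits every property from the source state
Assert.assertEquals(workUnit.getProp("writer.output.format"), "AVRO");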