Use of org.apache.gobblin.source.workunit.MultiWorkUnit in the Apache project incubator-gobblin.
From the class KafkaWorkUnitPacker, method getWatermarkIntervalFromWorkUnit.
/**
 * Builds a {@link WatermarkInterval} for the given {@link WorkUnit}.
 *
 * <p>A {@link MultiWorkUnit} is delegated to
 * {@link #getWatermarkIntervalFromMultiWorkUnit(MultiWorkUnit)}; a plain work unit has its
 * (deprecated) low/high long watermarks wrapped in single-element {@link MultiLongWatermark}s.
 */
@SuppressWarnings("deprecation")
protected static WatermarkInterval getWatermarkIntervalFromWorkUnit(WorkUnit workUnit) {
  if (workUnit instanceof MultiWorkUnit) {
    return getWatermarkIntervalFromMultiWorkUnit((MultiWorkUnit) workUnit);
  }
  // Single work unit: each multi-watermark holds exactly one value.
  MultiLongWatermark low = new MultiLongWatermark(Lists.newArrayList(workUnit.getLowWaterMark()));
  MultiLongWatermark high = new MultiLongWatermark(Lists.newArrayList(workUnit.getHighWaterMark()));
  return new WatermarkInterval(low, high);
}
Use of org.apache.gobblin.source.workunit.MultiWorkUnit in the Apache project incubator-gobblin.
From the class KafkaWorkUnitPacker, method getWatermarkIntervalFromMultiWorkUnit.
/**
 * Builds a {@link WatermarkInterval} spanning all {@link WorkUnit}s nested in the given
 * {@link MultiWorkUnit}: the i-th entry of each {@link MultiLongWatermark} is taken from the
 * i-th nested work unit, so positions stay aligned between the low and high watermarks.
 */
@SuppressWarnings("deprecation")
protected static WatermarkInterval getWatermarkIntervalFromMultiWorkUnit(MultiWorkUnit multiWorkUnit) {
  List<Long> lows = Lists.newArrayList();
  List<Long> highs = Lists.newArrayList();
  for (WorkUnit nested : multiWorkUnit.getWorkUnits()) {
    lows.add(nested.getLowWaterMark());
    highs.add(nested.getHighWaterMark());
  }
  return new WatermarkInterval(new MultiLongWatermark(lows), new MultiLongWatermark(highs));
}
Use of org.apache.gobblin.source.workunit.MultiWorkUnit in the Apache project incubator-gobblin.
From the class KafkaWorkUnitPacker, method squeezeMultiWorkUnit.
/**
 * Combine all {@link WorkUnit}s in the {@link MultiWorkUnit} into a single {@link WorkUnit}.
 *
 * <p>The first nested work unit is reused as the carrier: its watermark props are replaced with
 * the combined {@link WatermarkInterval}, per-partition offset/epoch props are re-keyed by
 * partition index, and the single-partition identifiers are swapped for the combined partition
 * list.
 */
protected WorkUnit squeezeMultiWorkUnit(MultiWorkUnit multiWorkUnit) {
  WatermarkInterval interval = getWatermarkIntervalFromMultiWorkUnit(multiWorkUnit);
  List<KafkaPartition> partitions = getPartitionsFromMultiWorkUnit(multiWorkUnit);
  Preconditions.checkArgument(!partitions.isEmpty(), "There must be at least one partition in the multiWorkUnit");
  List<WorkUnit> workUnits = multiWorkUnit.getWorkUnits();
  // Any nested work unit could serve as the carrier; pick the first.
  WorkUnit squeezed = workUnits.get(0);
  // Replace the single-partition watermark props with the combined interval.
  squeezed.removeProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY);
  squeezed.removeProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY);
  squeezed.setWatermarkInterval(interval);
  // Re-key offset fetch epoch time and previous latest offset by partition index. These feed the
  // load factor, i.e. gobblin consumption rate relative to the kafka production rate, where the
  // kafka rate is (current latest offset - previous latest offset)/(current epoch time - previous epoch time).
  for (int i = 0; i < workUnits.size(); i++) {
    WorkUnit nested = workUnits.get(i);
    squeezed.setProp(KafkaUtils.getPartitionPropName(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME, i), nested.getProp(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME));
    squeezed.setProp(KafkaUtils.getPartitionPropName(KafkaSource.OFFSET_FETCH_EPOCH_TIME, i), nested.getProp(KafkaSource.OFFSET_FETCH_EPOCH_TIME));
    squeezed.setProp(KafkaUtils.getPartitionPropName(KafkaSource.PREVIOUS_LATEST_OFFSET, i), nested.getProp(KafkaSource.PREVIOUS_LATEST_OFFSET));
  }
  // The un-indexed variants are now superseded by the per-partition props above.
  squeezed.removeProp(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME);
  squeezed.removeProp(KafkaSource.OFFSET_FETCH_EPOCH_TIME);
  squeezed.removeProp(KafkaSource.PREVIOUS_LATEST_OFFSET);
  // Drop the original single-partition identifiers...
  squeezed.removeProp(KafkaSource.PARTITION_ID);
  squeezed.removeProp(KafkaSource.LEADER_ID);
  squeezed.removeProp(KafkaSource.LEADER_HOSTANDPORT);
  // ...and install the combined partition information.
  populateMultiPartitionWorkUnit(partitions, squeezed);
  LOG.info(String.format("Created MultiWorkUnit for partitions %s", partitions));
  return squeezed;
}
Use of org.apache.gobblin.source.workunit.MultiWorkUnit in the Apache project incubator-gobblin.
From the class KafkaWorkUnitPacker, method worstFitDecreasingBinPacking.
/**
 * Pack a list of {@link WorkUnit}s into a smaller number of {@link MultiWorkUnit}s,
 * using the worst-fit-decreasing algorithm.
 *
 * <p>Groups are considered in descending load order, and each {@link WorkUnit} is assigned to
 * the {@link MultiWorkUnit} with the smallest current load. Min/max bin loads are logged and
 * recorded in the job state.
 */
protected List<WorkUnit> worstFitDecreasingBinPacking(List<WorkUnit> groups, int numOfMultiWorkUnits) {
  // Worst-fit-decreasing: place the heaviest groups first.
  Collections.sort(groups, LOAD_DESC_COMPARATOR);
  // Min-max queue keyed by estimated load; poll() always yields the lightest bin.
  MinMaxPriorityQueue<MultiWorkUnit> bins = MinMaxPriorityQueue.orderedBy(LOAD_ASC_COMPARATOR).expectedSize(numOfMultiWorkUnits).create();
  for (int i = 0; i < numOfMultiWorkUnits; i++) {
    MultiWorkUnit emptyBin = MultiWorkUnit.createEmpty();
    setWorkUnitEstSize(emptyBin, 0);
    bins.add(emptyBin);
  }
  // Assign each group to the currently lightest bin, then return the bin to the queue.
  for (WorkUnit group : groups) {
    MultiWorkUnit lightest = bins.poll();
    addWorkUnitToMultiWorkUnit(group, lightest);
    bins.add(lightest);
  }
  logMultiWorkUnitInfo(bins);
  double minLoad = getWorkUnitEstLoad(bins.peekFirst());
  double maxLoad = getWorkUnitEstLoad(bins.peekLast());
  LOG.info(String.format("Min load of multiWorkUnit = %f; Max load of multiWorkUnit = %f; Diff = %f%%", minLoad, maxLoad, (maxLoad - minLoad) / maxLoad * 100.0));
  this.state.setProp(MIN_MULTIWORKUNIT_LOAD, minLoad);
  this.state.setProp(MAX_MULTIWORKUNIT_LOAD, maxLoad);
  return Lists.<WorkUnit>newArrayList(bins);
}
Use of org.apache.gobblin.source.workunit.MultiWorkUnit in the Apache project incubator-gobblin.
From the class GobblinOutputCommitterTest, method setupWorkUnitFiles.
/**
 * Serializes a set of {@link WorkUnit}s and one {@link MultiWorkUnit} to per-task files under
 * the job input directory, so later tests can deserialize them.
 */
@BeforeClass
public void setupWorkUnitFiles() throws IOException {
  this.conf = new Configuration();
  this.fs = FileSystem.getLocal(this.conf);
  this.stagingDirs = Lists.newArrayList();
  // WorkUnits to serialize
  WorkUnit wu1 = createAndSetWorkUnit("wu1");
  WorkUnit wu2 = createAndSetWorkUnit("wu2");
  WorkUnit wu3 = createAndSetWorkUnit("wu3");
  WorkUnit wu4 = createAndSetWorkUnit("wu4");
  // MultiWorkUnit wrapping wu3 and wu4
  MultiWorkUnit mwu1 = MultiWorkUnit.createEmpty();
  mwu1.setProp(ConfigurationKeys.TASK_ID_KEY, System.nanoTime());
  mwu1.addWorkUnits(Arrays.asList(wu3, wu4));
  Path inputDir = new Path(new Path(OUTPUT_PATH, JOB_NAME), "input");
  // Write each (multi) work unit to its own file under inputDir; the Closer guarantees every
  // registered stream is closed even if one of the writes fails.
  Closer closer = Closer.create();
  try {
    writeWorkUnitFile(wu1, inputDir, "wu", closer);
    writeWorkUnitFile(wu2, inputDir, "wu", closer);
    writeWorkUnitFile(mwu1, inputDir, "mwu", closer);
  } finally {
    closer.close();
  }
}

/**
 * Serializes {@code workUnit} to {@code inputDir/<task-id>/_} with the given filename suffix,
 * registering the output stream with {@code closer} for eventual cleanup.
 */
private void writeWorkUnitFile(WorkUnit workUnit, Path inputDir, String suffix, Closer closer) throws IOException {
  Path path = new Path(inputDir, workUnit.getProp(ConfigurationKeys.TASK_ID_KEY) + Path.SEPARATOR + "_").suffix(suffix);
  workUnit.write(closer.register(this.fs.create(path)));
}
Aggregations