Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.
The class WorstFitDecreasingBinPacking, method pack.
@Override
@OverridingMethodsMustInvokeSuper
public List<WorkUnit> pack(List<WorkUnit> workUnitsIn, WorkUnitWeighter weighter) {
  if (this.maxWeightPerUnit <= 0) {
    // just return the input
    return workUnitsIn;
  }

  List<WorkUnit> workUnits = Lists.newArrayList(workUnitsIn);

  // total size of work units smaller than maxWeightPerUnit
  long smallUnitSize = 0;
  // number of work units larger than maxWeightPerUnit
  int largeUnits = 0;
  for (WorkUnit workUnit : workUnits) {
    long weight = weighter.weight(workUnit);
    if (weight <= this.maxWeightPerUnit) {
      smallUnitSize += weight;
    } else {
      largeUnits++;
    }
  }

  // estimated bin count: one bin per oversized unit, plus enough bins for the aggregate small-unit weight
  int estimateByWeight = largeUnits + (int) ((smallUnitSize - 1) / this.maxWeightPerUnit) + 1;
  int estimatedMultiWorkUnits = Math.min(estimateByWeight, workUnits.size());

  MinMaxPriorityQueue<MultiWorkUnit> pQueue =
      MinMaxPriorityQueue.orderedBy(new MultiWorkUnitComparator()).create();
  for (int i = 0; i < estimatedMultiWorkUnits; i++) {
    pQueue.add(MultiWorkUnit.createEmpty());
  }

  Collections.sort(workUnits, Collections.reverseOrder(new WeightComparator(weighter)));

  for (WorkUnit workUnit : workUnits) {
    MultiWorkUnit lightestMultiWorkUnit = pQueue.peek();
    long weight = Math.max(1, weighter.weight(workUnit));
    long multiWorkUnitWeight = getMultiWorkUnitWeight(lightestMultiWorkUnit);

    if (multiWorkUnitWeight == 0 || (weight + multiWorkUnitWeight <= this.maxWeightPerUnit
        && weight + multiWorkUnitWeight > multiWorkUnitWeight)) { // check for overflow
      // if it fits, add it to the lightest multi work unit
      addToMultiWorkUnit(lightestMultiWorkUnit, workUnit, weight);
      pQueue.poll();
      pQueue.add(lightestMultiWorkUnit);
    } else {
      // if it doesn't fit in the lightest multi work unit, create a new multi work unit for it
      MultiWorkUnit newMultiWorkUnit = MultiWorkUnit.createEmpty();
      addToMultiWorkUnit(newMultiWorkUnit, workUnit, weight);
      pQueue.add(newMultiWorkUnit);
    }
  }

  return Lists.<WorkUnit>newArrayList(Iterables.filter(pQueue, new Predicate<MultiWorkUnit>() {
    @Override
    @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NP_PARAMETER_MUST_BE_NONNULL_BUT_MARKED_AS_NULLABLE",
        justification = "Allowing nullable values")
    public boolean apply(@Nullable MultiWorkUnit input) {
      return getMultiWorkUnitWeight(input) > 0;
    }
  }));
}
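For orientation, here is a minimal usage sketch of this packer. The single-argument constructor and the "estimated.record.count" property are assumptions made for illustration; the real weight cap and weighter come from the caller's configuration.

import java.util.List;

import org.apache.gobblin.source.workunit.WorkUnit;

public class PackerUsageSketch {
  public static List<WorkUnit> packByRecordCount(List<WorkUnit> input) {
    // Hypothetical weighter: weigh each work unit by an estimated record count,
    // defaulting to 1 so units without an estimate still occupy a slot.
    WorkUnitWeighter byRecords = new WorkUnitWeighter() {
      @Override
      public long weight(WorkUnit workUnit) {
        return workUnit.getPropAsLong("estimated.record.count", 1L); // hypothetical property name
      }
    };
    // Assumed constructor argument: cap each MultiWorkUnit at 1000 weight units.
    WorstFitDecreasingBinPacking packer = new WorstFitDecreasingBinPacking(1000L);
    return packer.pack(input, byRecords);
  }
}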
Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.
The class DatePartitionedAvroFileExtractorTest, method testJobStateNotCopiedToWorkUnit.
@Test
public void testJobStateNotCopiedToWorkUnit() {
  DatePartitionedAvroFileSource source = new DatePartitionedAvroFileSource();

  SourceState state = new SourceState();
  state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
  state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
  state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY);
  state.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
  state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
  state.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
  state.setProp("date.partitioned.source.min.watermark.value",
      DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1)));
  state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
  state.setProp("date.partitioned.source.partition.prefix", PREFIX);
  state.setProp("date.partitioned.source.partition.suffix", SUFFIX);

  String dummyKey = "dummy.job.config";
  state.setProp(dummyKey, "dummy");

  List<WorkUnit> workunits = source.getWorkunits(state);
  Assert.assertEquals(workunits.size(), 4);

  for (WorkUnit wu : workunits) {
    if (wu instanceof MultiWorkUnit) {
      for (WorkUnit workUnit : ((MultiWorkUnit) wu).getWorkUnits()) {
        Assert.assertFalse(workUnit.contains(dummyKey));
      }
    } else {
      Assert.assertFalse(wu.contains(dummyKey));
    }
  }
}
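The instanceof branching at the end of this test is the standard pattern for consuming a packed work-unit list, since getWorkunits may return a mix of plain WorkUnits and MultiWorkUnits. An illustrative helper (not part of the source) that flattens such a list into uniform WorkUnits might look like this:

import java.util.ArrayList;
import java.util.List;

import org.apache.gobblin.source.workunit.MultiWorkUnit;
import org.apache.gobblin.source.workunit.WorkUnit;

public final class WorkUnitFlattener {
  // Expand each MultiWorkUnit into its constituent WorkUnits; plain WorkUnits pass through.
  public static List<WorkUnit> flatten(List<WorkUnit> workUnits) {
    List<WorkUnit> flat = new ArrayList<>();
    for (WorkUnit wu : workUnits) {
      if (wu instanceof MultiWorkUnit) {
        flat.addAll(((MultiWorkUnit) wu).getWorkUnits());
      } else {
        flat.add(wu);
      }
    }
    return flat;
  }
}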
Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.
The class DatePartitionedAvroFileExtractorTest, method verifyWorkUnits.
private void verifyWorkUnits(List<WorkUnit> workunits, int expectedSize) throws DataRecordException, IOException {
  for (int i = 0; i < expectedSize; i++) {
    WorkUnit workUnit = ((MultiWorkUnit) workunits.get(i)).getWorkUnits().get(0);
    WorkUnitState wuState = new WorkUnitState(workunits.get(i), new State());
    wuState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    wuState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL,
        workUnit.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
    try (DatePartitionedAvroFileExtractor extractor = new DatePartitionedAvroFileExtractor(wuState)) {
      GenericRecord record = extractor.readRecord(null);
      Assert.assertEquals(recordTimestamps[i], record.get(PARTITION_COLUMN_NAME));
      Assert.assertEquals(recordTimestamps[i], workUnit.getPropAsLong(ConfigurationKeys.WORK_UNIT_DATE_PARTITION_KEY));
    }
  }
}
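Note the layering in `new WorkUnitState(workunits.get(i), new State())`: properties set directly on the WorkUnitState shadow those on the wrapped WorkUnit, which in turn fall back to the job State passed as the second argument. A minimal sketch of that lookup order, assuming the usual fall-through resolution (toy key and values invented for illustration):

import org.apache.gobblin.configuration.State;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.source.workunit.WorkUnit;

public class StateLayeringSketch {
  public static void main(String[] args) {
    State jobState = new State();
    jobState.setProp("key", "fromJobState");

    WorkUnit workUnit = WorkUnit.createEmpty();
    WorkUnitState wuState = new WorkUnitState(workUnit, jobState);

    // The work unit does not define "key", so lookup falls through to the job state.
    System.out.println(wuState.getProp("key")); // prints "fromJobState"

    // Setting the property on the WorkUnitState shadows the job-state value.
    wuState.setProp("key", "fromWorkUnitState");
    System.out.println(wuState.getProp("key")); // prints "fromWorkUnitState"
  }
}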
Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.
The class KafkaBiLevelWorkUnitPacker, method findAndPopBestFitGroup.
/**
 * Find the best group using the best-fit-decreasing algorithm.
 * The best group is the fullest group that has enough capacity for the new {@link WorkUnit}.
 * If no existing group has enough capacity for the new {@link WorkUnit}, return null.
 */
private static MultiWorkUnit findAndPopBestFitGroup(WorkUnit workUnit, PriorityQueue<MultiWorkUnit> pQueue,
    double avgGroupSize) {
  List<MultiWorkUnit> fullWorkUnits = Lists.newArrayList();
  MultiWorkUnit bestFit = null;

  while (!pQueue.isEmpty()) {
    MultiWorkUnit candidate = pQueue.poll();
    if (getWorkUnitEstSize(candidate) + getWorkUnitEstSize(workUnit) <= avgGroupSize) {
      bestFit = candidate;
      break;
    }
    fullWorkUnits.add(candidate);
  }

  for (MultiWorkUnit fullWorkUnit : fullWorkUnits) {
    pQueue.add(fullWorkUnit);
  }

  return bestFit;
}
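Because the queue is ordered by descending load, the first polled group that still fits is the fullest viable one; groups that were too full are set aside and re-added afterward. The same pop-until-fit / re-add pattern, stripped of Gobblin types, can be sketched on plain longs (this standalone version is for illustration only):

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

public class BestFitSketch {
  // Pop bins from fullest to emptiest until one can still take `item`;
  // bins that were too full go back into the queue unchanged.
  static Long findAndPopBestFit(long item, PriorityQueue<Long> bins, double capacity) {
    List<Long> tooFull = new ArrayList<>();
    Long bestFit = null;
    while (!bins.isEmpty()) {
      long bin = bins.poll();
      if (bin + item <= capacity) {
        bestFit = bin;
        break;
      }
      tooFull.add(bin);
    }
    bins.addAll(tooFull);
    return bestFit;
  }

  public static void main(String[] args) {
    PriorityQueue<Long> bins = new PriorityQueue<>(Comparator.reverseOrder());
    bins.add(9L);
    bins.add(7L);
    bins.add(4L);
    // With capacity 10, an item of size 3 fits the fullest viable bin: 7 (9 + 3 would overflow).
    System.out.println(findAndPopBestFit(3L, bins, 10.0)); // prints 7
  }
}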
Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.
The class KafkaBiLevelWorkUnitPacker, method bestFitDecreasingBinPacking.
/**
 * Group {@link WorkUnit}s into groups. Each group is a {@link MultiWorkUnit}. Each group has a capacity of
 * avgGroupSize. If there's a single {@link WorkUnit} whose size is larger than avgGroupSize, it forms a group itself.
 */
private static List<MultiWorkUnit> bestFitDecreasingBinPacking(List<WorkUnit> workUnits, double avgGroupSize) {
  // Sort work units by data size, descending
  Collections.sort(workUnits, LOAD_DESC_COMPARATOR);

  PriorityQueue<MultiWorkUnit> pQueue = new PriorityQueue<>(workUnits.size(), LOAD_DESC_COMPARATOR);
  for (WorkUnit workUnit : workUnits) {
    MultiWorkUnit bestGroup = findAndPopBestFitGroup(workUnit, pQueue, avgGroupSize);
    if (bestGroup != null) {
      addWorkUnitToMultiWorkUnit(workUnit, bestGroup);
    } else {
      bestGroup = MultiWorkUnit.createEmpty();
      addWorkUnitToMultiWorkUnit(workUnit, bestGroup);
    }
    pQueue.add(bestGroup);
  }
  return Lists.newArrayList(pQueue);
}
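A hedged sketch of how this might be driven follows. The derivation of avgGroupSize from a container count is an assumption for illustration; in the packer it is computed elsewhere. getWorkUnitEstSize is the same estimator the methods above rely on.

// Illustrative driver (not from the source): derive avgGroupSize from the total
// estimated load and a desired container count, then pack.
private static List<MultiWorkUnit> packIntoContainers(List<WorkUnit> workUnits, int numContainers) {
  double totalLoad = 0;
  for (WorkUnit workUnit : workUnits) {
    totalLoad += getWorkUnitEstSize(workUnit);
  }
  double avgGroupSize = totalLoad / numContainers; // numContainers is a hypothetical parameter
  return bestFitDecreasingBinPacking(workUnits, avgGroupSize);
}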