Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.
The class WorstFitDecreasingBinPacking, method pack.
@Override
@OverridingMethodsMustInvokeSuper
public List<WorkUnit> pack(List<WorkUnit> workUnitsIn, WorkUnitWeighter weighter) {
  if (this.maxWeightPerUnit <= 0) {
    // just return the input
    return workUnitsIn;
  }

  List<WorkUnit> workUnits = Lists.newArrayList(workUnitsIn);

  // total size of work units smaller than maxWeightPerUnit
  long smallUnitSize = 0;
  // number of work units larger than maxWeightPerUnit
  int largeUnits = 0;
  for (WorkUnit workUnit : workUnits) {
    long weight = weighter.weight(workUnit);
    if (weight <= this.maxWeightPerUnit) {
      smallUnitSize += weight;
    } else {
      largeUnits++;
    }
  }

  // estimated bin count: one bin per oversized unit, plus enough bins for the aggregate small-unit weight
  int estimateByWeight = largeUnits + (int) ((smallUnitSize - 1) / this.maxWeightPerUnit) + 1;
  int estimatedMultiWorkUnits = Math.min(estimateByWeight, workUnits.size());

  MinMaxPriorityQueue<MultiWorkUnit> pQueue =
      MinMaxPriorityQueue.orderedBy(new MultiWorkUnitComparator()).create();
  for (int i = 0; i < estimatedMultiWorkUnits; i++) {
    pQueue.add(MultiWorkUnit.createEmpty());
  }

  Collections.sort(workUnits, Collections.reverseOrder(new WeightComparator(weighter)));

  for (WorkUnit workUnit : workUnits) {
    MultiWorkUnit lightestMultiWorkUnit = pQueue.peek();
    long weight = Math.max(1, weighter.weight(workUnit));
    long multiWorkUnitWeight = getMultiWorkUnitWeight(lightestMultiWorkUnit);

    if (multiWorkUnitWeight == 0 || (weight + multiWorkUnitWeight <= this.maxWeightPerUnit
        && weight + multiWorkUnitWeight > multiWorkUnitWeight)) { // check for overflow
      // if it fits, add it to the lightest multi work unit
      addToMultiWorkUnit(lightestMultiWorkUnit, workUnit, weight);
      pQueue.poll();
      pQueue.add(lightestMultiWorkUnit);
    } else {
      // if it doesn't fit in the lightest multi work unit, create a new multi work unit for it
      MultiWorkUnit newMultiWorkUnit = MultiWorkUnit.createEmpty();
      addToMultiWorkUnit(newMultiWorkUnit, workUnit, weight);
      pQueue.add(newMultiWorkUnit);
    }
  }

  return Lists.<WorkUnit>newArrayList(Iterables.filter(pQueue, new Predicate<MultiWorkUnit>() {
    @Override
    @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NP_PARAMETER_MUST_BE_NONNULL_BUT_MARKED_AS_NULLABLE",
        justification = "Allowing nullable values")
    public boolean apply(@Nullable MultiWorkUnit input) {
      return getMultiWorkUnitWeight(input) > 0;
    }
  }));
}
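For orientation, here is a minimal usage sketch of this packer. The single-argument constructor and the "estimated.record.count" property are assumptions made for illustration; the real weight cap and weighter come from the caller's configuration.

import java.util.List;

import org.apache.gobblin.source.workunit.WorkUnit;

public class PackerUsageSketch {
  public static List<WorkUnit> packByRecordCount(List<WorkUnit> input) {
    // Hypothetical weighter: weigh each work unit by an estimated record count,
    // defaulting to 1 so units without an estimate still occupy a slot.
    WorkUnitWeighter byRecords = new WorkUnitWeighter() {
      @Override
      public long weight(WorkUnit workUnit) {
        return workUnit.getPropAsLong("estimated.record.count", 1L); // hypothetical property name
      }
    };
    // Assumed constructor argument: cap each MultiWorkUnit at 1000 weight units.
    WorstFitDecreasingBinPacking packer = new WorstFitDecreasingBinPacking(1000L);
    return packer.pack(input, byRecords);
  }
}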
Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.
The class DatePartitionedAvroFileExtractorTest, method testJobStateNotCopiedToWorkUnit.
@Test
public void testJobStateNotCopiedToWorkUnit() {
  DatePartitionedAvroFileSource source = new DatePartitionedAvroFileSource();

  SourceState state = new SourceState();
  state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
  state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, SOURCE_ENTITY);
  state.setProp(ConfigurationKeys.SOURCE_FILEBASED_DATA_DIRECTORY, OUTPUT_DIR + Path.SEPARATOR + SOURCE_ENTITY);
  state.setProp(ConfigurationKeys.SOURCE_ENTITY, SOURCE_ENTITY);
  state.setProp(ConfigurationKeys.SOURCE_MAX_NUMBER_OF_PARTITIONS, 2);
  state.setProp("date.partitioned.source.partition.pattern", DATE_PATTERN);
  state.setProp("date.partitioned.source.min.watermark.value",
      DateTimeFormat.forPattern(DATE_PATTERN).print(this.startDateTime.minusMinutes(1)));
  state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, TableType.SNAPSHOT_ONLY);
  state.setProp("date.partitioned.source.partition.prefix", PREFIX);
  state.setProp("date.partitioned.source.partition.suffix", SUFFIX);

  String dummyKey = "dummy.job.config";
  state.setProp(dummyKey, "dummy");

  List<WorkUnit> workunits = source.getWorkunits(state);
  Assert.assertEquals(workunits.size(), 4);

  for (WorkUnit wu : workunits) {
    if (wu instanceof MultiWorkUnit) {
      for (WorkUnit workUnit : ((MultiWorkUnit) wu).getWorkUnits()) {
        Assert.assertFalse(workUnit.contains(dummyKey));
      }
    } else {
      Assert.assertFalse(wu.contains(dummyKey));
    }
  }
}
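The instanceof branching at the end of this test is the standard pattern for consuming a packed work-unit list, since getWorkunits may return a mix of plain WorkUnits and MultiWorkUnits. An illustrative helper (not part of the source) that flattens such a list into uniform WorkUnits might look like this:

import java.util.ArrayList;
import java.util.List;

import org.apache.gobblin.source.workunit.MultiWorkUnit;
import org.apache.gobblin.source.workunit.WorkUnit;

public final class WorkUnitFlattener {
  // Expand each MultiWorkUnit into its constituent WorkUnits; plain WorkUnits pass through.
  public static List<WorkUnit> flatten(List<WorkUnit> workUnits) {
    List<WorkUnit> flat = new ArrayList<>();
    for (WorkUnit wu : workUnits) {
      if (wu instanceof MultiWorkUnit) {
        flat.addAll(((MultiWorkUnit) wu).getWorkUnits());
      } else {
        flat.add(wu);
      }
    }
    return flat;
  }
}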
Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.
The class DatePartitionedAvroFileExtractorTest, method verifyWorkUnits.
private void verifyWorkUnits(List<WorkUnit> workunits, int expectedSize) throws DataRecordException, IOException {
  for (int i = 0; i < expectedSize; i++) {
    WorkUnit workUnit = ((MultiWorkUnit) workunits.get(i)).getWorkUnits().get(0);
    WorkUnitState wuState = new WorkUnitState(workunits.get(i), new State());
    wuState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    wuState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL,
        workUnit.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
    try (DatePartitionedAvroFileExtractor extractor = new DatePartitionedAvroFileExtractor(wuState)) {
      GenericRecord record = extractor.readRecord(null);
      Assert.assertEquals(recordTimestamps[i], record.get(PARTITION_COLUMN_NAME));
      Assert.assertEquals(recordTimestamps[i], workUnit.getPropAsLong(ConfigurationKeys.WORK_UNIT_DATE_PARTITION_KEY));
    }
  }
}
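Note the layering in `new WorkUnitState(workunits.get(i), new State())`: properties set directly on the WorkUnitState shadow those on the wrapped WorkUnit, which in turn fall back to the job State passed as the second argument. A minimal sketch of that lookup order, assuming the usual fall-through resolution (toy key and values invented for illustration):

import org.apache.gobblin.configuration.State;
import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.source.workunit.WorkUnit;

public class StateLayeringSketch {
  public static void main(String[] args) {
    State jobState = new State();
    jobState.setProp("key", "fromJobState");

    WorkUnit workUnit = WorkUnit.createEmpty();
    WorkUnitState wuState = new WorkUnitState(workUnit, jobState);

    // The work unit does not define "key", so lookup falls through to the job state.
    System.out.println(wuState.getProp("key")); // prints "fromJobState"

    // Setting the property on the WorkUnitState shadows the job-state value.
    wuState.setProp("key", "fromWorkUnitState");
    System.out.println(wuState.getProp("key")); // prints "fromWorkUnitState"
  }
}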
Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.
The class KafkaBiLevelWorkUnitPacker, method findAndPopBestFitGroup.
/**
 * Find the best group using the best-fit-decreasing algorithm.
 * The best group is the fullest group that has enough capacity for the new {@link WorkUnit}.
 * If no existing group has enough capacity for the new {@link WorkUnit}, return null.
 */
private static MultiWorkUnit findAndPopBestFitGroup(WorkUnit workUnit, PriorityQueue<MultiWorkUnit> pQueue,
    double avgGroupSize) {
  List<MultiWorkUnit> fullWorkUnits = Lists.newArrayList();
  MultiWorkUnit bestFit = null;

  while (!pQueue.isEmpty()) {
    MultiWorkUnit candidate = pQueue.poll();
    if (getWorkUnitEstSize(candidate) + getWorkUnitEstSize(workUnit) <= avgGroupSize) {
      bestFit = candidate;
      break;
    }
    fullWorkUnits.add(candidate);
  }

  for (MultiWorkUnit fullWorkUnit : fullWorkUnits) {
    pQueue.add(fullWorkUnit);
  }

  return bestFit;
}
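Because the queue is ordered by descending load, the first polled group that still fits is the fullest viable one; groups that were too full are set aside and re-added afterward. The same pop-until-fit / re-add pattern, stripped of Gobblin types, can be sketched on plain longs (this standalone version is for illustration only):

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

public class BestFitSketch {
  // Pop bins from fullest to emptiest until one can still take `item`;
  // bins that were too full go back into the queue unchanged.
  static Long findAndPopBestFit(long item, PriorityQueue<Long> bins, double capacity) {
    List<Long> tooFull = new ArrayList<>();
    Long bestFit = null;
    while (!bins.isEmpty()) {
      long bin = bins.poll();
      if (bin + item <= capacity) {
        bestFit = bin;
        break;
      }
      tooFull.add(bin);
    }
    bins.addAll(tooFull);
    return bestFit;
  }

  public static void main(String[] args) {
    PriorityQueue<Long> bins = new PriorityQueue<>(Comparator.reverseOrder());
    bins.add(9L);
    bins.add(7L);
    bins.add(4L);
    // With capacity 10, an item of size 3 fits the fullest viable bin: 7 (9 + 3 would overflow).
    System.out.println(findAndPopBestFit(3L, bins, 10.0)); // prints 7
  }
}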
Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.
The class KafkaBiLevelWorkUnitPacker, method bestFitDecreasingBinPacking.
/**
 * Group {@link WorkUnit}s into groups. Each group is a {@link MultiWorkUnit}. Each group has a capacity of
 * avgGroupSize. If there's a single {@link WorkUnit} whose size is larger than avgGroupSize, it forms a group itself.
 */
private static List<MultiWorkUnit> bestFitDecreasingBinPacking(List<WorkUnit> workUnits, double avgGroupSize) {
  // Sort work units by data size, descending
  Collections.sort(workUnits, LOAD_DESC_COMPARATOR);

  PriorityQueue<MultiWorkUnit> pQueue = new PriorityQueue<>(workUnits.size(), LOAD_DESC_COMPARATOR);
  for (WorkUnit workUnit : workUnits) {
    MultiWorkUnit bestGroup = findAndPopBestFitGroup(workUnit, pQueue, avgGroupSize);
    if (bestGroup != null) {
      addWorkUnitToMultiWorkUnit(workUnit, bestGroup);
    } else {
      bestGroup = MultiWorkUnit.createEmpty();
      addWorkUnitToMultiWorkUnit(workUnit, bestGroup);
    }
    pQueue.add(bestGroup);
  }
  return Lists.newArrayList(pQueue);
}
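A hedged sketch of how this might be driven follows. The derivation of avgGroupSize from a container count is an assumption for illustration; in the packer it is computed elsewhere. getWorkUnitEstSize is the same estimator the methods above rely on.

// Illustrative driver (not from the source): derive avgGroupSize from the total
// estimated load and a desired container count, then pack.
private static List<MultiWorkUnit> packIntoContainers(List<WorkUnit> workUnits, int numContainers) {
  double totalLoad = 0;
  for (WorkUnit workUnit : workUnits) {
    totalLoad += getWorkUnitEstSize(workUnit);
  }
  double avgGroupSize = totalLoad / numContainers; // numContainers is a hypothetical parameter
  return bestFitDecreasingBinPacking(workUnits, avgGroupSize);
}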