
Example 1 with MultiWorkUnit

Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.

Class SingleTask, method getWorkUnits:

private List<WorkUnit> getWorkUnits() throws IOException {
    String fileName = _workUnitFilePath.getName();
    String storeName = _workUnitFilePath.getParent().getName();
    WorkUnit workUnit;
    if (_workUnitFilePath.getName().endsWith(AbstractJobLauncher.MULTI_WORK_UNIT_FILE_EXTENSION)) {
        workUnit = _stateStores.getMwuStateStore().getAll(storeName, fileName).get(0);
    } else {
        workUnit = _stateStores.getWuStateStore().getAll(storeName, fileName).get(0);
    }
    // The list of individual WorkUnits (flattened) to run
    List<WorkUnit> workUnits = Lists.newArrayList();
    if (workUnit instanceof MultiWorkUnit) {
        // Flatten the MultiWorkUnit so the job configuration properties can be added to each individual WorkUnit
        List<WorkUnit> flattenedWorkUnits = JobLauncherUtils.flattenWorkUnits(((MultiWorkUnit) workUnit).getWorkUnits());
        workUnits.addAll(flattenedWorkUnits);
    } else {
        workUnits.add(workUnit);
    }
    return workUnits;
}
Also used: MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit), WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
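
The call to JobLauncherUtils.flattenWorkUnits is what turns a possibly nested MultiWorkUnit into a flat list of plain WorkUnits before task creation. As a rough illustration (a sketch, not the actual Gobblin implementation), the recursive flattening idea could look like this:

import java.util.List;

import com.google.common.collect.Lists;

import org.apache.gobblin.source.workunit.MultiWorkUnit;
import org.apache.gobblin.source.workunit.WorkUnit;

public class WorkUnitFlattenerSketch {

    // Recursively expand any MultiWorkUnit into its leaf WorkUnits.
    // Hypothetical helper; the real logic lives in JobLauncherUtils.flattenWorkUnits.
    public static List<WorkUnit> flatten(List<WorkUnit> workUnits) {
        List<WorkUnit> flattened = Lists.newArrayList();
        for (WorkUnit workUnit : workUnits) {
            if (workUnit instanceof MultiWorkUnit) {
                // A MultiWorkUnit may itself contain MultiWorkUnits, so recurse.
                flattened.addAll(flatten(((MultiWorkUnit) workUnit).getWorkUnits()));
            } else {
                flattened.add(workUnit);
            }
        }
        return flattened;
    }
}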

Example 2 with MultiWorkUnit

Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.

Class KafkaBiLevelWorkUnitPacker, method pack:

@Override
public List<WorkUnit> pack(Map<String, List<WorkUnit>> workUnitsByTopic, int numContainers) {
    double totalEstDataSize = setWorkUnitEstSizes(workUnitsByTopic);
    double avgGroupSize = totalEstDataSize / numContainers / getPreGroupingSizeFactor(this.state);
    List<MultiWorkUnit> mwuGroups = Lists.newArrayList();
    for (List<WorkUnit> workUnitsForTopic : workUnitsByTopic.values()) {
        double estimatedDataSizeForTopic = calcTotalEstSizeForTopic(workUnitsForTopic);
        if (estimatedDataSizeForTopic < avgGroupSize) {
            // If the total estimated size of a topic is smaller than the average group size, put all
            // partitions of this topic in a single group.
            MultiWorkUnit mwuGroup = MultiWorkUnit.createEmpty();
            addWorkUnitsToMultiWorkUnit(workUnitsForTopic, mwuGroup);
            mwuGroups.add(mwuGroup);
        } else {
            // Use best-fit-decreasing to group workunits for a topic into multiple groups.
            mwuGroups.addAll(bestFitDecreasingBinPacking(workUnitsForTopic, avgGroupSize));
        }
    }
    List<WorkUnit> groups = squeezeMultiWorkUnits(mwuGroups);
    return worstFitDecreasingBinPacking(groups, numContainers);
}
Also used: MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit), WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
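
The notable part of this packer is its two levels: partitions of one topic are first grouped with best-fit-decreasing against an average group size, and the resulting groups are then spread across containers with worst-fit-decreasing. A minimal, self-contained sketch of best-fit-decreasing over plain sizes (illustrative only, not the Gobblin bestFitDecreasingBinPacking method) might look like this:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class BestFitDecreasingSketch {

    // Pack item sizes into bins of the given capacity using best-fit-decreasing:
    // sort items largest first, then place each item into the fullest bin that still fits,
    // opening a new bin only when no existing bin has room.
    public static List<List<Double>> pack(List<Double> sizes, double capacity) {
        List<Double> sorted = new ArrayList<>(sizes);
        sorted.sort(Collections.reverseOrder());

        List<List<Double>> bins = new ArrayList<>();
        List<Double> loads = new ArrayList<>();

        for (double size : sorted) {
            int bestBin = -1;
            double bestLoad = -1.0;
            for (int i = 0; i < bins.size(); i++) {
                double load = loads.get(i);
                if (load + size <= capacity && load > bestLoad) {
                    bestBin = i;
                    bestLoad = load;
                }
            }
            if (bestBin < 0) {
                // No open bin can take this item (or there are no bins yet), so open a new one.
                bins.add(new ArrayList<>());
                loads.add(0.0);
                bestBin = bins.size() - 1;
            }
            bins.get(bestBin).add(size);
            loads.set(bestBin, loads.get(bestBin) + size);
        }
        return bins;
    }
}

In the packer above, the sizes would correspond to the estimated data sizes of a topic's work units and the capacity to avgGroupSize.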

Example 3 with MultiWorkUnit

Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.

Class KafkaSingleLevelWorkUnitPacker, method pack:

@Override
public List<WorkUnit> pack(Map<String, List<WorkUnit>> workUnitsByTopic, int numContainers) {
    setWorkUnitEstSizes(workUnitsByTopic);
    List<WorkUnit> workUnits = Lists.newArrayList();
    for (List<WorkUnit> workUnitsForTopic : workUnitsByTopic.values()) {
        // For each topic, merge all empty workunits into a single workunit, so that a single
        // empty task will be created instead of many.
        MultiWorkUnit zeroSizeWorkUnit = MultiWorkUnit.createEmpty();
        for (WorkUnit workUnit : workUnitsForTopic) {
            if (DoubleMath.fuzzyEquals(getWorkUnitEstSize(workUnit), 0.0, EPS)) {
                addWorkUnitToMultiWorkUnit(workUnit, zeroSizeWorkUnit);
            } else {
                workUnit.setWatermarkInterval(getWatermarkIntervalFromWorkUnit(workUnit));
                workUnits.add(workUnit);
            }
        }
        if (!zeroSizeWorkUnit.getWorkUnits().isEmpty()) {
            workUnits.add(squeezeMultiWorkUnit(zeroSizeWorkUnit));
        }
    }
    return worstFitDecreasingBinPacking(workUnits, numContainers);
}
Also used: MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit), WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
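
Both packers hand their final list to worstFitDecreasingBinPacking, which assigns the already-sized work units to a fixed number of containers by always giving the next largest item to the currently lightest container. A hedged, standalone sketch over plain sizes (not the actual Gobblin method), using a priority queue keyed on container load:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.PriorityQueue;

public class WorstFitDecreasingSketch {

    private static final class Bin {
        final List<Double> items = new ArrayList<>();
        double load = 0.0;
    }

    // Distribute item sizes over a fixed number of bins using worst-fit-decreasing:
    // sort items largest first, then always add the next item to the least-loaded bin.
    public static List<List<Double>> pack(List<Double> sizes, int numBins) {
        List<Double> sorted = new ArrayList<>(sizes);
        sorted.sort(Collections.reverseOrder());

        PriorityQueue<Bin> bins = new PriorityQueue<>((a, b) -> Double.compare(a.load, b.load));
        for (int i = 0; i < numBins; i++) {
            bins.add(new Bin());
        }

        for (double size : sorted) {
            Bin lightest = bins.poll();   // bin with the smallest current load
            lightest.items.add(size);
            lightest.load += size;
            bins.add(lightest);           // reinsert so the queue ordering stays correct
        }

        List<List<Double>> result = new ArrayList<>();
        for (Bin bin : bins) {
            result.add(bin.items);
        }
        return result;
    }
}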

Example 4 with MultiWorkUnit

Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.

Class MultiWorkUnitUnpackingIteratorTest, method createWorkUnit:

private WorkUnit createWorkUnit(String... names) {
    if (names.length == 1) {
        WorkUnit workUnit = new WorkUnit();
        workUnit.setProp(WORK_UNIT_NAME, names[0]);
        return workUnit;
    }
    MultiWorkUnit mwu = new MultiWorkUnit();
    for (String name : names) {
        mwu.addWorkUnit(createWorkUnit(name));
    }
    return mwu;
}
Also used: MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit), WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
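
Because the helper returns a plain WorkUnit for a single name and a MultiWorkUnit for several, it can be composed to build nested structures for iterator tests. A hypothetical usage inside the same test class (the names and the nesting shape are illustrative):

private WorkUnit createNestedWorkUnit() {
    // Hypothetical test snippet built on the createWorkUnit helper above.
    WorkUnit single = createWorkUnit("wu1");
    // Passing several names returns a MultiWorkUnit.
    WorkUnit grouped = createWorkUnit("wu2", "wu3", "wu4");

    MultiWorkUnit topLevel = new MultiWorkUnit();
    topLevel.addWorkUnit(single);
    topLevel.addWorkUnit(grouped);
    // Unpacking or flattening topLevel should yield wu1, wu2, wu3, wu4 in order.
    return topLevel;
}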

Example 5 with MultiWorkUnit

Use of org.apache.gobblin.source.workunit.MultiWorkUnit in project incubator-gobblin by apache.

Class TestSource, method getWorkunits:

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    String nameSpace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY);
    Extract extract1 = createExtract(TableType.SNAPSHOT_ONLY, nameSpace, "TestTable1");
    Extract extract2 = createExtract(TableType.SNAPSHOT_ONLY, nameSpace, "TestTable2");
    String sourceFileList = state.getProp(SOURCE_FILE_LIST_KEY);
    List<String> list = SPLITTER.splitToList(sourceFileList);
    List<WorkUnit> workUnits = Lists.newArrayList();
    for (int i = 0; i < list.size(); i++) {
        WorkUnit workUnit = WorkUnit.create(i % 2 == 0 ? extract1 : extract2);
        workUnit.setProp(SOURCE_FILE_KEY, list.get(i));
        workUnits.add(workUnit);
    }
    if (state.getPropAsBoolean("use.multiworkunit", false)) {
        MultiWorkUnit multiWorkUnit = MultiWorkUnit.createEmpty();
        multiWorkUnit.addWorkUnits(workUnits);
        workUnits.clear();
        workUnits.add(multiWorkUnit);
    }
    return workUnits;
}
Also used: Extract(org.apache.gobblin.source.workunit.Extract), MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit), WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
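
The closing if-block shows a common pattern: optionally collapsing a list of work units into a single MultiWorkUnit so they are scheduled as one task. A hedged, standalone sketch of just that wrapping step (the helper name is illustrative, not part of Gobblin):

import java.util.List;

import com.google.common.collect.Lists;

import org.apache.gobblin.source.workunit.MultiWorkUnit;
import org.apache.gobblin.source.workunit.WorkUnit;

public class WorkUnitWrappingSketch {

    // Hypothetical helper: wrap all work units into a single MultiWorkUnit when requested,
    // otherwise return the list unchanged.
    public static List<WorkUnit> maybeWrap(List<WorkUnit> workUnits, boolean useMultiWorkUnit) {
        if (!useMultiWorkUnit) {
            return workUnits;
        }
        MultiWorkUnit multiWorkUnit = MultiWorkUnit.createEmpty();
        multiWorkUnit.addWorkUnits(workUnits);
        return Lists.<WorkUnit>newArrayList(multiWorkUnit);
    }
}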

Aggregations

MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 23
WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 20
Path (org.apache.hadoop.fs.Path): 5
Closer (com.google.common.io.Closer): 4
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 3
Test (org.testng.annotations.Test): 3
DataInputStream (java.io.DataInputStream): 2
SourceState (org.apache.gobblin.configuration.SourceState): 2
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 2
MultiLongWatermark (org.apache.gobblin.source.extractor.extract.kafka.MultiLongWatermark): 2
ParallelRunner (org.apache.gobblin.util.ParallelRunner): 2
Configuration (org.apache.hadoop.conf.Configuration): 2
BeforeClass (org.testng.annotations.BeforeClass): 2
Predicate (com.google.common.base.Predicate): 1
ByteArrayInputStream (java.io.ByteArrayInputStream): 1
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 1
DataOutputStream (java.io.DataOutputStream): 1
IOException (java.io.IOException): 1
URI (java.net.URI): 1
PriorityQueue (java.util.PriorityQueue): 1