Search in sources :

Example 51 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class HelloWorldSource method getWorkunits.

/**
 * Builds the work units for this source: one per configured "hello", all sharing a single
 * {@link Extract}, with each unit carrying its 1-based index under {@code HELLO_ID_FULL_KEY}.
 *
 * @param state source state whose properties are converted to a {@link Config}; the count is
 *              read from {@code NUM_HELLOS_KEY} inside {@code CONFIG_NAMESPACE}, falling back
 *              to {@code DEFAULT_NUM_HELLOS} when either path is absent
 * @return the generated work units, in ascending id order
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    Config jobConfig = ConfigUtils.propertiesToConfig(state.getProperties());

    // Scope down to this source's namespace; an absent namespace behaves like an empty config.
    Config helloConfig;
    if (jobConfig.hasPath(CONFIG_NAMESPACE)) {
        helloConfig = jobConfig.getConfig(CONFIG_NAMESPACE);
    } else {
        helloConfig = ConfigFactory.empty();
    }

    int helloCount = DEFAULT_NUM_HELLOS;
    if (helloConfig.hasPath(NUM_HELLOS_KEY)) {
        helloCount = helloConfig.getInt(NUM_HELLOS_KEY);
    }

    // Every work unit is created from the same extract, named after this class and its package.
    Class<HelloWorldSource> sourceClass = HelloWorldSource.class;
    Extract sharedExtract = new Extract(TableType.APPEND_ONLY, sourceClass.getPackage().getName(), sourceClass.getSimpleName());

    List<WorkUnit> result = new ArrayList<>(helloCount);
    for (int helloId = 1; helloId <= helloCount; helloId++) {
        WorkUnit unit = new WorkUnit(sharedExtract);
        unit.setProp(HELLO_ID_FULL_KEY, helloId);
        result.add(unit);
    }
    return result;
}
Also used : Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Extract(org.apache.gobblin.source.workunit.Extract) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 52 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class StressTestingSource method getWorkunits.

/**
 * Builds the requested number of work units, all created from one shared {@link Extract}.
 *
 * @param state source state; the count is read from {@code NUM_WORK_UNITS_KEY} and defaults
 *              to {@code DEFAULT_NUM_WORK_UNITS}
 * @return a list holding one freshly created work unit per requested unit
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    final int requested = state.getPropAsInt(NUM_WORK_UNITS_KEY, DEFAULT_NUM_WORK_UNITS);

    // The extract is named after this class and its package and is shared by every work unit.
    final Class<StressTestingSource> sourceClass = StressTestingSource.class;
    final Extract sharedExtract = new Extract(TableType.APPEND_ONLY, sourceClass.getPackage().getName(), sourceClass.getSimpleName());

    final List<WorkUnit> result = new ArrayList<>(requested);
    for (int remaining = requested; remaining > 0; remaining--) {
        result.add(new WorkUnit(sharedExtract));
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Extract(org.apache.gobblin.source.workunit.Extract) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 53 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class SalesforceSourceTest method testSourceLineageInfo.

/**
 * Verifies that work units generated by {@code SalesforceSource} for the "contacts" entity
 * carry lineage properties: the JSON-serialized source dataset descriptor and the lineage name.
 */
@Test
void testSourceLineageInfo() {
    SourceState sourceState = new SourceState();
    sourceState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "salesforce");
    sourceState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, "snapshot_append");
    sourceState.setProp(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, true);
    sourceState.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, "20140213000000,20170407152123");
    sourceState.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "SNAPSHOT");
    QueryBasedSource.SourceEntity sourceEntity = QueryBasedSource.SourceEntity.fromSourceEntityName("contacts");
    SalesforceSource source = new SalesforceSource(new LineageInfo(ConfigFactory.empty()));
    List<WorkUnit> workUnits = source.generateWorkUnits(sourceEntity, sourceState, 20140213000000L);
    Assert.assertEquals(workUnits.size(), 1);

    WorkUnit workUnit = workUnits.get(0);
    DatasetDescriptor sourceDataset = new DatasetDescriptor("salesforce", "contacts");
    Gson gson = new Gson();
    // Arguments are (actual, expected), matching the other assertions in this test, so that a
    // failure message labels the two values correctly.
    // NOTE(review): assumes Assert is org.testng.Assert — confirm against the file's imports.
    Assert.assertEquals(workUnit.getProp("gobblin.event.lineage.source"), gson.toJson(sourceDataset));
    Assert.assertEquals(workUnit.getProp("gobblin.event.lineage.name"), "contacts");
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) QueryBasedSource(org.apache.gobblin.source.extractor.extract.QueryBasedSource) DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) Gson(com.google.gson.Gson) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) LineageInfo(org.apache.gobblin.metrics.event.lineage.LineageInfo) Test(org.testng.annotations.Test)

Example 54 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class WorstFitDecreasingBinPacking method pack.

/**
 * Groups the given work units into {@link MultiWorkUnit}s so that, where possible, the total
 * weight of each group stays within {@code maxWeightPerUnit}.
 *
 * <p>Work units are processed in the reverse of {@code WeightComparator} order and each one is
 * offered to the multi work unit at the head of a priority queue ordered by
 * {@code MultiWorkUnitComparator}; if it does not fit there, it gets a new multi work unit of
 * its own. NOTE(review): presumably this means heaviest-first into the lightest bin — confirm
 * against the two comparators, which are not visible here.
 *
 * @param workUnitsIn the work units to pack; the list itself is not modified (a copy is sorted)
 * @param weighter    supplies the weight of each work unit
 * @return {@code workUnitsIn} itself when {@code maxWeightPerUnit <= 0}; otherwise a new list
 *         containing only the multi work units whose weight is greater than zero
 */
@Override
@OverridingMethodsMustInvokeSuper
public List<WorkUnit> pack(List<WorkUnit> workUnitsIn, WorkUnitWeighter weighter) {
    if (this.maxWeightPerUnit <= 0) {
        // packing is disabled for a non-positive limit: just return the input
        return workUnitsIn;
    }
    // work on a copy so that sorting below does not reorder the caller's list
    List<WorkUnit> workUnits = Lists.newArrayList(workUnitsIn);
    // total size of work units no larger than maxWeightPerUnit
    long smallUnitSize = 0;
    // number of work units larger than maxWeightPerUnit
    int largeUnits = 0;
    for (WorkUnit workUnit : workUnits) {
        long weight = weighter.weight(workUnit);
        if (weight <= this.maxWeightPerUnit) {
            smallUnitSize += weight;
        } else {
            largeUnits++;
        }
    }
    // one bin per large unit, plus the small-unit total divided by the limit (offset by one to
    // round up)
    int estimateByWeight = largeUnits + (int) ((smallUnitSize - 1) / this.maxWeightPerUnit) + 1;
    // never pre-create more bins than there are work units
    int estimatedMultiWorkUnits = Math.min(estimateByWeight, workUnits.size());
    MinMaxPriorityQueue<MultiWorkUnit> pQueue = MinMaxPriorityQueue.orderedBy(new MultiWorkUnitComparator()).create();
    for (int i = 0; i < estimatedMultiWorkUnits; i++) {
        pQueue.add(MultiWorkUnit.createEmpty());
    }
    Collections.sort(workUnits, Collections.reverseOrder(new WeightComparator(weighter)));
    for (WorkUnit workUnit : workUnits) {
        MultiWorkUnit lightestMultiWorkUnit = pQueue.peek();
        // every work unit counts for at least 1, so a bin that received one never weighs 0
        long weight = Math.max(1, weighter.weight(workUnit));
        long multiWorkUnitWeight = getMultiWorkUnitWeight(lightestMultiWorkUnit);
        // An empty bin always accepts the work unit, even one heavier than the limit. Otherwise
        // the sum must stay within the limit; the second comparison rejects a sum that wrapped
        // around (long overflow), since weight >= 1 makes a non-overflowing sum strictly larger.
        if (multiWorkUnitWeight == 0 || (weight + multiWorkUnitWeight <= this.maxWeightPerUnit && weight + multiWorkUnitWeight > multiWorkUnitWeight)) {
            // it fits: add it to the head bin, then remove and re-insert that bin so the queue
            // re-orders it by its new weight
            addToMultiWorkUnit(lightestMultiWorkUnit, workUnit, weight);
            pQueue.poll();
            pQueue.add(lightestMultiWorkUnit);
        } else {
            // if doesn't fit in lightest multi work unit, create a new work unit for it
            MultiWorkUnit newMultiWorkUnit = MultiWorkUnit.createEmpty();
            addToMultiWorkUnit(newMultiWorkUnit, workUnit, weight);
            pQueue.add(newMultiWorkUnit);
        }
    }
    // drop pre-created bins that never received a work unit (their weight is still 0)
    return Lists.<WorkUnit>newArrayList(Iterables.filter(pQueue, new Predicate<MultiWorkUnit>() {

        @Override
        @edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NP_PARAMETER_MUST_BE_NONNULL_BUT_MARKED_AS_NULLABLE", justification = "Allowing nullable values")
        public boolean apply(@Nullable MultiWorkUnit input) {
            return getMultiWorkUnitWeight(input) > 0;
        }
    }));
}
Also used : MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Nullable(javax.annotation.Nullable) Predicate(com.google.common.base.Predicate) OverridingMethodsMustInvokeSuper(javax.annotation.OverridingMethodsMustInvokeSuper)

Example 55 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class TestStressTestingSource method testRunDuration.

/**
 * Checks that, when a run duration is configured, the extractor keeps producing records of the
 * configured size and stops within one second of the requested duration.
 *
 * <p>The test is currently disabled via {@code enabled = false}.
 *
 * @throws DataRecordException if the extractor fails to read a record
 * @throws IOException if the extractor cannot be created or read
 */
@Test(enabled = false)
public void testRunDuration() throws DataRecordException, IOException {
    final int MEM_ALLOC_BYTES = 100;
    final int NUM_WORK_UNITS = 1;
    final int SLEEP_TIME_MICRO = 1000;
    // this config is ignored since the duration is set
    final int NUM_RECORDS = 30;
    final int RUN_DURATION_SECS = 5;
    // declared as long so the duration arithmetic below cannot overflow int
    final long MICROS_PER_SECOND = 1000000L;
    SourceState state = new SourceState();
    state.setProp(StressTestingSource.NUM_WORK_UNITS_KEY, NUM_WORK_UNITS);
    state.setProp(StressTestingSource.MEM_ALLOC_BYTES_KEY, MEM_ALLOC_BYTES);
    state.setProp(StressTestingSource.SLEEP_TIME_MICRO_KEY, SLEEP_TIME_MICRO);
    state.setProp(StressTestingSource.NUM_RECORDS_KEY, NUM_RECORDS);
    state.setProp(StressTestingSource.RUN_DURATION_KEY, RUN_DURATION_SECS);
    StressTestingSource source = new StressTestingSource();
    List<WorkUnit> wus = source.getWorkunits(state);
    Assert.assertEquals(wus.size(), NUM_WORK_UNITS);
    WorkUnit wu = wus.get(0);
    WorkUnitState wuState = new WorkUnitState(wu, state);
    Extractor<String, byte[]> extractor = source.getExtractor(wuState);
    byte[] record;
    long startTimeNano = System.nanoTime();
    while ((record = extractor.readRecord(null)) != null) {
        // compare against the configured size rather than a repeated literal, so the check
        // follows MEM_ALLOC_BYTES if it is ever changed
        Assert.assertEquals(record.length, MEM_ALLOC_BYTES);
    }
    long endTimeNano = System.nanoTime();
    long timeSpentMicro = (endTimeNano - startTimeNano) / 1000;
    long expectedMicro = RUN_DURATION_SECS * MICROS_PER_SECOND;
    // check that there is less than 1 second difference between expected and actual time spent
    Assert.assertTrue(Math.abs(timeSpentMicro - expectedMicro) < MICROS_PER_SECOND, "Time spent " + timeSpentMicro);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)

Aggregations

WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 133, Test (org.testng.annotations.Test): 59, SourceState (org.apache.gobblin.configuration.SourceState): 40, WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 40, MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 35, Extract (org.apache.gobblin.source.workunit.Extract): 24, Path (org.apache.hadoop.fs.Path): 19, State (org.apache.gobblin.configuration.State): 13, IOException (java.io.IOException): 11, ArrayList (java.util.ArrayList): 10, Closer (com.google.common.io.Closer): 9, Properties (java.util.Properties): 9, WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 8, List (java.util.List): 7, Table (org.apache.hadoop.hive.ql.metadata.Table): 7, ImmutableMap (com.google.common.collect.ImmutableMap): 6, Config (com.typesafe.config.Config): 6, File (java.io.File): 6, IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder): 6, WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream): 6