Example usage of org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project: the getWorkunits method of the HelloWorldSource class.
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  // Read the hello count from the namespaced config section, if present;
  // otherwise fall back to the compiled-in default.
  Config rootCfg = ConfigUtils.propertiesToConfig(state.getProperties());
  Config cfg = ConfigFactory.empty();
  if (rootCfg.hasPath(CONFIG_NAMESPACE)) {
    cfg = rootCfg.getConfig(CONFIG_NAMESPACE);
  }
  int numHellos = DEFAULT_NUM_HELLOS;
  if (cfg.hasPath(NUM_HELLOS_KEY)) {
    numHellos = cfg.getInt(NUM_HELLOS_KEY);
  }

  // All work units share a single APPEND_ONLY extract identified by this class.
  Extract extract = new Extract(TableType.APPEND_ONLY,
      HelloWorldSource.class.getPackage().getName(),
      HelloWorldSource.class.getSimpleName());

  // One work unit per hello, each tagged with its 1-based id.
  List<WorkUnit> workUnits = new ArrayList<>(numHellos);
  int helloId = 0;
  while (helloId < numHellos) {
    ++helloId;
    WorkUnit workUnit = new WorkUnit(extract);
    workUnit.setProp(HELLO_ID_FULL_KEY, helloId);
    workUnits.add(workUnit);
  }
  return workUnits;
}
Example usage of org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project: the getWorkunits method of the StressTestingSource class.
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  // Number of identical work units to generate, configurable via job state.
  int numWorkUnits = state.getPropAsInt(NUM_WORK_UNITS_KEY, DEFAULT_NUM_WORK_UNITS);

  // Every work unit belongs to the same APPEND_ONLY extract named after this class.
  Extract extract = new Extract(TableType.APPEND_ONLY,
      StressTestingSource.class.getPackage().getName(),
      StressTestingSource.class.getSimpleName());

  List<WorkUnit> workUnits = new ArrayList<>(numWorkUnits);
  int created = 0;
  while (created < numWorkUnits) {
    workUnits.add(new WorkUnit(extract));
    ++created;
  }
  return workUnits;
}
Example usage of org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project: the testSourceLineageInfo method of the SalesforceSourceTest class.
@Test
void testSourceLineageInfo() {
  // Configure a snapshot-append Salesforce extract with user-specified partitions
  // so that generateWorkUnits produces exactly one work unit carrying lineage info.
  SourceState sourceState = new SourceState();
  sourceState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "salesforce");
  sourceState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, "snapshot_append");
  sourceState.setProp(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, true);
  sourceState.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, "20140213000000,20170407152123");
  sourceState.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "SNAPSHOT");
  QueryBasedSource.SourceEntity sourceEntity = QueryBasedSource.SourceEntity.fromSourceEntityName("contacts");
  SalesforceSource source = new SalesforceSource(new LineageInfo(ConfigFactory.empty()));
  List<WorkUnit> workUnits = source.generateWorkUnits(sourceEntity, sourceState, 20140213000000L);
  Assert.assertEquals(workUnits.size(), 1);
  DatasetDescriptor sourceDataset = new DatasetDescriptor("salesforce", "contacts");
  Gson gson = new Gson();
  // TestNG's Assert.assertEquals signature is (actual, expected); keep the actual
  // value first so failure messages label the two sides correctly (the original
  // had them swapped here, inconsistent with the other assertions in this test).
  Assert.assertEquals(workUnits.get(0).getProp("gobblin.event.lineage.source"), gson.toJson(sourceDataset));
  Assert.assertEquals(workUnits.get(0).getProp("gobblin.event.lineage.name"), "contacts");
}
Example usage of org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project: the pack method of the WorstFitDecreasingBinPacking class.
/**
 * Packs the input work units into {@link MultiWorkUnit}s using a worst-fit-decreasing
 * strategy: units are sorted by descending weight and each is placed into the
 * currently lightest bin that can still hold it, or into a fresh bin otherwise.
 * Bins are bounded by {@code this.maxWeightPerUnit}; a non-positive bound disables
 * packing and returns the input unchanged.
 */
@Override
@OverridingMethodsMustInvokeSuper
public List<WorkUnit> pack(List<WorkUnit> workUnitsIn, WorkUnitWeighter weighter) {
if (this.maxWeightPerUnit <= 0) {
// just return the input
return workUnitsIn;
}
// Defensive copy: the input list is sorted below and must not be mutated for the caller.
List<WorkUnit> workUnits = Lists.newArrayList(workUnitsIn);
// total size of work units smaller than maxWeightPerUnit
long smallUnitSize = 0;
// number of work units larger than maxWeightPerUnit
int largeUnits = 0;
for (WorkUnit workUnit : workUnits) {
long weight = weighter.weight(workUnit);
if (weight <= this.maxWeightPerUnit) {
smallUnitSize += weight;
} else {
largeUnits++;
}
}
// Lower bound on bins needed: one per oversized unit, plus ceil(smallUnitSize / maxWeightPerUnit).
// Note: when smallUnitSize == 0 this still contributes 1 (Java integer division truncates
// (0-1)/max to 0); empty bins are filtered out before returning, so this only pre-sizes.
int estimateByWeight = largeUnits + (int) ((smallUnitSize - 1) / this.maxWeightPerUnit) + 1;
int estimatedMultiWorkUnits = Math.min(estimateByWeight, workUnits.size());
// Min-max heap ordered by bin weight; peek() yields the lightest bin.
MinMaxPriorityQueue<MultiWorkUnit> pQueue = MinMaxPriorityQueue.orderedBy(new MultiWorkUnitComparator()).create();
for (int i = 0; i < estimatedMultiWorkUnits; i++) {
pQueue.add(MultiWorkUnit.createEmpty());
}
// Decreasing order: placing heavy units first is what makes this "worst-fit decreasing".
Collections.sort(workUnits, Collections.reverseOrder(new WeightComparator(weighter)));
for (WorkUnit workUnit : workUnits) {
MultiWorkUnit lightestMultiWorkUnit = pQueue.peek();
// Clamp to 1 so zero-weight units still occupy capacity and empty-bin detection works.
long weight = Math.max(1, weighter.weight(workUnit));
long multiWorkUnitWeight = getMultiWorkUnitWeight(lightestMultiWorkUnit);
// An empty bin (weight 0) always accepts the unit, even an oversized one.
// "weight + multiWorkUnitWeight > multiWorkUnitWeight" guards against long overflow.
if (multiWorkUnitWeight == 0 || (weight + multiWorkUnitWeight <= this.maxWeightPerUnit && weight + multiWorkUnitWeight > multiWorkUnitWeight)) {
// check for overflow
// if it fits, add it to lightest work unit
addToMultiWorkUnit(lightestMultiWorkUnit, workUnit, weight);
// Re-insert so the queue re-orders the bin under its new weight.
pQueue.poll();
pQueue.add(lightestMultiWorkUnit);
} else {
// if doesn't fit in lightest multi work unit, create a new work unit for it
MultiWorkUnit newMultiWorkUnit = MultiWorkUnit.createEmpty();
addToMultiWorkUnit(newMultiWorkUnit, workUnit, weight);
pQueue.add(newMultiWorkUnit);
}
}
// Drop bins that never received a unit (possible when the estimate exceeded the need).
return Lists.<WorkUnit>newArrayList(Iterables.filter(pQueue, new Predicate<MultiWorkUnit>() {
@Override
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value = "NP_PARAMETER_MUST_BE_NONNULL_BUT_MARKED_AS_NULLABLE", justification = "Allowing nullable values")
public boolean apply(@Nullable MultiWorkUnit input) {
return getMultiWorkUnitWeight(input) > 0;
}
}));
}
Example usage of org.apache.gobblin.source.workunit.WorkUnit in the Apache incubator-gobblin project: the testRunDuration method of the TestStressTestingSource class.
@Test(enabled = false)
public void testRunDuration() throws DataRecordException, IOException {
  final int MEM_ALLOC_BYTES = 100;
  final int NUM_WORK_UNITS = 1;
  final int SLEEP_TIME_MICRO = 1000;
  // this config is ignored since the duration is set
  final int NUM_RECORDS = 30;
  final int RUN_DURATION_SECS = 5;

  // Build a source configured to emit records for a fixed duration.
  SourceState state = new SourceState();
  state.setProp(StressTestingSource.NUM_WORK_UNITS_KEY, NUM_WORK_UNITS);
  state.setProp(StressTestingSource.MEM_ALLOC_BYTES_KEY, MEM_ALLOC_BYTES);
  state.setProp(StressTestingSource.SLEEP_TIME_MICRO_KEY, SLEEP_TIME_MICRO);
  state.setProp(StressTestingSource.NUM_RECORDS_KEY, NUM_RECORDS);
  state.setProp(StressTestingSource.RUN_DURATION_KEY, RUN_DURATION_SECS);

  StressTestingSource source = new StressTestingSource();
  List<WorkUnit> wus = source.getWorkunits(state);
  Assert.assertEquals(wus.size(), NUM_WORK_UNITS);

  WorkUnit wu = wus.get(0);
  WorkUnitState wuState = new WorkUnitState(wu, state);
  Extractor<String, byte[]> extractor = source.getExtractor(wuState);
  // Close the extractor even if an assertion fails; the original leaked it.
  try {
    byte[] record;
    long startTimeNano = System.nanoTime();
    while ((record = extractor.readRecord(null)) != null) {
      // Each record is MEM_ALLOC_BYTES long.
      Assert.assertEquals(record.length, MEM_ALLOC_BYTES);
    }
    long endTimeNano = System.nanoTime();

    long timeSpentMicro = (endTimeNano - startTimeNano) / (1000);
    // check that there is less than 1 second difference between expected and actual time spent
    Assert.assertTrue(Math.abs(timeSpentMicro - (RUN_DURATION_SECS * 1000000)) < (1000000),
        "Time spent " + timeSpentMicro);
  } finally {
    extractor.close();
  }
}
Aggregations