Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache: the class DatePartitionedAvroFileExtractorTest, method verifyWorkUnits.
private void verifyWorkUnits(List<WorkUnit> workunits, int expectedSize) throws DataRecordException, IOException {
  for (int i = 0; i < expectedSize; i++) {
    WorkUnit workUnit = ((MultiWorkUnit) workunits.get(i)).getWorkUnits().get(0);
    WorkUnitState wuState = new WorkUnitState(workunits.get(i), new State());
    wuState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
    wuState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL,
        workUnit.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
    try (DatePartitionedAvroFileExtractor extractor = new DatePartitionedAvroFileExtractor(wuState)) {
      GenericRecord record = extractor.readRecord(null);
      Assert.assertEquals(recordTimestamps[i], record.get(PARTITION_COLUMN_NAME));
      Assert.assertEquals(recordTimestamps[i], workUnit.getPropAsLong(ConfigurationKeys.WORK_UNIT_DATE_PARTITION_KEY));
    }
  }
}
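The check above only opens the first WorkUnit of each packed MultiWorkUnit. A variant that walks every WorkUnit in a group would look roughly like this (a sketch that reuses only the classes and property keys shown above):

for (WorkUnit inner : ((MultiWorkUnit) workunits.get(i)).getWorkUnits()) {
  WorkUnitState innerState = new WorkUnitState(inner, new State());
  innerState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, ConfigurationKeys.LOCAL_FS_URI);
  innerState.setProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL,
      inner.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL));
  try (DatePartitionedAvroFileExtractor extractor = new DatePartitionedAvroFileExtractor(innerState)) {
    // Each record's partition column should match the date partition recorded on its own WorkUnit.
    GenericRecord record = extractor.readRecord(null);
    Assert.assertEquals(record.get(PARTITION_COLUMN_NAME),
        inner.getPropAsLong(ConfigurationKeys.WORK_UNIT_DATE_PARTITION_KEY));
  }
}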
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache: the class FileBasedSourceTest, method numberOfWorkUnits.
@Test
void numberOfWorkUnits() throws IOException {
  SourceState sourceState = new SourceState();
  DatePartitionedJsonFileSource source = new DatePartitionedJsonFileSource();
  initState(sourceState);
  List<WorkUnit> workUnits = source.getWorkunits(sourceState);
  Assert.assertEquals(3, workUnits.size());
}
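Beyond the count, each returned WorkUnit can also be inspected for the files it was assigned. A minimal sketch using only keys already seen on this page (depending on packing, an element may instead be a MultiWorkUnit wrapping the per-file units, as in the Avro test above):

for (WorkUnit workUnit : workUnits) {
  // For a plain file-based unit, the assigned files are recorded under this key;
  // for a MultiWorkUnit, the key lives on each wrapped WorkUnit instead.
  String filesToPull = workUnit.getProp(ConfigurationKeys.SOURCE_FILEBASED_FILES_TO_PULL);
  System.out.println(filesToPull);
}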
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache: the class HadoopFileInputSourceTest, method testGetWorkUnitsAndExtractor.
@Test
public void testGetWorkUnitsAndExtractor() throws IOException, DataRecordException {
  HadoopFileInputSource<String, Text, LongWritable, Text> fileInputSource = new TestHadoopFileInputSource();

  List<WorkUnit> workUnitList = fileInputSource.getWorkunits(this.sourceState);
  Assert.assertEquals(workUnitList.size(), 1);

  WorkUnitState workUnitState = new WorkUnitState(workUnitList.get(0));

  Closer closer = Closer.create();
  try {
    HadoopFileInputExtractor<String, Text, LongWritable, Text> extractor =
        (HadoopFileInputExtractor<String, Text, LongWritable, Text>) fileInputSource.getExtractor(workUnitState);
    Text text = extractor.readRecord(null);
    Assert.assertEquals(text.toString(), TEXT);
    Assert.assertNull(extractor.readRecord(null));
  } catch (Throwable t) {
    throw closer.rethrow(t);
  } finally {
    closer.close();
  }
}
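As written, the extractor itself is never registered with the Closer, so only the try/catch/rethrow plumbing is exercised. A sketch of the same flow that also closes the extractor (Gobblin extractors implement Closeable, as the try-with-resources usage in the first snippet shows, so Closer.register applies):

Closer closer = Closer.create();
try {
  // register() returns the extractor and guarantees it is closed by closer.close().
  HadoopFileInputExtractor<String, Text, LongWritable, Text> extractor =
      closer.register((HadoopFileInputExtractor<String, Text, LongWritable, Text>) fileInputSource.getExtractor(workUnitState));
  Assert.assertEquals(extractor.readRecord(null).toString(), TEXT);
  Assert.assertNull(extractor.readRecord(null));
} catch (Throwable t) {
  throw closer.rethrow(t);
} finally {
  closer.close();
}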
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache: the class KafkaBiLevelWorkUnitPacker, method bestFitDecreasingBinPacking.
/**
 * Group {@link WorkUnit}s into groups. Each group is a {@link MultiWorkUnit}. Each group has a capacity of
 * avgGroupSize. If there's a single {@link WorkUnit} whose size is larger than avgGroupSize, it forms a group itself.
 */
private static List<MultiWorkUnit> bestFitDecreasingBinPacking(List<WorkUnit> workUnits, double avgGroupSize) {
  // Sort workunits by data size desc
  Collections.sort(workUnits, LOAD_DESC_COMPARATOR);

  PriorityQueue<MultiWorkUnit> pQueue = new PriorityQueue<>(workUnits.size(), LOAD_DESC_COMPARATOR);
  for (WorkUnit workUnit : workUnits) {
    MultiWorkUnit bestGroup = findAndPopBestFitGroup(workUnit, pQueue, avgGroupSize);
    if (bestGroup != null) {
      addWorkUnitToMultiWorkUnit(workUnit, bestGroup);
    } else {
      bestGroup = MultiWorkUnit.createEmpty();
      addWorkUnitToMultiWorkUnit(workUnit, bestGroup);
    }
    pQueue.add(bestGroup);
  }
  return Lists.newArrayList(pQueue);
}
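findAndPopBestFitGroup is not shown in this snippet; the idea behind it is a standard best-fit lookup over the current groups. An illustrative sketch only (the getLoad accessor is a placeholder for however the packer measures a unit's estimated size, not the project's actual helper):

private static MultiWorkUnit findBestFit(WorkUnit workUnit, PriorityQueue<MultiWorkUnit> pQueue, double avgGroupSize) {
  MultiWorkUnit bestFit = null;
  double bestLoad = -1d;
  double unitLoad = getLoad(workUnit);   // hypothetical size accessor
  for (MultiWorkUnit group : pQueue) {
    double groupLoad = getLoad(group);   // hypothetical size accessor
    // Best fit: the fullest group that still has room for this work unit.
    if (groupLoad + unitLoad <= avgGroupSize && groupLoad > bestLoad) {
      bestFit = group;
      bestLoad = groupLoad;
    }
  }
  if (bestFit != null) {
    pQueue.remove(bestFit);              // pop it so the caller can re-insert it with its new load
  }
  return bestFit;                        // null means the work unit starts a group of its own
}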
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache: the class KafkaWorkUnitPacker, method getWatermarkIntervalFromMultiWorkUnit.
@SuppressWarnings("deprecation")
protected static WatermarkInterval getWatermarkIntervalFromMultiWorkUnit(MultiWorkUnit multiWorkUnit) {
  List<Long> lowWatermarkValues = Lists.newArrayList();
  List<Long> expectedHighWatermarkValues = Lists.newArrayList();
  for (WorkUnit workUnit : multiWorkUnit.getWorkUnits()) {
    lowWatermarkValues.add(workUnit.getLowWaterMark());
    expectedHighWatermarkValues.add(workUnit.getHighWaterMark());
  }
  return new WatermarkInterval(new MultiLongWatermark(lowWatermarkValues), new MultiLongWatermark(expectedHighWatermarkValues));
}
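A typical use of the resulting interval is to collapse the group into a single WorkUnit that carries the combined per-partition watermarks. A sketch; it assumes WorkUnit.copyOf and setWatermarkInterval as the copy and watermark-serialization entry points, and the surrounding squeeze logic is only illustrative:

WatermarkInterval interval = getWatermarkIntervalFromMultiWorkUnit(multiWorkUnit);
// Collapse the group into one WorkUnit carrying the combined multi-partition watermarks.
WorkUnit squeezed = WorkUnit.copyOf(multiWorkUnit.getWorkUnits().get(0));
squeezed.setWatermarkInterval(interval);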