Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
The class GobblinTestEventBusWriterTest, method testBuilder:
@Test
public void testBuilder() throws IOException, InterruptedException, TimeoutException {
  final String eventBusId = "/GobblinTestEventBusWriterTest/testBuilder";
  GobblinTestEventBusWriter.Builder writerBuilder = new GobblinTestEventBusWriter.Builder();

  WorkUnit wu = WorkUnit.createEmpty();
  wu.setProp(GobblinTestEventBusWriter.FULL_EVENTBUSID_KEY, eventBusId);
  writerBuilder.writeTo(Destination.of(DestinationType.HDFS, wu));
  Assert.assertEquals(writerBuilder.getEventBusId(), eventBusId);

  try (TestingEventBusAsserter asserter = new TestingEventBusAsserter(eventBusId)) {
    GobblinTestEventBusWriter writer = writerBuilder.build();

    writer.write("event1");
    writer.write("event2");

    asserter.assertNextValueEq("event1");
    asserter.assertNextValueEq("event2");
    Assert.assertEquals(writer.recordsWritten(), 2);
  }
}
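For readers unfamiliar with the property passing shown above, here is a minimal, hedged sketch of a WorkUnit acting as a plain key/value carrier between a source and a writer builder. The key and value below are illustrative only, not part of the writer's contract.

  import org.apache.gobblin.source.workunit.WorkUnit;

  public class WorkUnitPropsSketch {
    public static void main(String[] args) {
      WorkUnit wu = WorkUnit.createEmpty();                // empty WorkUnit, no Extract attached
      wu.setProp("example.eventbus.id", "/Example/bus");   // hypothetical key, mirrors FULL_EVENTBUSID_KEY above
      // Components that receive the WorkUnit (e.g. via Destination.of) read the value back by key.
      System.out.println(wu.getProp("example.eventbus.id"));
    }
  }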
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
The class OldApiHadoopFileInputSourceTest, method testGetWorkUnitsAndExtractor:
@Test
public void testGetWorkUnitsAndExtractor() throws IOException, DataRecordException {
  OldApiHadoopFileInputSource<String, Text, LongWritable, Text> fileInputSource = new TestHadoopFileInputSource();

  List<WorkUnit> workUnitList = fileInputSource.getWorkunits(this.sourceState);
  Assert.assertEquals(workUnitList.size(), 1);

  WorkUnitState workUnitState = new WorkUnitState(workUnitList.get(0));

  Closer closer = Closer.create();
  try {
    OldApiHadoopFileInputExtractor<String, Text, LongWritable, Text> extractor =
        (OldApiHadoopFileInputExtractor<String, Text, LongWritable, Text>) fileInputSource.getExtractor(workUnitState);

    Text text = extractor.readRecord(null);
    Assert.assertEquals(text.toString(), TEXT);
    Assert.assertNull(extractor.readRecord(null));
  } catch (Throwable t) {
    throw closer.rethrow(t);
  } finally {
    closer.close();
  }
}
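Stripped of test assertions, the source-to-extractor handoff that this test exercises looks roughly like the sketch below. It assumes the test's fileInputSource and sourceState fields, assumes the enclosing method declares IOException and DataRecordException, and uses try-with-resources instead of the explicit Closer, since Gobblin's Extractor is Closeable.

  // Minimal sketch of the work-unit handoff (assumptions noted above).
  List<WorkUnit> workUnits = fileInputSource.getWorkunits(this.sourceState);
  for (WorkUnit workUnit : workUnits) {
    WorkUnitState workUnitState = new WorkUnitState(workUnit);       // per-work-unit runtime state
    try (Extractor<String, Text> extractor = fileInputSource.getExtractor(workUnitState)) {
      Text record;
      while ((record = extractor.readRecord(null)) != null) {        // null marks end of data
        // process record
      }
    }
  }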
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
The class HiveMaterializer, method queryResultMaterializationWorkUnit:
/**
 * Create a work unit to materialize a query to a target table using a staging table in between.
 * @param query the query to materialize.
 * @param storageFormat format in which target table should be written.
 * @param destinationTable {@link StageableTableMetadata} specifying staging and target tables metadata.
 */
public static WorkUnit queryResultMaterializationWorkUnit(String query,
    HiveConverterUtils.StorageFormat storageFormat, StageableTableMetadata destinationTable) {
  WorkUnit workUnit = new WorkUnit();
  workUnit.setProp(MATERIALIZER_MODE_KEY, MaterializerMode.QUERY_RESULT_MATERIALIZATION.name());
  workUnit.setProp(STORAGE_FORMAT_KEY, storageFormat.name());
  workUnit.setProp(QUERY_RESULT_TO_MATERIALIZE_KEY, query);
  workUnit.setProp(STAGEABLE_TABLE_METADATA_KEY, HiveSource.GENERICS_AWARE_GSON.toJson(destinationTable));
  TaskUtils.setTaskFactoryClass(workUnit, HiveMaterializerTaskFactory.class);
  HiveTask.disableHiveWatermarker(workUnit);
  return workUnit;
}
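A hedged sketch of how a caller might obtain such a work unit: the query string is made up, the StageableTableMetadata is assumed to be supplied by the caller, and the ORC constant is an assumption about HiveConverterUtils.StorageFormat rather than a confirmed value.

  // Hypothetical caller; query and storage format are illustrative assumptions.
  public static WorkUnit materializeDailyAggregate(StageableTableMetadata destinationTable) {
    String query = "SELECT id, COUNT(*) AS cnt FROM src_db.src_table GROUP BY id";
    return HiveMaterializer.queryResultMaterializationWorkUnit(
        query,
        HiveConverterUtils.StorageFormat.ORC,   // assumed enum constant
        destinationTable);                      // staging/target table metadata from the caller
  }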
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
The class DatasetFinderSource, method createWorkUnitStream:
private Stream<WorkUnit> createWorkUnitStream(SourceState state) throws IOException {
  IterableDatasetFinder datasetsFinder = createDatasetsFinder(state);
  Stream<Dataset> datasetStream = datasetsFinder.getDatasetsStream(0, null);

  if (this.drilldownIntoPartitions) {
    return datasetStream.flatMap(dataset -> {
      if (dataset instanceof PartitionableDataset) {
        try {
          return (Stream<PartitionableDataset.DatasetPartition>) ((PartitionableDataset) dataset).getPartitions(0, null);
        } catch (IOException ioe) {
          log.error("Failed to get partitions for dataset " + dataset.getUrn());
          return Stream.empty();
        }
      } else {
        return Stream.of(new DatasetWrapper(dataset));
      }
    }).map(this::workUnitForPartitionInternal);
  } else {
    return datasetStream.map(this::workUnitForDataset);
  }
}
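As a rough illustration of how this stream could satisfy the list-based Source contract (not necessarily how DatasetFinderSource itself exposes it), the result can simply be collected:

  // Sketch only: collect the stream into the List<WorkUnit> that Source.getWorkunits returns.
  @Override
  public List<WorkUnit> getWorkunits(SourceState state) {
    try {
      return createWorkUnitStream(state).collect(Collectors.toList());
    } catch (IOException ioe) {
      throw new RuntimeException("Failed to create work unit stream", ioe);
    }
  }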
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
The class WikipediaSource, method getWorkunits:
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  Map<String, Iterable<WorkUnitState>> previousWorkUnits = state.getPreviousWorkUnitStatesByDatasetUrns();
  List<String> titles = new LinkedList<>(
      Splitter.on(",").omitEmptyStrings().splitToList(state.getProp(WikipediaExtractor.SOURCE_PAGE_TITLES)));

  Map<String, LongWatermark> prevHighWatermarks = Maps.newHashMap();
  for (Map.Entry<String, Iterable<WorkUnitState>> entry : previousWorkUnits.entrySet()) {
    Iterable<LongWatermark> watermarks =
        Iterables.transform(entry.getValue(), new Function<WorkUnitState, LongWatermark>() {
          @Override
          public LongWatermark apply(WorkUnitState wus) {
            return wus.getActualHighWatermark(LongWatermark.class);
          }
        });
    watermarks = Iterables.filter(watermarks, Predicates.notNull());
    List<LongWatermark> watermarkList = Lists.newArrayList(watermarks);
    if (watermarkList.size() > 0) {
      prevHighWatermarks.put(entry.getKey(), Collections.max(watermarkList));
    }
  }

  Extract extract = createExtract(TableType.SNAPSHOT_ONLY,
      state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY), "WikipediaOutput");

  List<WorkUnit> workUnits = Lists.newArrayList();
  for (String title : titles) {
    LongWatermark prevWatermark =
        prevHighWatermarks.containsKey(title) ? prevHighWatermarks.get(title) : new LongWatermark(-1);
    prevHighWatermarks.remove(title);
    WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(prevWatermark, new LongWatermark(-1)));
    workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, title);
    workUnits.add(workUnit);
  }

  for (Map.Entry<String, LongWatermark> nonProcessedDataset : prevHighWatermarks.entrySet()) {
    WorkUnit workUnit = WorkUnit.create(extract,
        new WatermarkInterval(nonProcessedDataset.getValue(), nonProcessedDataset.getValue()));
    workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, nonProcessedDataset.getKey());
    workUnits.add(workUnit);
  }

  return workUnits;
}
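On the consuming side, a hedged sketch of how an extractor could read back what this source stored in each work unit. The watermark accessor and the revision placeholder are illustrative assumptions, not the actual WikipediaExtractor code.

  // Inside an extractor (sketch): recover the page title and the previous high watermark.
  String pageTitle = workUnitState.getProp(ConfigurationKeys.DATASET_URN_KEY);
  LongWatermark lowWatermark = workUnitState.getWorkunit().getLowWatermark(LongWatermark.class);
  long latestFetchedRevision = lowWatermark.getValue();   // placeholder; in practice the newest revision fetched
  // ... fetch revisions of pageTitle newer than lowWatermark.getValue() ...
  workUnitState.setActualHighWatermark(new LongWatermark(latestFetchedRevision));  // feeds next run's prevHighWatermarks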