Example 11 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

From the class GobblinTestEventBusWriterTest, method testBuilder.

@Test
public void testBuilder() throws IOException, InterruptedException, TimeoutException {
    final String eventBusId = "/GobblinTestEventBusWriterTest/testBuilder";
    GobblinTestEventBusWriter.Builder writerBuilder = new GobblinTestEventBusWriter.Builder();
    WorkUnit wu = WorkUnit.createEmpty();
    wu.setProp(GobblinTestEventBusWriter.FULL_EVENTBUSID_KEY, eventBusId);
    writerBuilder.writeTo(Destination.of(DestinationType.HDFS, wu));
    Assert.assertEquals(writerBuilder.getEventBusId(), eventBusId);
    try (TestingEventBusAsserter asserter = new TestingEventBusAsserter(eventBusId)) {
        GobblinTestEventBusWriter writer = writerBuilder.build();
        writer.write("event1");
        writer.write("event2");
        asserter.assertNextValueEq("event1");
        asserter.assertNextValueEq("event2");
        Assert.assertEquals(writer.recordsWritten(), 2);
    }
}
Also used: WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), Test (org.testng.annotations.Test)
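The builder picks up the event bus id from the WorkUnit carried inside the Destination. Below is a minimal sketch of that underlying property flow; the key name and helper method are illustrative, not constants from the test above.

import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.source.workunit.WorkUnit;

// Properties set on a WorkUnit become visible downstream through the
// WorkUnitState that wraps it; "example.eventbus.id" is a hypothetical key.
public static void propertyFlowSketch() {
    WorkUnit wu = WorkUnit.createEmpty();
    wu.setProp("example.eventbus.id", "/example/bus");
    WorkUnitState wus = new WorkUnitState(wu);
    // WorkUnitState.getProp falls back to the wrapped WorkUnit's properties.
    String busId = wus.getProp("example.eventbus.id");  // "/example/bus"
}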

Example 12 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

From the class OldApiHadoopFileInputSourceTest, method testGetWorkUnitsAndExtractor.

@Test
public void testGetWorkUnitsAndExtractor() throws IOException, DataRecordException {
    OldApiHadoopFileInputSource<String, Text, LongWritable, Text> fileInputSource = new TestHadoopFileInputSource();
    List<WorkUnit> workUnitList = fileInputSource.getWorkunits(this.sourceState);
    Assert.assertEquals(workUnitList.size(), 1);
    WorkUnitState workUnitState = new WorkUnitState(workUnitList.get(0));
    Closer closer = Closer.create();
    try {
        OldApiHadoopFileInputExtractor<String, Text, LongWritable, Text> extractor = (OldApiHadoopFileInputExtractor<String, Text, LongWritable, Text>) fileInputSource.getExtractor(workUnitState);
        Text text = extractor.readRecord(null);
        Assert.assertEquals(text.toString(), TEXT);
        Assert.assertNull(extractor.readRecord(null));
    } catch (Throwable t) {
        throw closer.rethrow(t);
    } finally {
        closer.close();
    }
}
Also used: Closer (com.google.common.io.Closer), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), Text (org.apache.hadoop.io.Text), LongWritable (org.apache.hadoop.io.LongWritable), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), Test (org.testng.annotations.Test)
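The try/catch/finally shape above is Guava's documented Closer pattern: rethrow records the primary exception so close() can suppress secondary failures instead of masking them. Here is the same idiom in isolation with plain JDK types (the file path is a placeholder); note that resources are normally registered with the closer so it can close them, which the test above omits for its extractor.

import com.google.common.io.Closer;

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;

public static String readFirstLine(String path) throws IOException {
    Closer closer = Closer.create();
    try {
        // register() returns the resource and guarantees it is closed in close().
        BufferedReader reader = closer.register(new BufferedReader(new FileReader(path)));
        return reader.readLine();
    } catch (Throwable t) {
        // rethrow() stores t as the primary exception and rethrows it.
        throw closer.rethrow(t);
    } finally {
        closer.close();
    }
}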

Example 13 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

From the class HiveMaterializer, method queryResultMaterializationWorkUnit.

/**
 * Creates a work unit that materializes a query into a target table, using a staging table in between.
 * @param query the query to materialize.
 * @param storageFormat the format in which the target table should be written.
 * @param destinationTable the {@link StageableTableMetadata} specifying the staging and target table metadata.
 */
public static WorkUnit queryResultMaterializationWorkUnit(String query, HiveConverterUtils.StorageFormat storageFormat, StageableTableMetadata destinationTable) {
    WorkUnit workUnit = new WorkUnit();
    workUnit.setProp(MATERIALIZER_MODE_KEY, MaterializerMode.QUERY_RESULT_MATERIALIZATION.name());
    workUnit.setProp(STORAGE_FORMAT_KEY, storageFormat.name());
    workUnit.setProp(QUERY_RESULT_TO_MATERIALIZE_KEY, query);
    workUnit.setProp(STAGEABLE_TABLE_METADATA_KEY, HiveSource.GENERICS_AWARE_GSON.toJson(destinationTable));
    TaskUtils.setTaskFactoryClass(workUnit, HiveMaterializerTaskFactory.class);
    HiveTask.disableHiveWatermarker(workUnit);
    return workUnit;
}
Also used: HiveWorkUnit (org.apache.gobblin.data.management.conversion.hive.source.HiveWorkUnit), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)
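The last property above shows the general pattern this method relies on: a WorkUnit is a serializable property bag, so structured task configuration is stored as JSON and rebuilt later by the task factory. A hedged sketch of that round trip with a plain Gson instance follows; the key and the Metadata class are illustrative stand-ins, not Gobblin constants.

import com.google.gson.Gson;

import org.apache.gobblin.source.workunit.WorkUnit;

public class MetadataPropSketch {
    private static final Gson GSON = new Gson();

    // A hypothetical metadata POJO standing in for StageableTableMetadata.
    public static class Metadata {
        public String stagingTable;
        public String targetTable;
    }

    public static Metadata roundTrip(WorkUnit workUnit, Metadata metadata) {
        // Serialize the object into a String property on the work unit...
        workUnit.setProp("example.table.metadata", GSON.toJson(metadata));
        // ...and rebuild it later, e.g. inside the task factory.
        return GSON.fromJson(workUnit.getProp("example.table.metadata"), Metadata.class);
    }
}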

Example 14 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

From the class DatasetFinderSource, method createWorkUnitStream.

private Stream<WorkUnit> createWorkUnitStream(SourceState state) throws IOException {
    IterableDatasetFinder datasetsFinder = createDatasetsFinder(state);
    Stream<Dataset> datasetStream = datasetsFinder.getDatasetsStream(0, null);
    if (this.drilldownIntoPartitions) {
        return datasetStream.flatMap(dataset -> {
            if (dataset instanceof PartitionableDataset) {
                try {
                    return (Stream<PartitionableDataset.DatasetPartition>) ((PartitionableDataset) dataset).getPartitions(0, null);
                } catch (IOException ioe) {
                    log.error("Failed to get partitions for dataset " + dataset.getUrn(), ioe);
                    return Stream.empty();
                }
            } else {
                return Stream.of(new DatasetWrapper(dataset));
            }
        }).map(this::workUnitForPartitionInternal);
    } else {
        return datasetStream.map(this::workUnitForDataset);
    }
}
Also used: DatasetUtils (org.apache.gobblin.data.management.dataset.DatasetUtils), WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream), Getter (lombok.Getter), IOException (java.io.IOException), Collectors (java.util.stream.Collectors), PartitionableDataset (org.apache.gobblin.dataset.PartitionableDataset), IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder), List (java.util.List), Slf4j (lombok.extern.slf4j.Slf4j), Stream (java.util.stream.Stream), BasicWorkUnitStream (org.apache.gobblin.source.workunit.BasicWorkUnitStream), SourceState (org.apache.gobblin.configuration.SourceState), WorkUnitStreamSource (org.apache.gobblin.source.WorkUnitStreamSource), HadoopUtils (org.apache.gobblin.util.HadoopUtils), AllArgsConstructor (lombok.AllArgsConstructor), Dataset (org.apache.gobblin.dataset.Dataset), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)
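When drilldownIntoPartitions is set, each dataset is expanded into its partitions with flatMap, and a dataset whose partitions cannot be listed contributes an empty stream rather than failing the whole listing. A sketch of the same expand-or-skip shape with plain JDK types, so it stands alone:

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

public static List<String> drillDown(Stream<String> datasets) {
    return datasets
        .flatMap(dataset -> {
            try {
                // Expand each "dataset" into its "partitions".
                return Stream.of(dataset.split(","));
            } catch (RuntimeException e) {
                // Skip datasets that fail to expand instead of aborting the stream.
                return Stream.empty();
            }
        })
        .collect(Collectors.toList());
}

// drillDown(Stream.of("a,b", "c")) returns [a, b, c].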

Example 15 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

From the class WikipediaSource, method getWorkunits.

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    Map<String, Iterable<WorkUnitState>> previousWorkUnits = state.getPreviousWorkUnitStatesByDatasetUrns();
    List<String> titles = new LinkedList<>(Splitter.on(",").omitEmptyStrings().splitToList(state.getProp(WikipediaExtractor.SOURCE_PAGE_TITLES)));
    Map<String, LongWatermark> prevHighWatermarks = Maps.newHashMap();
    for (Map.Entry<String, Iterable<WorkUnitState>> entry : previousWorkUnits.entrySet()) {
        Iterable<LongWatermark> watermarks = Iterables.transform(entry.getValue(), new Function<WorkUnitState, LongWatermark>() {

            @Override
            public LongWatermark apply(WorkUnitState wus) {
                return wus.getActualHighWatermark(LongWatermark.class);
            }
        });
        watermarks = Iterables.filter(watermarks, Predicates.notNull());
        List<LongWatermark> watermarkList = Lists.newArrayList(watermarks);
        if (watermarkList.size() > 0) {
            prevHighWatermarks.put(entry.getKey(), Collections.max(watermarkList));
        }
    }
    Extract extract = createExtract(TableType.SNAPSHOT_ONLY, state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY), "WikipediaOutput");
    List<WorkUnit> workUnits = Lists.newArrayList();
    for (String title : titles) {
        LongWatermark prevWatermark = prevHighWatermarks.containsKey(title) ? prevHighWatermarks.get(title) : new LongWatermark(-1);
        prevHighWatermarks.remove(title);
        WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(prevWatermark, new LongWatermark(-1)));
        workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, title);
        workUnits.add(workUnit);
    }
    for (Map.Entry<String, LongWatermark> nonProcessedDataset : prevHighWatermarks.entrySet()) {
        WorkUnit workUnit = WorkUnit.create(extract, new WatermarkInterval(nonProcessedDataset.getValue(), nonProcessedDataset.getValue()));
        workUnit.setProp(ConfigurationKeys.DATASET_URN_KEY, nonProcessedDataset.getKey());
        workUnits.add(workUnit);
    }
    return workUnits;
}
Also used: WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), Extract (org.apache.gobblin.source.workunit.Extract), LinkedList (java.util.LinkedList), WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit), Map (java.util.Map), LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)
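The source encodes each title's previous high watermark into the new WorkUnit's WatermarkInterval. Below is a hedged sketch of the consumer side, where an extractor reads that interval back and reports the actual high watermark that getActualHighWatermark(LongWatermark.class) will see on the next run; treat the exact getter signatures as assumptions based on the Gobblin watermark API.

import org.apache.gobblin.configuration.WorkUnitState;
import org.apache.gobblin.source.extractor.extract.LongWatermark;
import org.apache.gobblin.source.workunit.WorkUnit;

public static void finishWorkUnit(WorkUnitState wus, long lastRevisionSeen) {
    WorkUnit workUnit = wus.getWorkunit();
    // The low watermark is the previous run's actual high watermark (or -1 for a fresh title).
    LongWatermark low = workUnit.getLowWatermark(LongWatermark.class);
    if (lastRevisionSeen > low.getValue()) {
        // This value feeds prevHighWatermarks in getWorkunits on the next run.
        wus.setActualHighWatermark(new LongWatermark(lastRevisionSeen));
    }
}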

Aggregations

WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 133 uses
Test (org.testng.annotations.Test): 59 uses
SourceState (org.apache.gobblin.configuration.SourceState): 40 uses
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 40 uses
MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 35 uses
Extract (org.apache.gobblin.source.workunit.Extract): 24 uses
Path (org.apache.hadoop.fs.Path): 19 uses
State (org.apache.gobblin.configuration.State): 13 uses
IOException (java.io.IOException): 11 uses
ArrayList (java.util.ArrayList): 10 uses
Closer (com.google.common.io.Closer): 9 uses
Properties (java.util.Properties): 9 uses
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 8 uses
List (java.util.List): 7 uses
Table (org.apache.hadoop.hive.ql.metadata.Table): 7 uses
ImmutableMap (com.google.common.collect.ImmutableMap): 6 uses
Config (com.typesafe.config.Config): 6 uses
File (java.io.File): 6 uses
IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder): 6 uses
WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream): 6 uses