Example 16 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class HiveMaterializerSource method getWorkunits.

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    try {
        FileSystem fs = HadoopUtils.getSourceFileSystem(state);
        Config config = ConfigUtils.propertiesToConfig(state.getProperties());
        if (state.contains(COPY_TABLE_KEY)) {
            HiveDataset dataset = getHiveDataset(state.getProp(COPY_TABLE_KEY), fs, state);
            WorkUnit workUnit = HiveMaterializer.tableCopyWorkUnit(dataset, new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
            HiveTask.disableHiveWatermarker(workUnit);
            return Lists.newArrayList(workUnit);
        } else if (state.contains(MATERIALIZE_VIEW)) {
            HiveDataset dataset = getHiveDataset(state.getProp(MATERIALIZE_VIEW), fs, state);
            WorkUnit workUnit = HiveMaterializer.viewMaterializationWorkUnit(dataset, getOutputStorageFormat(state), new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
            HiveTask.disableHiveWatermarker(workUnit);
            return Lists.newArrayList(workUnit);
        } else if (state.contains(MATERIALIZE_QUERY)) {
            String query = state.getProp(MATERIALIZE_QUERY);
            WorkUnit workUnit = HiveMaterializer.queryResultMaterializationWorkUnit(query, getOutputStorageFormat(state), new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), null));
            HiveTask.disableHiveWatermarker(workUnit);
            return Lists.newArrayList(workUnit);
        }
    } catch (IOException ioe) {
        throw new RuntimeException(ioe);
    }
    throw new RuntimeException(String.format("Must specify either %s, %s, or %s.", COPY_TABLE_KEY, MATERIALIZE_QUERY, MATERIALIZE_VIEW));
}
Also used : Config(com.typesafe.config.Config) FileSystem(org.apache.hadoop.fs.FileSystem) HiveDataset(org.apache.gobblin.data.management.copy.hive.HiveDataset) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) IOException(java.io.IOException) StageableTableMetadata(org.apache.gobblin.data.management.conversion.hive.entities.StageableTableMetadata)
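
The three branches above correspond to three job configurations: copying an existing Hive table, materializing a view, or materializing the result of an arbitrary query. The sketch below shows how a job might drive the table-copy branch; it assumes COPY_TABLE_KEY is accessible as a public constant, that the rest of the materializer configuration (destination metadata under HIVE_MATERIALIZER_SOURCE_PREFIX, filesystem, metastore) is in place, and that the db.table name is only illustrative.

// Hedged sketch: driving the table-copy branch of HiveMaterializerSource.
SourceState state = new SourceState();
state.setProp(HiveMaterializerSource.COPY_TABLE_KEY, "my_db.my_table"); // hypothetical db.table value
List<WorkUnit> workUnits = new HiveMaterializerSource().getWorkunits(state);
// Exactly one work unit is returned, set up for a straight table copy
// with the Hive watermark workflow disabled.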

Example 17 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class KafkaSimpleStreamingTest method getStreamingExtractor.

private KafkaSimpleStreamingExtractor<String, byte[]> getStreamingExtractor(String topic) {
    _kafkaTestHelper.provisionTopic(topic);
    List<WorkUnit> lWu = getWorkUnits(topic);
    WorkUnit wU = lWu.get(0);
    WorkUnitState wSU = new WorkUnitState(wU, new State());
    wSU.setProp(ConfigurationKeys.KAFKA_BROKERS, "localhost:" + _kafkaTestHelper.getKafkaServerPort());
    wSU.setProp(KafkaSimpleStreamingSource.TOPIC_WHITELIST, topic);
    wSU.setProp(ConfigurationKeys.JOB_NAME_KEY, topic);
    wSU.setProp(KafkaSimpleStreamingSource.TOPIC_KEY_DESERIALIZER, "org.apache.kafka.common.serialization.StringDeserializer");
    wSU.setProp(KafkaSimpleStreamingSource.TOPIC_VALUE_DESERIALIZER, "org.apache.kafka.common.serialization.ByteArrayDeserializer");
    // Create an extractor
    return new KafkaSimpleStreamingExtractor<String, byte[]>(wSU);
}
Also used : KafkaSimpleStreamingExtractor(org.apache.gobblin.source.extractor.extract.kafka.KafkaSimpleStreamingExtractor) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) SourceState(org.apache.gobblin.configuration.SourceState) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
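
The broker and deserializer properties set on the WorkUnitState mirror ordinary Kafka consumer configuration. As a rough sketch of what the extractor is assumed to build internally (plain Kafka client API; the group id is hypothetical):

// Hedged sketch: the WorkUnitState properties above map onto standard Kafka consumer settings.
Properties consumerProps = new Properties();
consumerProps.put("bootstrap.servers", "localhost:9092"); // the test points this at the embedded broker's port
consumerProps.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
consumerProps.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
consumerProps.put("group.id", "gobblin-streaming-test"); // hypothetical group id
KafkaConsumer<String, byte[]> consumer = new KafkaConsumer<>(consumerProps);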

Example 18 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class KafkaSimpleStreamingSource method getWorkunits.

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    Config config = ConfigUtils.propertiesToConfig(state.getProperties());
    Consumer<String, byte[]> consumer = getKafkaConsumer(config);
    LOG.debug("Consumer is {}", consumer);
    // TODO: fix this to use the new API when KafkaWrapper is fixed
    String topic = ConfigUtils.getString(config, TOPIC_WHITELIST, StringUtils.EMPTY);
    List<WorkUnit> workUnits = new ArrayList<WorkUnit>();
    List<PartitionInfo> topicPartitions = consumer.partitionsFor(topic);
    LOG.info("Partition count is {}", topicPartitions.size());
    for (PartitionInfo topicPartition : topicPartitions) {
        Extract extract = this.createExtract(DEFAULT_TABLE_TYPE, DEFAULT_NAMESPACE_NAME, topicPartition.topic());
        LOG.info("Partition info is {}", topicPartition);
        WorkUnit workUnit = WorkUnit.create(extract);
        setTopicNameInState(workUnit, topicPartition.topic());
        workUnit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, topicPartition.topic());
        setPartitionId(workUnit, topicPartition.partition());
        workUnits.add(workUnit);
    }
    return workUnits;
}
Also used : Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Extract(org.apache.gobblin.source.workunit.Extract) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) PartitionInfo(org.apache.kafka.common.PartitionInfo)
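
Each partition of the topic becomes its own WorkUnit, so the task count matches the partition count. The sketch below shows what a downstream task is assumed to do with the topic name recorded in one of those work units: pin a consumer to that single partition. It reuses the consumerProps from the sketch under Example 17; the partition id is hypothetical because the property written by setPartitionId(...) is internal to the source.

// Hedged sketch: pinning a Kafka consumer to the partition described by one work unit.
String topicName = workUnit.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY);
int partitionId = 0; // hypothetical; in practice read back from the work unit
KafkaConsumer<String, byte[]> consumer = new KafkaConsumer<>(consumerProps);
consumer.assign(Collections.singletonList(new TopicPartition(topicName, partitionId)));
ConsumerRecords<String, byte[]> records = consumer.poll(1000L);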

Example 19 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class Kafka09JsonIntegrationTest method testHappyPath.

@Test
public void testHappyPath() throws IOException, DataRecordException {
    String topic = "testKafka09JsonSource";
    kafkaTestHelper.provisionTopic(topic);
    SourceState state = createSourceState(topic);
    // Produce a record
    state.setProp(KAFKA_PRODUCER_CONFIG_PREFIX + "bootstrap.servers", "localhost:" + kafkaTestHelper.getKafkaServerPort());
    state.setProp(KAFKA_TOPIC, topic);
    Destination destination = Destination.of(Destination.DestinationType.KAFKA, state);
    Kafka09JsonObjectWriterBuilder writerBuilder = new Kafka09JsonObjectWriterBuilder();
    writerBuilder.writeTo(destination);
    DataWriter<JsonObject> writer = writerBuilder.build();
    final String json = "{\"number\":27}";
    JsonObject record = gson.fromJson(json, JsonObject.class);
    writer.write(record);
    writer.flush();
    writer.close();
    Kafka09JsonSource source = new Kafka09JsonSource();
    List<WorkUnit> workUnitList = source.getWorkunits(state);
    // Test the right value serializer is set
    Assert.assertEquals(state.getProp(Kafka09ConsumerClient.GOBBLIN_CONFIG_VALUE_DESERIALIZER_CLASS_KEY), Kafka09JsonSource.KafkaGsonDeserializer.class.getName());
    // Test there is only one non-empty work unit
    MultiWorkUnitUnpackingIterator iterator = new MultiWorkUnitUnpackingIterator(workUnitList.iterator());
    Assert.assertTrue(iterator.hasNext());
    WorkUnit workUnit = iterator.next();
    Assert.assertEquals(workUnit.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY), topic);
    Assert.assertFalse(iterator.hasNext());
    // Test extractor
    WorkUnitState workUnitState = new WorkUnitState(workUnit, state);
    final String jsonSchema = "[{\"columnName\":\"number\",\"comment\":\"\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"int\"}}]";
    workUnitState.setProp("source.kafka.json.schema", jsonSchema);
    Extractor<JsonArray, JsonObject> extractor = source.getExtractor(workUnitState);
    Assert.assertEquals(extractor.getSchema().toString(), jsonSchema);
    Assert.assertEquals(extractor.readRecord(null).toString(), json);
}
Also used : Destination(org.apache.gobblin.writer.Destination) SourceState(org.apache.gobblin.configuration.SourceState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) JsonObject(com.google.gson.JsonObject) JsonArray(com.google.gson.JsonArray) Kafka09JsonSource(org.apache.gobblin.source.extractor.extract.kafka.Kafka09JsonSource) MultiWorkUnitUnpackingIterator(org.apache.gobblin.runtime.util.MultiWorkUnitUnpackingIterator) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Kafka09JsonObjectWriterBuilder(org.apache.gobblin.kafka.writer.Kafka09JsonObjectWriterBuilder) Test(org.testng.annotations.Test)
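
The MultiWorkUnitUnpackingIterator is used because Kafka sources typically bin several partitions into MultiWorkUnit containers, while the test only cares about the leaf work units. A minimal sketch of that flattening, using only classes already referenced in this section:

// Minimal sketch: the iterator yields leaf work units, not the MultiWorkUnit containers.
MultiWorkUnit bin = MultiWorkUnit.createEmpty();
bin.addWorkUnit(WorkUnit.createEmpty());
bin.addWorkUnit(WorkUnit.createEmpty());
List<WorkUnit> packed = new ArrayList<>();
packed.add(bin);
Iterator<WorkUnit> flat = new MultiWorkUnitUnpackingIterator(packed.iterator());
int leaves = 0;
while (flat.hasNext()) {
    flat.next();
    leaves++;
}
// leaves == 2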

Example 20 with WorkUnit

use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

the class AbstractJobLauncher method cleanLeftoverStagingData.

/**
 * Cleanup the left-over staging data possibly from the previous run of the job that may have failed
 * and not cleaned up its staging data.
 *
 * Property {@link ConfigurationKeys#CLEANUP_STAGING_DATA_PER_TASK} controls whether to cleanup
 * staging data per task, or to cleanup entire job's staging data at once.
 *
 * Staging data will not be cleaned if the job has unfinished {@link CommitSequence}s.
 */
private void cleanLeftoverStagingData(WorkUnitStream workUnits, JobState jobState) throws JobException {
    if (jobState.getPropAsBoolean(ConfigurationKeys.CLEANUP_STAGING_DATA_BY_INITIALIZER, false)) {
        // Clean up will be done by initializer.
        return;
    }
    try {
        if (!canCleanStagingData(jobState)) {
            LOG.error("Job " + jobState.getJobName() + " has unfinished commit sequences. Will not clean up staging data.");
            return;
        }
    } catch (IOException e) {
        throw new JobException("Failed to check unfinished commit sequences", e);
    }
    try {
        if (this.jobContext.shouldCleanupStagingDataPerTask()) {
            if (workUnits.isSafeToMaterialize()) {
                Closer closer = Closer.create();
                Map<String, ParallelRunner> parallelRunners = Maps.newHashMap();
                try {
                    for (WorkUnit workUnit : JobLauncherUtils.flattenWorkUnits(workUnits.getMaterializedWorkUnitCollection())) {
                        JobLauncherUtils.cleanTaskStagingData(new WorkUnitState(workUnit, jobState), LOG, closer, parallelRunners);
                    }
                } catch (Throwable t) {
                    throw closer.rethrow(t);
                } finally {
                    closer.close();
                }
            } else {
                throw new RuntimeException("Work unit streams do not support cleaning staging data per task.");
            }
        } else {
            if (jobState.getPropAsBoolean(ConfigurationKeys.CLEANUP_OLD_JOBS_DATA, ConfigurationKeys.DEFAULT_CLEANUP_OLD_JOBS_DATA)) {
                JobLauncherUtils.cleanUpOldJobData(jobState, LOG, jobContext.getStagingDirProvided(), jobContext.getOutputDirProvided());
            }
            JobLauncherUtils.cleanJobStagingData(jobState, LOG);
        }
    } catch (Throwable t) {
        // Catch Throwable instead of just IOException to make sure failure of this won't affect the current run
        LOG.error("Failed to clean leftover staging data", t);
    }
}
Also used : Closer(com.google.common.io.Closer) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) IOException(java.io.IOException) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) ParallelRunner(org.apache.gobblin.util.ParallelRunner)
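
Whether leftover staging data is removed per task or for the whole job is driven entirely by job properties. A hedged sketch of the switches consulted above, assuming JobState exposes a (jobName, jobId) constructor; the names are placeholders:

// Hedged sketch: configuration switches consulted by cleanLeftoverStagingData.
JobState jobState = new JobState("myJob", "job_myJob_0"); // hypothetical job name and id
// Skip cleanup here entirely and leave it to the initializer:
jobState.setProp(ConfigurationKeys.CLEANUP_STAGING_DATA_BY_INITIALIZER, Boolean.TRUE);
// Or clean staging data work unit by work unit (requires a materializable work-unit stream):
jobState.setProp(ConfigurationKeys.CLEANUP_STAGING_DATA_PER_TASK, Boolean.TRUE);
// With both left at false, the whole job's staging directory is cleaned in one pass.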

Aggregations

WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 133 usages
Test (org.testng.annotations.Test): 59 usages
SourceState (org.apache.gobblin.configuration.SourceState): 40 usages
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 40 usages
MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 35 usages
Extract (org.apache.gobblin.source.workunit.Extract): 24 usages
Path (org.apache.hadoop.fs.Path): 19 usages
State (org.apache.gobblin.configuration.State): 13 usages
IOException (java.io.IOException): 11 usages
ArrayList (java.util.ArrayList): 10 usages
Closer (com.google.common.io.Closer): 9 usages
Properties (java.util.Properties): 9 usages
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 8 usages
List (java.util.List): 7 usages
Table (org.apache.hadoop.hive.ql.metadata.Table): 7 usages
ImmutableMap (com.google.common.collect.ImmutableMap): 6 usages
Config (com.typesafe.config.Config): 6 usages
File (java.io.File): 6 usages
IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder): 6 usages
WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream): 6 usages