Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
The class HiveMaterializerSource, method getWorkunits.
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    FileSystem fs = HadoopUtils.getSourceFileSystem(state);
    Config config = ConfigUtils.propertiesToConfig(state.getProperties());
    if (state.contains(COPY_TABLE_KEY)) {
      // Copy an existing Hive table.
      HiveDataset dataset = getHiveDataset(state.getProp(COPY_TABLE_KEY), fs, state);
      WorkUnit workUnit = HiveMaterializer.tableCopyWorkUnit(dataset,
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    } else if (state.contains(MATERIALIZE_VIEW)) {
      // Materialize a Hive view into a table.
      HiveDataset dataset = getHiveDataset(state.getProp(MATERIALIZE_VIEW), fs, state);
      WorkUnit workUnit = HiveMaterializer.viewMaterializationWorkUnit(dataset, getOutputStorageFormat(state),
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), dataset.getTable()), null);
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    } else if (state.contains(MATERIALIZE_QUERY)) {
      // Materialize the result of an arbitrary query.
      String query = state.getProp(MATERIALIZE_QUERY);
      WorkUnit workUnit = HiveMaterializer.queryResultMaterializationWorkUnit(query, getOutputStorageFormat(state),
          new StageableTableMetadata(config.getConfig(HIVE_MATERIALIZER_SOURCE_PREFIX), null));
      HiveTask.disableHiveWatermarker(workUnit);
      return Lists.newArrayList(workUnit);
    }
  } catch (IOException ioe) {
    throw new RuntimeException(ioe);
  }
  throw new RuntimeException(
      String.format("Must specify either %s, %s, or %s.", COPY_TABLE_KEY, MATERIALIZE_QUERY, MATERIALIZE_VIEW));
}
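For orientation, here is a hedged sketch of how a job might drive this source in its copy-table mode. The standalone setup, the visibility of COPY_TABLE_KEY, and the HiveMaterializerSource package path are assumptions, not taken from the snippet above; a real job would also need Hive metastore and filesystem configuration under HIVE_MATERIALIZER_SOURCE_PREFIX.

import java.util.List;

import org.apache.gobblin.configuration.SourceState;
import org.apache.gobblin.source.workunit.WorkUnit;
// Package path assumed:
import org.apache.gobblin.data.management.conversion.hive.materializer.HiveMaterializerSource;

public class HiveMaterializerSourceSketch {
  public static void main(String[] args) throws Exception {
    SourceState state = new SourceState();
    // Assumption: COPY_TABLE_KEY is an accessible constant naming a "<db>.<table>" property.
    state.setProp(HiveMaterializerSource.COPY_TABLE_KEY, "mydb.my_table");
    // In practice, Hive and FileSystem settings must also be present in the state.
    List<WorkUnit> workUnits = new HiveMaterializerSource().getWorkunits(state);
    // The copy-table branch above returns exactly one work unit.
    System.out.println(workUnits.size());
  }
}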
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
The class KafkaSimpleStreamingTest, method getStreamingExtractor.
private KafkaSimpleStreamingExtractor<String, byte[]> getStreamingExtractor(String topic) {
  _kafkaTestHelper.provisionTopic(topic);
  List<WorkUnit> lWu = getWorkUnits(topic);
  WorkUnit wU = lWu.get(0);
  WorkUnitState wSU = new WorkUnitState(wU, new State());
  wSU.setProp(ConfigurationKeys.KAFKA_BROKERS, "localhost:" + _kafkaTestHelper.getKafkaServerPort());
  wSU.setProp(KafkaSimpleStreamingSource.TOPIC_WHITELIST, topic);
  wSU.setProp(ConfigurationKeys.JOB_NAME_KEY, topic);
  wSU.setProp(KafkaSimpleStreamingSource.TOPIC_KEY_DESERIALIZER, "org.apache.kafka.common.serialization.StringDeserializer");
  wSU.setProp(KafkaSimpleStreamingSource.TOPIC_VALUE_DESERIALIZER, "org.apache.kafka.common.serialization.ByteArrayDeserializer");
  // Create an extractor
  return new KafkaSimpleStreamingExtractor<String, byte[]>(wSU);
}
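A hedged sketch of how a test might consume this helper; the assertion style mirrors the TestNG usage elsewhere on this page, and the streaming read call is an assumption about this extractor rather than a documented API:

// Hypothetical follow-up inside a test method:
KafkaSimpleStreamingExtractor<String, byte[]> extractor = getStreamingExtractor("mySimpleStreamingTopic");
Assert.assertNotNull(extractor);
// After producing to the topic, records could be pulled through the Extractor
// contract (e.g. a readRecordEnvelope()-style call in streaming-capable builds; assumed).
extractor.close();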
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
The class KafkaSimpleStreamingSource, method getWorkunits.
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  Config config = ConfigUtils.propertiesToConfig(state.getProperties());
  Consumer<String, byte[]> consumer = getKafkaConsumer(config);
  LOG.debug("Consumer is {}", consumer);
  // TODO: fix this to use the new API when KafkaWrapper is fixed
  String topic = ConfigUtils.getString(config, TOPIC_WHITELIST, StringUtils.EMPTY);
  List<WorkUnit> workUnits = new ArrayList<WorkUnit>();
  List<PartitionInfo> topicPartitions = consumer.partitionsFor(topic);
  LOG.info("Partition count is {}", topicPartitions.size());
  // Create one work unit per Kafka partition of the topic.
  for (PartitionInfo topicPartition : topicPartitions) {
    Extract extract = this.createExtract(DEFAULT_TABLE_TYPE, DEFAULT_NAMESPACE_NAME, topicPartition.topic());
    LOG.info("Partition info is {}", topicPartition);
    WorkUnit workUnit = WorkUnit.create(extract);
    setTopicNameInState(workUnit, topicPartition.topic());
    workUnit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, topicPartition.topic());
    setPartitionId(workUnit, topicPartition.partition());
    workUnits.add(workUnit);
  }
  return workUnits;
}
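For context, a hedged usage sketch; the broker address and topic are placeholders, and the generic parameters on the source are assumed to mirror the <String, byte[]> extractor shown earlier:

// Hypothetical driver for the source above:
SourceState state = new SourceState();
state.setProp(ConfigurationKeys.KAFKA_BROKERS, "localhost:9092"); // placeholder broker
state.setProp(KafkaSimpleStreamingSource.TOPIC_WHITELIST, "myTopic"); // placeholder topic
KafkaSimpleStreamingSource<String, byte[]> source = new KafkaSimpleStreamingSource<>(); // generics assumed
List<WorkUnit> workUnits = source.getWorkunits(state);
// Expect one work unit per partition of "myTopic".
for (WorkUnit wu : workUnits) {
  System.out.println(wu.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY));
}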
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
The class Kafka09JsonIntegrationTest, method testHappyPath.
@Test
public void testHappyPath() throws IOException, DataRecordException {
  String topic = "testKafka09JsonSource";
  kafkaTestHelper.provisionTopic(topic);
  SourceState state = createSourceState(topic);
  // Produce a record
  state.setProp(KAFKA_PRODUCER_CONFIG_PREFIX + "bootstrap.servers", "localhost:" + kafkaTestHelper.getKafkaServerPort());
  state.setProp(KAFKA_TOPIC, topic);
  Destination destination = Destination.of(Destination.DestinationType.KAFKA, state);
  Kafka09JsonObjectWriterBuilder writerBuilder = new Kafka09JsonObjectWriterBuilder();
  writerBuilder.writeTo(destination);
  DataWriter<JsonObject> writer = writerBuilder.build();
  final String json = "{\"number\":27}";
  JsonObject record = gson.fromJson(json, JsonObject.class);
  writer.write(record);
  writer.flush();
  writer.close();
  Kafka09JsonSource source = new Kafka09JsonSource();
  List<WorkUnit> workUnitList = source.getWorkunits(state);
  // Test that the right value deserializer is set
  Assert.assertEquals(state.getProp(Kafka09ConsumerClient.GOBBLIN_CONFIG_VALUE_DESERIALIZER_CLASS_KEY),
      Kafka09JsonSource.KafkaGsonDeserializer.class.getName());
  // Test that there is only one non-empty work unit
  MultiWorkUnitUnpackingIterator iterator = new MultiWorkUnitUnpackingIterator(workUnitList.iterator());
  Assert.assertTrue(iterator.hasNext());
  WorkUnit workUnit = iterator.next();
  Assert.assertEquals(workUnit.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY), topic);
  Assert.assertFalse(iterator.hasNext());
  // Test the extractor
  WorkUnitState workUnitState = new WorkUnitState(workUnit, state);
  final String jsonSchema = "[{\"columnName\":\"number\",\"comment\":\"\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"int\"}}]";
  workUnitState.setProp("source.kafka.json.schema", jsonSchema);
  Extractor<JsonArray, JsonObject> extractor = source.getExtractor(workUnitState);
  Assert.assertEquals(extractor.getSchema().toString(), jsonSchema);
  Assert.assertEquals(extractor.readRecord(null).toString(), json);
}
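The jsonSchema string above follows the JSON column-schema convention this source expects. As a purely illustrative extension (the second field name is invented for the example), a two-column schema would look like:

// Illustrative only: a two-column variant of the schema used in the test.
final String twoColumnSchema =
    "[{\"columnName\":\"number\",\"comment\":\"\",\"isNullable\":\"false\",\"dataType\":{\"type\":\"int\"}},"
        + "{\"columnName\":\"name\",\"comment\":\"\",\"isNullable\":\"true\",\"dataType\":{\"type\":\"string\"}}]";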
Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.
The class AbstractJobLauncher, method cleanLeftoverStagingData.
/**
 * Clean up any staging data left over from a previous run of the job that may have failed
 * before cleaning up its staging data.
 *
 * Property {@link ConfigurationKeys#CLEANUP_STAGING_DATA_PER_TASK} controls whether to clean up
 * staging data per task, or to clean up the entire job's staging data at once.
 *
 * Staging data will not be cleaned if the job has unfinished {@link CommitSequence}s.
 */
private void cleanLeftoverStagingData(WorkUnitStream workUnits, JobState jobState) throws JobException {
  if (jobState.getPropAsBoolean(ConfigurationKeys.CLEANUP_STAGING_DATA_BY_INITIALIZER, false)) {
    // Clean up will be done by initializer.
    return;
  }
  try {
    if (!canCleanStagingData(jobState)) {
      LOG.error("Job " + jobState.getJobName() + " has unfinished commit sequences. Will not clean up staging data.");
      return;
    }
  } catch (IOException e) {
    throw new JobException("Failed to check unfinished commit sequences", e);
  }
  try {
    if (this.jobContext.shouldCleanupStagingDataPerTask()) {
      if (workUnits.isSafeToMaterialize()) {
        Closer closer = Closer.create();
        Map<String, ParallelRunner> parallelRunners = Maps.newHashMap();
        try {
          for (WorkUnit workUnit : JobLauncherUtils.flattenWorkUnits(workUnits.getMaterializedWorkUnitCollection())) {
            JobLauncherUtils.cleanTaskStagingData(new WorkUnitState(workUnit, jobState), LOG, closer, parallelRunners);
          }
        } catch (Throwable t) {
          throw closer.rethrow(t);
        } finally {
          closer.close();
        }
      } else {
        throw new RuntimeException("Work unit streams do not support cleaning staging data per task.");
      }
    } else {
      if (jobState.getPropAsBoolean(ConfigurationKeys.CLEANUP_OLD_JOBS_DATA, ConfigurationKeys.DEFAULT_CLEANUP_OLD_JOBS_DATA)) {
        JobLauncherUtils.cleanUpOldJobData(jobState, LOG, jobContext.getStagingDirProvided(), jobContext.getOutputDirProvided());
      }
      JobLauncherUtils.cleanJobStagingData(jobState, LOG);
    }
  } catch (Throwable t) {
    // Catch Throwable instead of just IOException to make sure failure of this won't affect the current run
    LOG.error("Failed to clean leftover staging data", t);
  }
}
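As a hedged illustration of the switches this method consults (the constant names appear in the code and javadoc above; their string values and defaults are not shown there):

// Sketch: configuration flags read by cleanLeftoverStagingData.
jobState.setProp(ConfigurationKeys.CLEANUP_STAGING_DATA_BY_INITIALIZER, true); // skip this method entirely
jobState.setProp(ConfigurationKeys.CLEANUP_STAGING_DATA_PER_TASK, true); // take the per-task branch
jobState.setProp(ConfigurationKeys.CLEANUP_OLD_JOBS_DATA, true); // also purge old job data in the job-level branch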