Use of org.apache.gobblin.source.workunit.WorkUnit in the Apache project incubator-gobblin.
Class MultiWorkUnitUnpackingIterator, method next.
/**
 * Returns the next work unit from this iterator.
 *
 * <p>A seek is always performed first. If the pending element is a {@link MultiWorkUnit}, the value
 * is pulled from the current nested iterator instead of returning the {@link MultiWorkUnit} itself.
 * The {@code needSeek} flag is set before returning.
 */
@Override
public WorkUnit next() {
  // Seek defensively: the caller may have skipped hasNext().
  seekNext();
  final WorkUnit result = (nextWu instanceof MultiWorkUnit) ? this.currentIterator.next() : nextWu;
  needSeek = true;
  return result;
}
Use of org.apache.gobblin.source.workunit.WorkUnit in the Apache project incubator-gobblin.
Class NoopTask, method noopWorkunit.
/**
 * Creates an empty {@link WorkUnit} and registers {@link Factory} as its task factory class.
 *
 * @return A {@link WorkUnit} that will run a {@link NoopTask}.
 */
public static WorkUnit noopWorkunit() {
  // WorkUnit.createEmpty() is the static factory for empty work units; the no-arg
  // constructor it replaces here is deprecated in Gobblin.
  WorkUnit workUnit = WorkUnit.createEmpty();
  TaskUtils.setTaskFactoryClass(workUnit, Factory.class);
  return workUnit;
}
Use of org.apache.gobblin.source.workunit.WorkUnit in the Apache project incubator-gobblin.
Class SingleTask, method run.
/**
 * Runs this single task end to end.
 *
 * <p>Steps: obtain the work units and the job state, derive the job config from the job state,
 * open a top-level broker at the {@code GLOBAL} scope, build a task attempt over the work units
 * with a job broker obtained from that top-level broker, and run the attempt with the
 * {@code IMMEDIATE} commit policy. The top-level broker is closed when the try block exits.
 *
 * @throws IOException propagated from the underlying calls
 * @throws InterruptedException propagated from the underlying calls
 */
public void run() throws IOException, InterruptedException {
  final List<WorkUnit> units = getWorkUnits();
  final JobState state = getJobState();
  final Config config = getConfigFromJobState(state);

  _logger.debug("SingleTask.run: jobId {} workUnitFilePath {} jobStateFilePath {} jobState {} jobConfig {}", _jobId, _workUnitFilePath, _jobStateFilePath, state, config);

  try (SharedResourcesBroker<GobblinScopeTypes> topLevelBroker =
      SharedResourcesBrokerFactory.createDefaultTopLevelBroker(config, GobblinScopeTypes.GLOBAL.defaultScopeInstance())) {
    // The job broker is resolved before the attempt is built, exactly once per run.
    final SharedResourcesBroker<GobblinScopeTypes> brokerForJob = getJobBroker(state, topLevelBroker);
    _taskattempt = _taskAttemptBuilder.build(units.iterator(), _jobId, state, brokerForJob);
    _taskattempt.runAndOptionallyCommitTaskAttempt(GobblinMultiTaskAttempt.CommitPolicy.IMMEDIATE);
  }
}
Use of org.apache.gobblin.source.workunit.WorkUnit in the Apache project incubator-gobblin.
Class PartitionLevelWatermarker, method onGetWorkunitsEnd.
/**
 * Adds watermark workunits to <code>workunits</code>. A watermark workunit is a dummy workunit that is skipped by extractor/converter/writer.
 * It stores a map of watermarks. The map has one entry per partition with partition watermark as value.
 * <ul>
 * <li>Add one NoOp watermark workunit for each {@link Table}</li>
 * <li>The workunit has an identifier property {@link #IS_WATERMARK_WORKUNIT_KEY} set to true.</li>
 * <li>Watermarks for all {@link Partition}s that belong to this {@link Table} are added as {@link Map}</li>
 * <li>A maximum of {@link #maxPartitionsPerDataset} are persisted. Watermarks are ordered by most recently modified {@link Partition}s</li>
 * </ul>
 * Tables that are not partitioned are skipped. Any {@link IOException} or {@link TException} raised while
 * talking to the metastore is rethrown wrapped in a {@link RuntimeException}.
 *
 * {@inheritDoc}
 * @param workunits the list that the generated watermark workunits are appended to
 * @see org.apache.gobblin.data.management.conversion.hive.watermarker.HiveSourceWatermarker#onGetWorkunitsEnd(java.util.List)
 */
@Override
public void onGetWorkunitsEnd(List<WorkUnit> workunits) {
  try (AutoReturnableObject<IMetaStoreClient> client = this.pool.getClient()) {
    for (Map.Entry<String, Map<String, Long>> tableWatermark : this.expectedHighWatermarks.entrySet()) {
      String tableKey = tableWatermark.getKey();
      Map<String, Long> partitionWatermarks = tableWatermark.getValue();
      // tableKey is table complete name in the format db@table; split once and reuse both parts
      String[] dbAndTable = tableKey.split("@");
      if (!HiveUtils.isPartitioned(new org.apache.hadoop.hive.ql.metadata.Table(client.get().getTable(dbAndTable[0], dbAndTable[1])))) {
        continue;
      }
      // We only keep watermarks for partitions that were updated after leastWatermarkToPersistInState
      Map<String, Long> expectedPartitionWatermarks = ImmutableMap.copyOf(Maps.filterEntries(partitionWatermarks, new Predicate<Map.Entry<String, Long>>() {
        @Override
        public boolean apply(@Nonnull Map.Entry<String, Long> input) {
          return Long.compare(input.getValue(), PartitionLevelWatermarker.this.leastWatermarkToPersistInState) >= 0;
        }
      }));
      // Create dummy workunit to track all the partition watermarks for this table
      WorkUnit watermarkWorkunit = WorkUnit.createEmpty();
      watermarkWorkunit.setProp(IS_WATERMARK_WORKUNIT_KEY, true);
      watermarkWorkunit.setProp(ConfigurationKeys.DATASET_URN_KEY, tableKey);
      watermarkWorkunit.setWatermarkInterval(new WatermarkInterval(new MultiKeyValueLongWatermark(this.previousWatermarks.get(tableKey)), new MultiKeyValueLongWatermark(expectedPartitionWatermarks)));
      workunits.add(watermarkWorkunit);
    }
  } catch (IOException | TException e) {
    // Both caught types are checked exceptions, so this is what the deprecated
    // Throwables.propagate(e) did; the explicit throw makes the control flow visible.
    throw new RuntimeException(e);
  }
}
Use of org.apache.gobblin.source.workunit.WorkUnit in the Apache project incubator-gobblin.
Class KafkaDeserializerExtractorTest, method getMockWorkUnitState.
/**
 * Builds the {@link WorkUnitState} fixture used by these tests.
 *
 * <p>The state wraps an empty {@link WorkUnit} whose watermark interval is made of two
 * {@link MultiLongWatermark}s, each backed by its own empty list, and carries the test topic name,
 * partition id {@code "1"}, Kafka brokers {@code "localhost:8080"} and the test schema registry URL.
 *
 * @return the populated work unit state
 */
private WorkUnitState getMockWorkUnitState() {
  // Each watermark gets a distinct list instance, as in the original fixture.
  WatermarkInterval emptyInterval =
      new WatermarkInterval(new MultiLongWatermark(new ArrayList<Long>()), new MultiLongWatermark(new ArrayList<Long>()));

  WorkUnit workUnit = WorkUnit.createEmpty();
  workUnit.setWatermarkInterval(emptyInterval);

  WorkUnitState state = new WorkUnitState(workUnit, new State());
  state.setProp(KafkaSource.TOPIC_NAME, TEST_TOPIC_NAME);
  state.setProp(KafkaSource.PARTITION_ID, "1");
  state.setProp(ConfigurationKeys.KAFKA_BROKERS, "localhost:8080");
  state.setProp(KafkaSchemaRegistry.KAFKA_SCHEMA_REGISTRY_URL, TEST_URL);
  return state;
}
Aggregations