use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class PartitionerTest method testGetLowWatermarkOnSnapshotExtract.
/**
 * Test getLowWatermark with extract type SNAPSHOT. Three cases are checked:
 * no previous watermark (the configured start value is expected), a previous watermark
 * (previous + delta is expected), and a previous watermark combined with
 * SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS (previous + delta - backup is expected).
 */
@Test
public void testGetLowWatermarkOnSnapshotExtract() {
  final String configuredStart = "20140101000000";
  final ExtractType snapshot = ExtractType.SNAPSHOT;
  final int step = 1;
  // Both watermark types are expected to yield the same low watermark in the cases below.
  final WatermarkType[] typesUnderTest = {WatermarkType.SIMPLE, WatermarkType.TIMESTAMP};

  SourceState state = new SourceState();
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_START_VALUE, configuredStart);
  TestPartitioner partitioner = new TestPartitioner(state);

  // Case 1: no previous watermark
  Assert.assertEquals(
      partitioner.getLowWatermark(snapshot, null, ConfigurationKeys.DEFAULT_WATERMARK_VALUE, step),
      Long.parseLong(configuredStart),
      "Low watermark should be " + configuredStart);

  // Case 2: a previous watermark exists
  long priorWatermark = 20140101000050L;
  long expectedLow = priorWatermark + step;
  for (WatermarkType type : typesUnderTest) {
    Assert.assertEquals(
        partitioner.getLowWatermark(snapshot, type, priorWatermark, step),
        expectedLow,
        "Low watermark should be " + expectedLow);
  }

  // Case 3: SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS is set on the state after the
  // partitioner has already been constructed from it
  int backup = 10;
  expectedLow = priorWatermark + step - backup;
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS, backup);
  for (WatermarkType type : typesUnderTest) {
    Assert.assertEquals(
        partitioner.getLowWatermark(snapshot, type, priorWatermark, step),
        expectedLow,
        "Low watermark should be " + expectedLow);
  }
}
use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class PartitionerTest method testGetUserSpecifiedPartitionList.
/**
 * Test getPartitionList when HAS_USER_SPECIFIED_PARTITIONS is enabled, walking through an
 * empty partition list, a single partition point, and two partition points under different
 * watermark type / extract type settings.
 */
@Test
public void testGetUserSpecifiedPartitionList() {
  SourceState state = new SourceState();
  state.setProp(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, true);
  TestPartitioner partitioner = new TestPartitioner(state);

  // Scenario 1: the user-specified partition list is empty
  long fallback = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
  List<Partition> whenListMissing = new ArrayList<>();
  whenListMissing.add(new Partition(fallback, fallback, true, true));
  state.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, "");
  Assert.assertEquals(partitioner.getPartitionList(-1), whenListMissing);

  // Scenario 2: "date" watermark type with a single partition point
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "date");
  state.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, "20140101030201");
  List<Partition> whenSinglePoint = new ArrayList<>();
  whenSinglePoint.add(new Partition(20140101000000L, 20170101000000L, true, false));
  Assert.assertEquals(partitioner.getPartitionList(-1), whenSinglePoint);

  // Scenario 3: APPEND_DAILY extract with two points; the upper bound is kept
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "APPEND_DAILY");
  state.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, "20140101030201, 20140102040201");
  List<Partition> whenAppendDaily = new ArrayList<>();
  whenAppendDaily.add(new Partition(20140101000000L, 20140102000000L, true, true));
  Assert.assertEquals(partitioner.getPartitionList(-1), whenAppendDaily);

  // Scenario 4: "hour" watermark type, SNAPSHOT extract (same two partition points)
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "hour");
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "SNAPSHOT");
  List<Partition> whenHourSnapshot = new ArrayList<>();
  whenHourSnapshot.add(new Partition(20140101030000L, 20140102040000L, true, false));
  Assert.assertEquals(partitioner.getPartitionList(-1), whenHourSnapshot);

  // Scenario 5: "timestamp" watermark type; extract type is still SNAPSHOT from scenario 4
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "timestamp");
  List<Partition> whenTimestampSnapshot = new ArrayList<>();
  whenTimestampSnapshot.add(new Partition(20140101030201L, 20140102040201L, true, false));
  Assert.assertEquals(partitioner.getPartitionList(-1), whenTimestampSnapshot);
}
use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class PartitionerTest method testGetHighWatermarkOnUserOverride.
/**
 * Test getHighWatermark with SOURCE_QUERYBASED_IS_WATERMARK_OVERRIDE set to true, first with
 * SOURCE_QUERYBASED_END_VALUE configured and then with that property removed.
 */
@Test
public void testGetHighWatermarkOnUserOverride() {
  final String configuredEnd = "20140101000000";

  SourceState state = new SourceState();
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_WATERMARK_OVERRIDE, true);
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_END_VALUE, configuredEnd);
  TestPartitioner partitioner = new TestPartitioner(state);

  // With an end value configured, that value is the expected high watermark and the
  // partitioner is expected to report it as user specified.
  Assert.assertEquals(
      partitioner.getHighWatermark(null, null),
      Long.parseLong(configuredEnd),
      "High watermark should be " + configuredEnd);
  Assert.assertEquals(
      partitioner.getUserSpecifiedHighWatermark(),
      true,
      "Should mark as user specified high watermark");

  partitioner.reset();

  // Without SOURCE_QUERYBASED_END_VALUE the expected high watermark is the partitioner's
  // current time, which the test reads from TestPartitioner.currentTimeString.
  state.removeProp(ConfigurationKeys.SOURCE_QUERYBASED_END_VALUE);
  long expectedHigh = Long.parseLong(TestPartitioner.currentTimeString);
  Assert.assertEquals(
      partitioner.getHighWatermark(null, null),
      expectedHigh,
      "High watermark should be " + expectedHigh);
  Assert.assertEquals(
      partitioner.getUserSpecifiedHighWatermark(),
      false,
      "Should not mark as user specified high watermark");
}
use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class KafkaSource method getWorkunits.
/**
 * Creates the work units for the Kafka topics returned by {@code getFilteredTopics(state)}.
 *
 * <p>The method reads extract settings (table type, namespace, full-extract flag, brokers) from
 * {@code state}, instantiates the configured consumer client factory, submits one
 * {@code WorkUnitCreator} per topic to a fixed-size thread pool, adds empty work units for
 * skipped partitions, and finally packs the result with {@code KafkaWorkUnitPacker}.
 *
 * @param state the source state supplying the job configuration
 * @return the packed list of work units, with topic-specific properties and limiter report
 *     keys applied
 * @throws RuntimeException if the consumer client factory class cannot be resolved or
 *     instantiated, or if closing a consumer client fails with an {@link IOException}
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  this.metricContext = Instrumented.getMetricContext(state, KafkaSource.class);
  this.lineageInfo = LineageInfo.getLineageInfo(state.getBroker());
  Map<String, List<WorkUnit>> workUnits = Maps.newConcurrentMap();
  if (state.getPropAsBoolean(KafkaSource.GOBBLIN_KAFKA_EXTRACT_ALLOW_TABLE_TYPE_NAMESPACE_CUSTOMIZATION)) {
    String tableTypeStr =
        state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, KafkaSource.DEFAULT_TABLE_TYPE.toString());
    tableType = Extract.TableType.valueOf(tableTypeStr);
    extractNamespace =
        state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, KafkaSource.DEFAULT_NAMESPACE_NAME);
  } else {
    // To be compatible, reject table type and namespace configuration keys as previous implementation
    tableType = KafkaSource.DEFAULT_TABLE_TYPE;
    extractNamespace = KafkaSource.DEFAULT_NAMESPACE_NAME;
  }
  isFullExtract = state.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY);
  kafkaBrokers = state.getProp(ConfigurationKeys.KAFKA_BROKERS, "");
  this.shouldEnableDatasetStateStore = state.getPropAsBoolean(GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE,
      DEFAULT_GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE);
  try {
    Config config = ConfigUtils.propertiesToConfig(state.getProperties());
    GobblinKafkaConsumerClientFactory kafkaConsumerClientFactory = kafkaConsumerClientResolver
        .resolveClass(state.getProp(GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS,
            DEFAULT_GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS))
        .newInstance();
    this.kafkaConsumerClient.set(kafkaConsumerClientFactory.create(config));
    List<KafkaTopic> topics = getFilteredTopics(state);
    this.topicsToProcess = topics.stream().map(KafkaTopic::getName).collect(toSet());
    for (String topic : this.topicsToProcess) {
      LOG.info("Discovered topic " + topic);
    }
    // Dataset-specific properties are looked up by topic name.
    Map<String, State> topicSpecificStateMap =
        DatasetUtils.getDatasetSpecificProps(Iterables.transform(topics, new Function<KafkaTopic, String>() {
          @Override
          public String apply(KafkaTopic topic) {
            return topic.getName();
          }
        }), state);
    int numOfThreads = state.getPropAsInt(ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_THREADS,
        ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_DEFAULT_THREAD_COUNT);
    ExecutorService threadPool =
        Executors.newFixedThreadPool(numOfThreads, ExecutorsUtils.newThreadFactory(Optional.of(LOG)));
    if (state.getPropAsBoolean(ConfigurationKeys.KAFKA_SOURCE_SHARE_CONSUMER_CLIENT,
        ConfigurationKeys.DEFAULT_KAFKA_SOURCE_SHARE_CONSUMER_CLIENT)) {
      this.sharedKafkaConsumerClient = this.kafkaConsumerClient.get();
    } else {
      // preallocate one client per thread
      for (int i = 0; i < numOfThreads; i++) {
        kafkaConsumerClientPool.offer(kafkaConsumerClientFactory.create(config));
      }
    }
    Stopwatch createWorkUnitStopwatch = Stopwatch.createStarted();
    for (KafkaTopic topic : topics) {
      threadPool.submit(new WorkUnitCreator(topic, state,
          Optional.fromNullable(topicSpecificStateMap.get(topic.getName())), workUnits));
    }
    ExecutorsUtils.shutdownExecutorService(threadPool, Optional.of(LOG), 1L, TimeUnit.HOURS);
    LOG.info(String.format("Created workunits for %d topics in %d seconds", workUnits.size(),
        createWorkUnitStopwatch.elapsed(TimeUnit.SECONDS)));
    // Create empty WorkUnits for skipped partitions (i.e., partitions that have previous offsets,
    // but aren't processed).
    createEmptyWorkUnitsForSkippedPartitions(workUnits, topicSpecificStateMap, state);
    int numOfMultiWorkunits =
        state.getPropAsInt(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY, ConfigurationKeys.DEFAULT_MR_JOB_MAX_MAPPERS);
    List<WorkUnit> workUnitList = KafkaWorkUnitPacker.getInstance(this, state).pack(workUnits, numOfMultiWorkunits);
    addTopicSpecificPropsToWorkUnits(workUnitList, topicSpecificStateMap);
    setLimiterReportKeyListToWorkUnits(workUnitList, getLimiterExtractorReportKeys());
    return workUnitList;
  } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
    throw new RuntimeException(e);
  } finally {
    try {
      if (this.kafkaConsumerClient.get() != null) {
        this.kafkaConsumerClient.get().close();
      }
      // cleanup clients from pool
      for (GobblinKafkaConsumerClient client : kafkaConsumerClientPool) {
        client.close();
      }
    } catch (IOException e) {
      // Chain the IOException so the root cause and its stack trace are not lost.
      throw new RuntimeException("Exception closing kafkaConsumerClient", e);
    }
  }
}
use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.
the class AbstractSourceTest method testGetPreviousWorkUnitStatesOnPartialRetryPartialCommit.
/**
 * Test getPreviousWorkUnitStatesForRetry when the work unit retry policy is "onpartial" and
 * the job commit policy is "partial"; the result is compared against
 * {@code expectedPreviousWorkUnitStates}.
 */
@Test
public void testGetPreviousWorkUnitStatesOnPartialRetryPartialCommit() {
  // Source state seeded with the fixture's previous work unit states.
  SourceState state = new SourceState(new State(), this.previousWorkUnitStates);
  state.setProp(ConfigurationKeys.WORK_UNIT_RETRY_POLICY_KEY, "onpartial");
  state.setProp(ConfigurationKeys.JOB_COMMIT_POLICY_KEY, "partial");

  Assert.assertEquals(
      this.testSource.getPreviousWorkUnitStatesForRetry(state),
      this.expectedPreviousWorkUnitStates);
}
Aggregations