Search in sources :

Example 51 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class PartitionerTest method testGetLowWatermarkOnSnapshotExtract.

/**
 * Exercises {@code getLowWatermark} for the {@code SNAPSHOT} extract type in three situations:
 * no previous watermark (the configured start value is expected), a previous watermark
 * (previous + delta is expected), and a previous watermark combined with
 * {@code SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS} (the backup seconds are subtracted as well).
 */
@Test
public void testGetLowWatermarkOnSnapshotExtract() {
    final String configuredStart = "20140101000000";
    final int step = 1;
    final ExtractType snapshot = ExtractType.SNAPSHOT;
    // Watermark types that are expected to behave identically once a previous watermark exists.
    final WatermarkType[] typesUnderTest = { WatermarkType.SIMPLE, WatermarkType.TIMESTAMP };

    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_START_VALUE, configuredStart);
    TestPartitioner partitioner = new TestPartitioner(state);

    // Case 1: nothing was pulled before, so the low watermark comes from the start value.
    Assert.assertEquals(
        partitioner.getLowWatermark(snapshot, null, ConfigurationKeys.DEFAULT_WATERMARK_VALUE, step),
        Long.parseLong(configuredStart),
        "Low watermark should be " + configuredStart);

    // Case 2: a previous watermark exists; the next low watermark is shifted by the delta.
    long lastWatermark = 20140101000050L;
    long expectedLow = lastWatermark + step;
    for (WatermarkType type : typesUnderTest) {
        Assert.assertEquals(
            partitioner.getLowWatermark(snapshot, type, lastWatermark, step),
            expectedLow,
            "Low watermark should be " + expectedLow);
    }

    // Case 3: SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS moves the low watermark back.
    int rewindSecs = 10;
    expectedLow = lastWatermark + step - rewindSecs;
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_LOW_WATERMARK_BACKUP_SECS, rewindSecs);
    for (WatermarkType type : typesUnderTest) {
        Assert.assertEquals(
            partitioner.getLowWatermark(snapshot, type, lastWatermark, step),
            expectedLow,
            "Low watermark should be " + expectedLow);
    }
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) ExtractType(org.apache.gobblin.source.extractor.extract.ExtractType) Test(org.testng.annotations.Test)

Example 52 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class PartitionerTest method testGetUserSpecifiedPartitionList.

/**
 * Checks {@code getPartitionList} when user-specified partitions are enabled, walking through
 * an empty partition list, a single partition point, and two partition points under the
 * date, hour and timestamp watermark types.
 */
@Test
public void testGetUserSpecifiedPartitionList() {
    SourceState state = new SourceState();
    state.setProp(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, true);
    TestPartitioner partitioner = new TestPartitioner(state);

    // An empty partition specification is expected to yield one default-valued partition.
    long noWatermark = ConfigurationKeys.DEFAULT_WATERMARK_VALUE;
    List<Partition> expected = new ArrayList<>();
    expected.add(new Partition(noWatermark, noWatermark, true, true));
    state.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, "");
    Assert.assertEquals(partitioner.getPartitionList(-1), expected);

    // Date watermark with a single partition point.
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "date");
    state.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, "20140101030201");
    expected.clear();
    expected.add(new Partition(20140101000000L, 20170101000000L, true, false));
    Assert.assertEquals(partitioner.getPartitionList(-1), expected);

    // APPEND_DAILY extract with two partition points: the upper bound is kept.
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "APPEND_DAILY");
    state.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, "20140101030201, 20140102040201");
    expected.clear();
    expected.add(new Partition(20140101000000L, 20140102000000L, true, true));
    Assert.assertEquals(partitioner.getPartitionList(-1), expected);

    // Hour watermark with a SNAPSHOT extract: bounds are expected at hour granularity.
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "hour");
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "SNAPSHOT");
    expected.clear();
    expected.add(new Partition(20140101030000L, 20140102040000L, true, false));
    Assert.assertEquals(partitioner.getPartitionList(-1), expected);

    // Timestamp watermark (still SNAPSHOT): the partition points are expected unchanged.
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_WATERMARK_TYPE, "timestamp");
    expected.clear();
    expected.add(new Partition(20140101030201L, 20140102040201L, true, false));
    Assert.assertEquals(partitioner.getPartitionList(-1), expected);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) ArrayList(java.util.ArrayList) Test(org.testng.annotations.Test)

Example 53 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class PartitionerTest method testGetHighWatermarkOnUserOverride.

/**
 * Exercises {@code getHighWatermark} with the watermark-override flag switched on, first with
 * {@code SOURCE_QUERYBASED_END_VALUE} configured and then with that property removed.
 */
@Test
public void testGetHighWatermarkOnUserOverride() {
    final String configuredEnd = "20140101000000";

    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_IS_WATERMARK_OVERRIDE, true);
    state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_END_VALUE, configuredEnd);
    TestPartitioner partitioner = new TestPartitioner(state);

    // With an explicit end value, that value is the high watermark and it is flagged as user specified.
    Assert.assertEquals(
        partitioner.getHighWatermark(null, null),
        Long.parseLong(configuredEnd),
        "High watermark should be " + configuredEnd);
    Assert.assertEquals(
        partitioner.getUserSpecifiedHighWatermark(),
        true,
        "Should mark as user specified high watermark");

    partitioner.reset();

    // Without SOURCE_QUERYBASED_END_VALUE the partitioner's current time is expected instead.
    state.removeProp(ConfigurationKeys.SOURCE_QUERYBASED_END_VALUE);
    long nowAsWatermark = Long.parseLong(TestPartitioner.currentTimeString);
    Assert.assertEquals(
        partitioner.getHighWatermark(null, null),
        nowAsWatermark,
        "High watermark should be " + nowAsWatermark);
    Assert.assertEquals(
        partitioner.getUserSpecifiedHighWatermark(),
        false,
        "Should not mark as user specified high watermark");
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) Test(org.testng.annotations.Test)

Example 54 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class KafkaSource method getWorkunits.

/**
 * Builds the list of work units for the Kafka topics selected by {@code getFilteredTopics}.
 *
 * <p>The method reads extract settings (table type, namespace, full-extract flag, brokers) from
 * {@code state}, instantiates the configured consumer client factory, creates per-topic work units
 * on a fixed-size thread pool, adds empty work units for skipped partitions, and finally packs
 * everything through {@link KafkaWorkUnitPacker}. Consumer clients are closed in the
 * {@code finally} block regardless of the outcome.
 *
 * @param state the source state supplying configuration for this run
 * @return the packed work units, with topic-specific properties and limiter report keys applied
 * @throws RuntimeException if the consumer client factory cannot be resolved or instantiated, or
 *         if closing a consumer client fails (the underlying exception is attached as the cause)
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    this.metricContext = Instrumented.getMetricContext(state, KafkaSource.class);
    this.lineageInfo = LineageInfo.getLineageInfo(state.getBroker());
    // Topic name -> work units; concurrent because WorkUnitCreator tasks receive it from several threads.
    Map<String, List<WorkUnit>> workUnits = Maps.newConcurrentMap();
    if (state.getPropAsBoolean(KafkaSource.GOBBLIN_KAFKA_EXTRACT_ALLOW_TABLE_TYPE_NAMESPACE_CUSTOMIZATION)) {
        String tableTypeStr = state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, KafkaSource.DEFAULT_TABLE_TYPE.toString());
        tableType = Extract.TableType.valueOf(tableTypeStr);
        extractNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, KafkaSource.DEFAULT_NAMESPACE_NAME);
    } else {
        // To be compatible, reject table type and namespace configuration keys as previous implementation
        tableType = KafkaSource.DEFAULT_TABLE_TYPE;
        extractNamespace = KafkaSource.DEFAULT_NAMESPACE_NAME;
    }
    isFullExtract = state.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY);
    kafkaBrokers = state.getProp(ConfigurationKeys.KAFKA_BROKERS, "");
    this.shouldEnableDatasetStateStore = state.getPropAsBoolean(GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE, DEFAULT_GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE);
    try {
        Config config = ConfigUtils.propertiesToConfig(state.getProperties());
        GobblinKafkaConsumerClientFactory kafkaConsumerClientFactory = kafkaConsumerClientResolver.resolveClass(state.getProp(GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS, DEFAULT_GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS)).newInstance();
        this.kafkaConsumerClient.set(kafkaConsumerClientFactory.create(config));
        List<KafkaTopic> topics = getFilteredTopics(state);
        this.topicsToProcess = topics.stream().map(KafkaTopic::getName).collect(toSet());
        for (String topic : this.topicsToProcess) {
            LOG.info("Discovered topic " + topic);
        }
        // Dataset-specific property overrides, looked up by topic name.
        Map<String, State> topicSpecificStateMap = DatasetUtils.getDatasetSpecificProps(Iterables.transform(topics, new Function<KafkaTopic, String>() {

            @Override
            public String apply(KafkaTopic topic) {
                return topic.getName();
            }
        }), state);
        int numOfThreads = state.getPropAsInt(ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_THREADS, ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_DEFAULT_THREAD_COUNT);
        ExecutorService threadPool = Executors.newFixedThreadPool(numOfThreads, ExecutorsUtils.newThreadFactory(Optional.of(LOG)));
        if (state.getPropAsBoolean(ConfigurationKeys.KAFKA_SOURCE_SHARE_CONSUMER_CLIENT, ConfigurationKeys.DEFAULT_KAFKA_SOURCE_SHARE_CONSUMER_CLIENT)) {
            this.sharedKafkaConsumerClient = this.kafkaConsumerClient.get();
        } else {
            // preallocate one client per thread
            for (int i = 0; i < numOfThreads; i++) {
                kafkaConsumerClientPool.offer(kafkaConsumerClientFactory.create(config));
            }
        }
        Stopwatch createWorkUnitStopwatch = Stopwatch.createStarted();
        for (KafkaTopic topic : topics) {
            threadPool.submit(new WorkUnitCreator(topic, state, Optional.fromNullable(topicSpecificStateMap.get(topic.getName())), workUnits));
        }
        // Shut the pool down with a 1-hour timeout before reading the collected work units.
        ExecutorsUtils.shutdownExecutorService(threadPool, Optional.of(LOG), 1L, TimeUnit.HOURS);
        LOG.info(String.format("Created workunits for %d topics in %d seconds", workUnits.size(), createWorkUnitStopwatch.elapsed(TimeUnit.SECONDS)));
        // Create empty WorkUnits for skipped partitions (i.e., partitions that have previous offsets,
        // but aren't processed).
        createEmptyWorkUnitsForSkippedPartitions(workUnits, topicSpecificStateMap, state);
        int numOfMultiWorkunits = state.getPropAsInt(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY, ConfigurationKeys.DEFAULT_MR_JOB_MAX_MAPPERS);
        List<WorkUnit> workUnitList = KafkaWorkUnitPacker.getInstance(this, state).pack(workUnits, numOfMultiWorkunits);
        addTopicSpecificPropsToWorkUnits(workUnitList, topicSpecificStateMap);
        setLimiterReportKeyListToWorkUnits(workUnitList, getLimiterExtractorReportKeys());
        return workUnitList;
    } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        throw new RuntimeException(e);
    } finally {
        try {
            if (this.kafkaConsumerClient.get() != null) {
                this.kafkaConsumerClient.get().close();
            }
            // cleanup clients from pool
            for (GobblinKafkaConsumerClient client : kafkaConsumerClientPool) {
                client.close();
            }
        } catch (IOException e) {
            // Chain the IOException so the reason the close failed is not lost.
            throw new RuntimeException("Exception closing kafkaConsumerClient", e);
        }
    }
}
Also used : Config(com.typesafe.config.Config) Stopwatch(com.google.common.base.Stopwatch) IOException(java.io.IOException) GobblinKafkaConsumerClient(org.apache.gobblin.kafka.client.GobblinKafkaConsumerClient) Function(com.google.common.base.Function) State(org.apache.gobblin.configuration.State) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) SourceState(org.apache.gobblin.configuration.SourceState) ExecutorService(java.util.concurrent.ExecutorService) List(java.util.List) ArrayList(java.util.ArrayList) Collectors.toList(java.util.stream.Collectors.toList) GobblinKafkaConsumerClientFactory(org.apache.gobblin.kafka.client.GobblinKafkaConsumerClient.GobblinKafkaConsumerClientFactory) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Example 55 with SourceState

use of org.apache.gobblin.configuration.SourceState in project incubator-gobblin by apache.

the class AbstractSourceTest method testGetPreviousWorkUnitStatesOnPartialRetryPartialCommit.

/**
 * Verifies {@code getPreviousWorkUnitStatesForRetry} when the work unit retry policy is
 * "onpartial" and the job commit policy is "partial": the returned states must equal
 * {@code expectedPreviousWorkUnitStates}.
 */
@Test
public void testGetPreviousWorkUnitStatesOnPartialRetryPartialCommit() {
    // Source state seeded with an empty job state plus the fixture's previous work unit states.
    State emptyJobState = new State();
    SourceState state = new SourceState(emptyJobState, this.previousWorkUnitStates);
    state.setProp(ConfigurationKeys.WORK_UNIT_RETRY_POLICY_KEY, "onpartial");
    state.setProp(ConfigurationKeys.JOB_COMMIT_POLICY_KEY, "partial");

    List<WorkUnitState> statesForRetry = this.testSource.getPreviousWorkUnitStatesForRetry(state);

    Assert.assertEquals(statesForRetry, this.expectedPreviousWorkUnitStates);
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) State(org.apache.gobblin.configuration.State) WorkingState(org.apache.gobblin.configuration.WorkUnitState.WorkingState) SourceState(org.apache.gobblin.configuration.SourceState) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) Test(org.testng.annotations.Test)

Aggregations

SourceState (org.apache.gobblin.configuration.SourceState)90 Test (org.testng.annotations.Test)76 WorkUnitState (org.apache.gobblin.configuration.WorkUnitState)44 WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)38 State (org.apache.gobblin.configuration.State)30 WorkingState (org.apache.gobblin.configuration.WorkUnitState.WorkingState)11 Partition (org.apache.hadoop.hive.ql.metadata.Partition)8 Table (org.apache.hadoop.hive.ql.metadata.Table)8 IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder)7 LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark)7 Extract (org.apache.gobblin.source.workunit.Extract)7 DateTime (org.joda.time.DateTime)7 Dataset (org.apache.gobblin.dataset.Dataset)6 PartitionableDataset (org.apache.gobblin.dataset.PartitionableDataset)6 MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit)6 WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream)6 IOException (java.io.IOException)5 Path (org.apache.hadoop.fs.Path)5 Gson (com.google.gson.Gson)4 JsonObject (com.google.gson.JsonObject)4