Search in sources :

Example 1 with GobblinKafkaConsumerClientFactory

Use of org.apache.gobblin.kafka.client.GobblinKafkaConsumerClient.GobblinKafkaConsumerClientFactory in the project incubator-gobblin by apache.

From the class KafkaSource, method getWorkunits.

/**
 * Builds the list of work units for the Kafka topics selected by {@code getFilteredTopics(state)}.
 *
 * <p>The method reads extract settings (table type, namespace, full-extract flag, brokers) from
 * {@code state}, instantiates the configured {@link GobblinKafkaConsumerClientFactory}, submits one
 * {@code WorkUnitCreator} per topic to a fixed-size thread pool, adds empty work units for skipped
 * partitions, packs everything with {@code KafkaWorkUnitPacker}, and decorates the packed work
 * units with topic-specific properties and limiter report keys.
 *
 * <p>The consumer client held by {@code kafkaConsumerClient} and every client in
 * {@code kafkaConsumerClientPool} are closed in a {@code finally} block before returning.
 *
 * @param state the source state supplying job configuration
 * @return the packed work units
 * @throws RuntimeException if the consumer client factory class cannot be resolved or
 *     instantiated, or if closing a consumer client fails with an {@link IOException} (the
 *     original exception is attached as the cause in both cases)
 */
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    this.metricContext = Instrumented.getMetricContext(state, KafkaSource.class);
    this.lineageInfo = LineageInfo.getLineageInfo(state.getBroker());
    // Concurrent map: it is handed to every WorkUnitCreator submitted to the thread pool below.
    Map<String, List<WorkUnit>> workUnits = Maps.newConcurrentMap();
    if (state.getPropAsBoolean(KafkaSource.GOBBLIN_KAFKA_EXTRACT_ALLOW_TABLE_TYPE_NAMESPACE_CUSTOMIZATION)) {
        String tableTypeStr = state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, KafkaSource.DEFAULT_TABLE_TYPE.toString());
        tableType = Extract.TableType.valueOf(tableTypeStr);
        extractNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, KafkaSource.DEFAULT_NAMESPACE_NAME);
    } else {
        // For backward compatibility, ignore the table type and namespace configuration keys
        // and fall back to the defaults, as the previous implementation did.
        tableType = KafkaSource.DEFAULT_TABLE_TYPE;
        extractNamespace = KafkaSource.DEFAULT_NAMESPACE_NAME;
    }
    isFullExtract = state.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY);
    kafkaBrokers = state.getProp(ConfigurationKeys.KAFKA_BROKERS, "");
    this.shouldEnableDatasetStateStore = state.getPropAsBoolean(GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE, DEFAULT_GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE);
    try {
        Config config = ConfigUtils.propertiesToConfig(state.getProperties());
        // Resolve the factory class named in the job configuration (or the default) and instantiate it.
        GobblinKafkaConsumerClientFactory kafkaConsumerClientFactory = kafkaConsumerClientResolver.resolveClass(state.getProp(GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS, DEFAULT_GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS)).newInstance();
        this.kafkaConsumerClient.set(kafkaConsumerClientFactory.create(config));
        List<KafkaTopic> topics = getFilteredTopics(state);
        this.topicsToProcess = topics.stream().map(KafkaTopic::getName).collect(toSet());
        for (String topic : this.topicsToProcess) {
            LOG.info("Discovered topic " + topic);
        }
        // Look up dataset-specific properties keyed by topic name.
        Map<String, State> topicSpecificStateMap = DatasetUtils.getDatasetSpecificProps(Iterables.transform(topics, new Function<KafkaTopic, String>() {

            @Override
            public String apply(KafkaTopic topic) {
                return topic.getName();
            }
        }), state);
        int numOfThreads = state.getPropAsInt(ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_THREADS, ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_DEFAULT_THREAD_COUNT);
        ExecutorService threadPool = Executors.newFixedThreadPool(numOfThreads, ExecutorsUtils.newThreadFactory(Optional.of(LOG)));
        if (state.getPropAsBoolean(ConfigurationKeys.KAFKA_SOURCE_SHARE_CONSUMER_CLIENT, ConfigurationKeys.DEFAULT_KAFKA_SOURCE_SHARE_CONSUMER_CLIENT)) {
            // Reuse the client created above as the shared client.
            this.sharedKafkaConsumerClient = this.kafkaConsumerClient.get();
        } else {
            // preallocate one client per thread
            for (int i = 0; i < numOfThreads; i++) {
                kafkaConsumerClientPool.offer(kafkaConsumerClientFactory.create(config));
            }
        }
        Stopwatch createWorkUnitStopwatch = Stopwatch.createStarted();
        for (KafkaTopic topic : topics) {
            threadPool.submit(new WorkUnitCreator(topic, state, Optional.fromNullable(topicSpecificStateMap.get(topic.getName())), workUnits));
        }
        // NOTE(review): presumably shuts the pool down and waits up to one hour for the submitted
        // creators to finish; confirm against ExecutorsUtils.shutdownExecutorService.
        ExecutorsUtils.shutdownExecutorService(threadPool, Optional.of(LOG), 1L, TimeUnit.HOURS);
        LOG.info(String.format("Created workunits for %d topics in %d seconds", workUnits.size(), createWorkUnitStopwatch.elapsed(TimeUnit.SECONDS)));
        // Create empty WorkUnits for skipped partitions (i.e., partitions that have previous offsets,
        // but aren't processed).
        createEmptyWorkUnitsForSkippedPartitions(workUnits, topicSpecificStateMap, state);
        int numOfMultiWorkunits = state.getPropAsInt(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY, ConfigurationKeys.DEFAULT_MR_JOB_MAX_MAPPERS);
        List<WorkUnit> workUnitList = KafkaWorkUnitPacker.getInstance(this, state).pack(workUnits, numOfMultiWorkunits);
        addTopicSpecificPropsToWorkUnits(workUnitList, topicSpecificStateMap);
        setLimiterReportKeyListToWorkUnits(workUnitList, getLimiterExtractorReportKeys());
        return workUnitList;
    } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        throw new RuntimeException(e);
    } finally {
        try {
            if (this.kafkaConsumerClient.get() != null) {
                this.kafkaConsumerClient.get().close();
            }
            // cleanup clients from pool
            for (GobblinKafkaConsumerClient client : kafkaConsumerClientPool) {
                client.close();
            }
        } catch (IOException e) {
            // Chain the IOException so the root cause of the close failure is not lost.
            throw new RuntimeException("Exception closing kafkaConsumerClient", e);
        }
    }
}
Also used : Config(com.typesafe.config.Config) Stopwatch(com.google.common.base.Stopwatch) IOException(java.io.IOException) GobblinKafkaConsumerClient(org.apache.gobblin.kafka.client.GobblinKafkaConsumerClient) Function(com.google.common.base.Function) State(org.apache.gobblin.configuration.State) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) SourceState(org.apache.gobblin.configuration.SourceState) ExecutorService(java.util.concurrent.ExecutorService) List(java.util.List) ArrayList(java.util.ArrayList) Collectors.toList(java.util.stream.Collectors.toList) GobblinKafkaConsumerClientFactory(org.apache.gobblin.kafka.client.GobblinKafkaConsumerClient.GobblinKafkaConsumerClientFactory) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)

Aggregations

Function (com.google.common.base.Function): 1; Stopwatch (com.google.common.base.Stopwatch): 1; Config (com.typesafe.config.Config): 1; IOException (java.io.IOException): 1; ArrayList (java.util.ArrayList): 1; List (java.util.List): 1; ExecutorService (java.util.concurrent.ExecutorService): 1; Collectors.toList (java.util.stream.Collectors.toList): 1; SourceState (org.apache.gobblin.configuration.SourceState): 1; State (org.apache.gobblin.configuration.State): 1; WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 1; GobblinKafkaConsumerClient (org.apache.gobblin.kafka.client.GobblinKafkaConsumerClient): 1; GobblinKafkaConsumerClientFactory (org.apache.gobblin.kafka.client.GobblinKafkaConsumerClient.GobblinKafkaConsumerClientFactory): 1; MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 1; WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 1