
Example 71 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

The class KafkaWorkUnitPacker, method setWorkUnitEstSizes.

protected double setWorkUnitEstSizes(Map<String, List<WorkUnit>> workUnitsByTopic) {
    double totalEstDataSize = 0;
    for (List<WorkUnit> workUnitsForTopic : workUnitsByTopic.values()) {
        for (WorkUnit workUnit : workUnitsForTopic) {
            setWorkUnitEstSize(workUnit);
            totalEstDataSize += getWorkUnitEstSize(workUnit);
        }
    }
    return totalEstDataSize;
}
Also used: MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)
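
In the packer, this total typically drives the sizing step that follows: dividing it by the number of target containers gives the average load each MultiWorkUnit should carry. A minimal sketch of that follow-up arithmetic, assuming a hypothetical helper sitting next to the method above (not the verbatim Gobblin code path):

protected double estimateAvgLoadPerMultiWorkUnit(Map<String, List<WorkUnit>> workUnitsByTopic, int numOfMultiWorkUnits) {
    // Total estimated data size across all topics, as computed above.
    double totalEstDataSize = setWorkUnitEstSizes(workUnitsByTopic);
    // Average load per MultiWorkUnit; guard against a zero container count.
    return numOfMultiWorkUnits > 0 ? totalEstDataSize / numOfMultiWorkUnits : 0.0;
}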

Example 72 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

The class KafkaWorkUnitPacker, method squeezeMultiWorkUnit.

/**
 * Combine all {@link WorkUnit}s in the {@link MultiWorkUnit} into a single {@link WorkUnit}.
 */
protected WorkUnit squeezeMultiWorkUnit(MultiWorkUnit multiWorkUnit) {
    WatermarkInterval interval = getWatermarkIntervalFromMultiWorkUnit(multiWorkUnit);
    List<KafkaPartition> partitions = getPartitionsFromMultiWorkUnit(multiWorkUnit);
    Preconditions.checkArgument(!partitions.isEmpty(), "There must be at least one partition in the multiWorkUnit");
    // Squeeze all partitions from the multiWorkUnit into one of its work units; any one will do
    WorkUnit workUnit = multiWorkUnit.getWorkUnits().get(0);
    // Update interval
    workUnit.removeProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY);
    workUnit.removeProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY);
    workUnit.setWatermarkInterval(interval);
    // Update the offset fetch epoch time and the previous latest offset. These are used to compute the load
    // factor, i.e., the Gobblin consumption rate relative to the Kafka production rate. The Kafka rate is
    // computed as (current latest offset - previous latest offset) / (current epoch time - previous epoch time).
    int index = 0;
    for (WorkUnit wu : multiWorkUnit.getWorkUnits()) {
        workUnit.setProp(KafkaUtils.getPartitionPropName(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME, index), wu.getProp(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME));
        workUnit.setProp(KafkaUtils.getPartitionPropName(KafkaSource.OFFSET_FETCH_EPOCH_TIME, index), wu.getProp(KafkaSource.OFFSET_FETCH_EPOCH_TIME));
        workUnit.setProp(KafkaUtils.getPartitionPropName(KafkaSource.PREVIOUS_LATEST_OFFSET, index), wu.getProp(KafkaSource.PREVIOUS_LATEST_OFFSET));
        index++;
    }
    workUnit.removeProp(KafkaSource.PREVIOUS_OFFSET_FETCH_EPOCH_TIME);
    workUnit.removeProp(KafkaSource.OFFSET_FETCH_EPOCH_TIME);
    workUnit.removeProp(KafkaSource.PREVIOUS_LATEST_OFFSET);
    // Remove the original partition information
    workUnit.removeProp(KafkaSource.PARTITION_ID);
    workUnit.removeProp(KafkaSource.LEADER_ID);
    workUnit.removeProp(KafkaSource.LEADER_HOSTANDPORT);
    // Add combined partitions information
    populateMultiPartitionWorkUnit(partitions, workUnit);
    LOG.info(String.format("Created MultiWorkUnit for partitions %s", partitions));
    return workUnit;
}
Also used: WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval), KafkaPartition (org.apache.gobblin.source.extractor.extract.kafka.KafkaPartition), MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)
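
The load-factor comment in squeezeMultiWorkUnit defines the Kafka production rate as a difference quotient, which is easiest to see with concrete numbers. A minimal sketch with made-up offsets and fetch times (the values are illustrative, not from the source):

// All values below are invented for illustration only.
long previousLatestOffset = 1_000_000L;  // latest offset at the previous fetch
long currentLatestOffset = 1_600_000L;   // latest offset at the current fetch
long previousFetchEpochMs = 1_000L;      // previous offset-fetch epoch time (ms)
long currentFetchEpochMs = 61_000L;      // current offset-fetch epoch time (ms)

// kafkaRate = (current latest offset - previous latest offset)
//           / (current epoch time - previous epoch time)
double kafkaRate = (double) (currentLatestOffset - previousLatestOffset)
        / (currentFetchEpochMs - previousFetchEpochMs);
// 600,000 records over 60,000 ms: Kafka produced about 10 records per millisecond.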

Example 73 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

The class KafkaWorkUnitPacker, method worstFitDecreasingBinPacking.

/**
 * Pack a list of {@link WorkUnit}s into a smaller number of {@link MultiWorkUnit}s,
 * using the worst-fit-decreasing algorithm.
 *
 * Each {@link WorkUnit} is assigned to the {@link MultiWorkUnit} with the smallest load.
 */
protected List<WorkUnit> worstFitDecreasingBinPacking(List<WorkUnit> groups, int numOfMultiWorkUnits) {
    // Sort work-unit groups by estimated data size, descending
    Collections.sort(groups, LOAD_DESC_COMPARATOR);
    MinMaxPriorityQueue<MultiWorkUnit> pQueue = MinMaxPriorityQueue.orderedBy(LOAD_ASC_COMPARATOR).expectedSize(numOfMultiWorkUnits).create();
    for (int i = 0; i < numOfMultiWorkUnits; i++) {
        MultiWorkUnit multiWorkUnit = MultiWorkUnit.createEmpty();
        setWorkUnitEstSize(multiWorkUnit, 0);
        pQueue.add(multiWorkUnit);
    }
    for (WorkUnit group : groups) {
        MultiWorkUnit lightestMultiWorkUnit = pQueue.poll();
        addWorkUnitToMultiWorkUnit(group, lightestMultiWorkUnit);
        pQueue.add(lightestMultiWorkUnit);
    }
    logMultiWorkUnitInfo(pQueue);
    double minLoad = getWorkUnitEstLoad(pQueue.peekFirst());
    double maxLoad = getWorkUnitEstLoad(pQueue.peekLast());
    LOG.info(String.format("Min load of multiWorkUnit = %f; Max load of multiWorkUnit = %f; Diff = %f%%", minLoad, maxLoad, (maxLoad - minLoad) / maxLoad * 100.0));
    this.state.setProp(MIN_MULTIWORKUNIT_LOAD, minLoad);
    this.state.setProp(MAX_MULTIWORKUNIT_LOAD, maxLoad);
    List<WorkUnit> multiWorkUnits = Lists.newArrayList();
    multiWorkUnits.addAll(pQueue);
    return multiWorkUnits;
}
Also used: MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)
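
Stripped of the Gobblin types, the same worst-fit-decreasing idea fits in a short self-contained class. A minimal sketch using plain long loads (an illustration of the algorithm, not the library code):

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

public class WorstFitDecreasing {

    private static final class Bin {
        final List<Long> items = new ArrayList<>();
        long load;
    }

    /** Distributes sizes across numBins bins, always placing into the lightest bin. */
    static List<List<Long>> pack(List<Long> sizes, int numBins) {
        // Largest items first, mirroring the LOAD_DESC_COMPARATOR sort above.
        List<Long> sorted = new ArrayList<>(sizes);
        sorted.sort(Collections.reverseOrder());
        // Min-heap keyed on current bin load, playing the role of the MinMaxPriorityQueue.
        PriorityQueue<Bin> queue = new PriorityQueue<>(Comparator.comparingLong((Bin b) -> b.load));
        for (int i = 0; i < numBins; i++) {
            queue.add(new Bin());
        }
        for (long size : sorted) {
            Bin lightest = queue.poll(); // the bin with the smallest load so far
            lightest.items.add(size);
            lightest.load += size;
            queue.add(lightest);         // re-insert so the ordering reflects the new load
        }
        List<List<Long>> result = new ArrayList<>();
        for (Bin bin : queue) {
            result.add(bin.items);
        }
        return result;
    }
}

For example, pack(Arrays.asList(8L, 7L, 6L, 5L, 4L), 2) ends with bin loads 13 and 17: the two largest items seed the two bins, and each remaining item lands in whichever bin is lighter at that moment.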

Example 74 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

The class KafkaSource, method getWorkunits.

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    this.metricContext = Instrumented.getMetricContext(state, KafkaSource.class);
    this.lineageInfo = LineageInfo.getLineageInfo(state.getBroker());
    Map<String, List<WorkUnit>> workUnits = Maps.newConcurrentMap();
    if (state.getPropAsBoolean(KafkaSource.GOBBLIN_KAFKA_EXTRACT_ALLOW_TABLE_TYPE_NAMESPACE_CUSTOMIZATION)) {
        String tableTypeStr = state.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, KafkaSource.DEFAULT_TABLE_TYPE.toString());
        tableType = Extract.TableType.valueOf(tableTypeStr);
        extractNamespace = state.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, KafkaSource.DEFAULT_NAMESPACE_NAME);
    } else {
        // For backward compatibility, ignore the table type and namespace configuration keys, as the previous implementation did
        tableType = KafkaSource.DEFAULT_TABLE_TYPE;
        extractNamespace = KafkaSource.DEFAULT_NAMESPACE_NAME;
    }
    isFullExtract = state.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY);
    kafkaBrokers = state.getProp(ConfigurationKeys.KAFKA_BROKERS, "");
    this.shouldEnableDatasetStateStore = state.getPropAsBoolean(GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE, DEFAULT_GOBBLIN_KAFKA_SHOULD_ENABLE_DATASET_STATESTORE);
    try {
        Config config = ConfigUtils.propertiesToConfig(state.getProperties());
        GobblinKafkaConsumerClientFactory kafkaConsumerClientFactory = kafkaConsumerClientResolver.resolveClass(state.getProp(GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS, DEFAULT_GOBBLIN_KAFKA_CONSUMER_CLIENT_FACTORY_CLASS)).newInstance();
        this.kafkaConsumerClient.set(kafkaConsumerClientFactory.create(config));
        List<KafkaTopic> topics = getFilteredTopics(state);
        this.topicsToProcess = topics.stream().map(KafkaTopic::getName).collect(toSet());
        for (String topic : this.topicsToProcess) {
            LOG.info("Discovered topic " + topic);
        }
        Map<String, State> topicSpecificStateMap = DatasetUtils.getDatasetSpecificProps(Iterables.transform(topics, new Function<KafkaTopic, String>() {

            @Override
            public String apply(KafkaTopic topic) {
                return topic.getName();
            }
        }), state);
        int numOfThreads = state.getPropAsInt(ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_THREADS, ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_DEFAULT_THREAD_COUNT);
        ExecutorService threadPool = Executors.newFixedThreadPool(numOfThreads, ExecutorsUtils.newThreadFactory(Optional.of(LOG)));
        if (state.getPropAsBoolean(ConfigurationKeys.KAFKA_SOURCE_SHARE_CONSUMER_CLIENT, ConfigurationKeys.DEFAULT_KAFKA_SOURCE_SHARE_CONSUMER_CLIENT)) {
            this.sharedKafkaConsumerClient = this.kafkaConsumerClient.get();
        } else {
            // preallocate one client per thread
            for (int i = 0; i < numOfThreads; i++) {
                kafkaConsumerClientPool.offer(kafkaConsumerClientFactory.create(config));
            }
        }
        Stopwatch createWorkUnitStopwatch = Stopwatch.createStarted();
        for (KafkaTopic topic : topics) {
            threadPool.submit(new WorkUnitCreator(topic, state, Optional.fromNullable(topicSpecificStateMap.get(topic.getName())), workUnits));
        }
        ExecutorsUtils.shutdownExecutorService(threadPool, Optional.of(LOG), 1L, TimeUnit.HOURS);
        LOG.info(String.format("Created workunits for %d topics in %d seconds", workUnits.size(), createWorkUnitStopwatch.elapsed(TimeUnit.SECONDS)));
        // Create empty WorkUnits for skipped partitions (i.e., partitions that have previous offsets,
        // but aren't processed).
        createEmptyWorkUnitsForSkippedPartitions(workUnits, topicSpecificStateMap, state);
        int numOfMultiWorkunits = state.getPropAsInt(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY, ConfigurationKeys.DEFAULT_MR_JOB_MAX_MAPPERS);
        List<WorkUnit> workUnitList = KafkaWorkUnitPacker.getInstance(this, state).pack(workUnits, numOfMultiWorkunits);
        addTopicSpecificPropsToWorkUnits(workUnitList, topicSpecificStateMap);
        setLimiterReportKeyListToWorkUnits(workUnitList, getLimiterExtractorReportKeys());
        return workUnitList;
    } catch (InstantiationException | IllegalAccessException | ClassNotFoundException e) {
        throw new RuntimeException(e);
    } finally {
        try {
            if (this.kafkaConsumerClient.get() != null) {
                this.kafkaConsumerClient.get().close();
            }
            // cleanup clients from pool
            for (GobblinKafkaConsumerClient client : kafkaConsumerClientPool) {
                client.close();
            }
        } catch (IOException e) {
            throw new RuntimeException("Exception closing kafkaConsumerClient", e);
        }
    }
}
Also used: Config (com.typesafe.config.Config), Stopwatch (com.google.common.base.Stopwatch), IOException (java.io.IOException), GobblinKafkaConsumerClient (org.apache.gobblin.kafka.client.GobblinKafkaConsumerClient), Function (com.google.common.base.Function), State (org.apache.gobblin.configuration.State), WorkUnitState (org.apache.gobblin.configuration.WorkUnitState), SourceState (org.apache.gobblin.configuration.SourceState), ExecutorService (java.util.concurrent.ExecutorService), List (java.util.List), ArrayList (java.util.ArrayList), Collectors.toList (java.util.stream.Collectors.toList), GobblinKafkaConsumerClientFactory (org.apache.gobblin.kafka.client.GobblinKafkaConsumerClient.GobblinKafkaConsumerClientFactory), MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit), WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)
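
The knobs this method reads are ordinary job properties. A minimal sketch of driving it from a caller, using only the ConfigurationKeys constants referenced above; the concrete values and the use of the KafkaSimpleSource subclass are illustrative assumptions, not taken from the source:

// Illustrative driver; property values are made up.
SourceState state = new SourceState();
// Thread count for the work-unit creation pool.
state.setProp(ConfigurationKeys.KAFKA_SOURCE_WORK_UNITS_CREATION_THREADS, 8);
// Upper bound on MultiWorkUnits; becomes numOfMultiWorkunits for the packer.
state.setProp(ConfigurationKeys.MR_JOB_MAX_MAPPERS_KEY, 4);
// Broker list read into kafkaBrokers above.
state.setProp(ConfigurationKeys.KAFKA_BROKERS, "localhost:9092");
List<WorkUnit> workUnits = new KafkaSimpleSource().getWorkunits(state);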

Example 75 with WorkUnit

Use of org.apache.gobblin.source.workunit.WorkUnit in project incubator-gobblin by apache.

The class TestAvroSource, method getWorkunits.

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    WorkUnit workUnit = WorkUnit.createEmpty();
    workUnit.addAll(state);
    return Collections.singletonList(workUnit);
}
Also used: WorkUnit (org.apache.gobblin.source.workunit.WorkUnit)
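
Because the returned work unit copies every property from the job state, a caller can verify the propagation directly. A minimal test-style sketch, assuming TestAvroSource is instantiable in a test and using a made-up property key:

SourceState state = new SourceState();
state.setProp("test.key", "test.value");   // hypothetical property for illustration
List<WorkUnit> workUnits = new TestAvroSource().getWorkunits(state);
// The single work unit carries every property from the job state.
assert "test.value".equals(workUnits.get(0).getProp("test.key"));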

Aggregations

Classes most frequently used together with WorkUnit across all 133 examples, with usage counts:

WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 133
Test (org.testng.annotations.Test): 59
SourceState (org.apache.gobblin.configuration.SourceState): 40
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 40
MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 35
Extract (org.apache.gobblin.source.workunit.Extract): 24
Path (org.apache.hadoop.fs.Path): 19
State (org.apache.gobblin.configuration.State): 13
IOException (java.io.IOException): 11
ArrayList (java.util.ArrayList): 10
Closer (com.google.common.io.Closer): 9
Properties (java.util.Properties): 9
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 8
List (java.util.List): 7
Table (org.apache.hadoop.hive.ql.metadata.Table): 7
ImmutableMap (com.google.common.collect.ImmutableMap): 6
Config (com.typesafe.config.Config): 6
File (java.io.File): 6
IterableDatasetFinder (org.apache.gobblin.dataset.IterableDatasetFinder): 6
WorkUnitStream (org.apache.gobblin.source.workunit.WorkUnitStream): 6