
Example 6 with Extract

use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.

the class KafkaSimpleStreamingSource method getWorkunits.

@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    Config config = ConfigUtils.propertiesToConfig(state.getProperties());
    Consumer<String, byte[]> consumer = getKafkaConsumer(config);
    LOG.debug("Consumer is {}", consumer);
    // TODO: fix this to use the new API when KafkaWrapper is fixed
    String topic = ConfigUtils.getString(config, TOPIC_WHITELIST, StringUtils.EMPTY);
    List<WorkUnit> workUnits = new ArrayList<WorkUnit>();
    List<PartitionInfo> topicPartitions = consumer.partitionsFor(topic);
    LOG.info("Partition count is {}", topicPartitions.size());
    for (PartitionInfo topicPartition : topicPartitions) {
        Extract extract = this.createExtract(DEFAULT_TABLE_TYPE, DEFAULT_NAMESPACE_NAME, topicPartition.topic());
        LOG.info("Partition info is {}", topicPartition);
        WorkUnit workUnit = WorkUnit.create(extract);
        setTopicNameInState(workUnit, topicPartition.topic());
        workUnit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, topicPartition.topic());
        setPartitionId(workUnit, topicPartition.partition());
        workUnits.add(workUnit);
    }
    return workUnits;
}
Also used : Config(com.typesafe.config.Config) ArrayList(java.util.ArrayList) Extract(org.apache.gobblin.source.workunit.Extract) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) PartitionInfo(org.apache.kafka.common.PartitionInfo)
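
A minimal sketch of how the returned work units might be consumed. Here `source` and `state` are assumed to exist and are not part of the example above; `getExtract`, `getProp` and `getExtractId` are the same APIs shown elsewhere on this page.

// Usage sketch, not part of the project source: `source` is assumed to be an
// instance of the Kafka streaming source above, `state` a configured SourceState.
List<WorkUnit> workUnits = source.getWorkunits(state);
for (WorkUnit workUnit : workUnits) {
    // Each work unit carries the Extract it was created from, plus the topic name
    // stored under EXTRACT_TABLE_NAME_KEY in the loop above.
    Extract extract = workUnit.getExtract();
    LOG.info("Work unit for table {} with extract id {}",
        workUnit.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY), extract.getExtractId());
}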

Example 7 with Extract

use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.

the class PartitionedFileSourceBase method getExtractForFile.

private Extract getExtractForFile(PartitionAwareFileRetriever.FileInfo file, String topicName, String namespace, Map<Long, Extract> extractMap) {
    Extract extract = extractMap.get(file.getWatermarkMsSinceEpoch());
    if (extract == null) {
        // Create an extract object for the dayPath
        extract = new Extract(this.tableType, namespace, topicName);
        LOG.info("Created extract: " + extract.getExtractId() + " for path " + topicName);
        extractMap.put(file.getWatermarkMsSinceEpoch(), extract);
    }
    return extract;
}
Also used : Extract(org.apache.gobblin.source.workunit.Extract)
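
A minimal sketch of the memoization at work, assuming two hypothetical `FileInfo` objects `fileA` and `fileB` and a call site inside `PartitionedFileSourceBase`: files that share a watermark reuse one Extract, so their work units end up grouped under a single extract id.

// Hypothetical illustration; fileA and fileB are assumed FileInfo instances.
Map<Long, Extract> extractMap = new HashMap<>();
Extract first = getExtractForFile(fileA, "myTopic", "myNamespace", extractMap);
Extract second = getExtractForFile(fileB, "myTopic", "myNamespace", extractMap);
// If both files carry the same watermark, the second call returns the cached
// Extract created by the first call instead of creating a new one.
if (fileA.getWatermarkMsSinceEpoch() == fileB.getWatermarkMsSinceEpoch()) {
    assert first == second;
}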

Example 8 with Extract

use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.

the class CopySourceTest method testCopySource.

@Test
public void testCopySource() throws Exception {
    SourceState state = new SourceState();
    state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, "file:///");
    state.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, "file:///");
    state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/target/dir");
    state.setProp(DatasetUtils.DATASET_PROFILE_CLASS_KEY, TestCopyableDatasetFinder.class.getName());
    CopySource source = new CopySource();
    List<WorkUnit> workunits = source.getWorkunits(state);
    workunits = JobLauncherUtils.flattenWorkUnits(workunits);
    Assert.assertEquals(workunits.size(), TestCopyableDataset.FILE_COUNT);
    Extract extract = workunits.get(0).getExtract();
    for (WorkUnit workUnit : workunits) {
        CopyableFile file = (CopyableFile) CopySource.deserializeCopyEntity(workUnit);
        Assert.assertTrue(file.getOrigin().getPath().toString().startsWith(TestCopyableDataset.ORIGIN_PREFIX));
        Assert.assertEquals(file.getDestinationOwnerAndPermission(), TestCopyableDataset.OWNER_AND_PERMISSION);
        Assert.assertEquals(workUnit.getExtract(), extract);
    }
}
Also used : SourceState(org.apache.gobblin.configuration.SourceState) Extract(org.apache.gobblin.source.workunit.Extract) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit) Test(org.testng.annotations.Test)
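
A short note on the flattening step, with a hedged sketch: the assumption here (not asserted by the test itself) is that CopySource may bundle file-level work units into MultiWorkUnits, and JobLauncherUtils.flattenWorkUnits expands them so the test can inspect one WorkUnit per CopyableFile.

// Sketch only: `source` and `state` as in the test above.
List<WorkUnit> raw = source.getWorkunits(state);
List<WorkUnit> flat = JobLauncherUtils.flattenWorkUnits(raw);
for (WorkUnit wu : flat) {
    // After flattening, no nested MultiWorkUnit containers remain.
    Assert.assertFalse(wu instanceof MultiWorkUnit);
}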

Example 9 with Extract

use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.

the class KafkaSource method getWorkUnitForTopicPartition.

private WorkUnit getWorkUnitForTopicPartition(KafkaPartition partition, Offsets offsets, Optional<State> topicSpecificState) {
    // Default to job level configurations
    Extract.TableType currentTableType = tableType;
    String currentExtractNamespace = extractNamespace;
    String currentExtractTableName = partition.getTopicName();
    boolean isCurrentFullExtract = isFullExtract;
    // Update to topic specific configurations if any
    if (topicSpecificState.isPresent()) {
        State topicState = topicSpecificState.get();
        if (topicState.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)) {
            currentTableType = Extract.TableType.valueOf(topicState.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY));
        }
        currentExtractNamespace = topicState.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, extractNamespace);
        currentExtractTableName = topicState.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, partition.getTopicName());
        isCurrentFullExtract = topicState.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY, isFullExtract);
    }
    Extract extract = this.createExtract(currentTableType, currentExtractNamespace, currentExtractTableName);
    if (isCurrentFullExtract) {
        extract.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, true);
    }
    WorkUnit workUnit = WorkUnit.create(extract);
    workUnit.setProp(TOPIC_NAME, partition.getTopicName());
    addDatasetUrnOptionally(workUnit);
    workUnit.setProp(PARTITION_ID, partition.getId());
    workUnit.setProp(LEADER_ID, partition.getLeader().getId());
    workUnit.setProp(LEADER_HOSTANDPORT, partition.getLeader().getHostAndPort().toString());
    workUnit.setProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY, offsets.getStartOffset());
    workUnit.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, offsets.getLatestOffset());
    workUnit.setProp(PREVIOUS_OFFSET_FETCH_EPOCH_TIME, offsets.getPreviousOffsetFetchEpochTime());
    workUnit.setProp(OFFSET_FETCH_EPOCH_TIME, offsets.getOffsetFetchEpochTime());
    workUnit.setProp(PREVIOUS_LATEST_OFFSET, offsets.getPreviousLatestOffset());
    // Add lineage info
    DatasetDescriptor source = new DatasetDescriptor(DatasetConstants.PLATFORM_KAFKA, partition.getTopicName());
    source.addMetadata(DatasetConstants.BROKERS, kafkaBrokers);
    if (this.lineageInfo.isPresent()) {
        this.lineageInfo.get().setSource(source, workUnit);
    }
    LOG.info(String.format("Created workunit for partition %s: lowWatermark=%d, highWatermark=%d, range=%d", partition, offsets.getStartOffset(), offsets.getLatestOffset(), offsets.getLatestOffset() - offsets.getStartOffset()));
    return workUnit;
}
Also used : DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) State(org.apache.gobblin.configuration.State) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) SourceState(org.apache.gobblin.configuration.SourceState) Extract(org.apache.gobblin.source.workunit.Extract) MultiWorkUnit(org.apache.gobblin.source.workunit.MultiWorkUnit) WorkUnit(org.apache.gobblin.source.workunit.WorkUnit)
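
A minimal sketch of the topic-specific override path, assuming `partition` and `offsets` are already built and that SNAPSHOT_ONLY is a valid Extract.TableType value; the property keys are the same ones read in the method above.

// Hypothetical topic-specific state overriding the job-level extract settings.
State topicState = new State();
topicState.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, "SNAPSHOT_ONLY");
topicState.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "myNamespace");
topicState.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, Boolean.toString(true));
// The resulting work unit uses the overridden type/namespace and is marked as a full extract.
WorkUnit workUnit = getWorkUnitForTopicPartition(partition, offsets, Optional.of(topicState));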

Example 10 with Extract

use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.

the class TaskState method toTaskExecutionInfo.

/**
 * Convert this {@link TaskState} instance to a {@link TaskExecutionInfo} instance.
 *
 * @return a {@link TaskExecutionInfo} instance
 */
public TaskExecutionInfo toTaskExecutionInfo() {
    TaskExecutionInfo taskExecutionInfo = new TaskExecutionInfo();
    taskExecutionInfo.setJobId(this.jobId);
    taskExecutionInfo.setTaskId(this.taskId);
    if (this.startTime > 0) {
        taskExecutionInfo.setStartTime(this.startTime);
    }
    if (this.endTime > 0) {
        taskExecutionInfo.setEndTime(this.endTime);
    }
    taskExecutionInfo.setDuration(this.duration);
    taskExecutionInfo.setState(TaskStateEnum.valueOf(getWorkingState().name()));
    if (this.contains(ConfigurationKeys.TASK_FAILURE_EXCEPTION_KEY)) {
        taskExecutionInfo.setFailureException(this.getProp(ConfigurationKeys.TASK_FAILURE_EXCEPTION_KEY));
    }
    taskExecutionInfo.setHighWatermark(this.getHighWaterMark());
    // Add extract/table information
    Table table = new Table();
    Extract extract = this.getExtract();
    table.setNamespace(extract.getNamespace());
    table.setName(extract.getTable());
    if (extract.hasType()) {
        table.setType(TableTypeEnum.valueOf(extract.getType().name()));
    }
    taskExecutionInfo.setTable(table);
    // Add task metrics
    TaskMetrics taskMetrics = TaskMetrics.get(this);
    MetricArray metricArray = new MetricArray();
    for (Map.Entry<String, ? extends com.codahale.metrics.Metric> entry : taskMetrics.getMetricContext().getCounters().entrySet()) {
        Metric counter = new Metric();
        counter.setGroup(MetricGroup.TASK.name());
        counter.setName(entry.getKey());
        counter.setType(MetricTypeEnum.valueOf(GobblinMetrics.MetricType.COUNTER.name()));
        counter.setValue(Long.toString(((Counter) entry.getValue()).getCount()));
        metricArray.add(counter);
    }
    for (Map.Entry<String, ? extends com.codahale.metrics.Metric> entry : taskMetrics.getMetricContext().getMeters().entrySet()) {
        Metric meter = new Metric();
        meter.setGroup(MetricGroup.TASK.name());
        meter.setName(entry.getKey());
        meter.setType(MetricTypeEnum.valueOf(GobblinMetrics.MetricType.METER.name()));
        meter.setValue(Double.toString(((Meter) entry.getValue()).getMeanRate()));
        metricArray.add(meter);
    }
    for (Map.Entry<String, ? extends com.codahale.metrics.Metric> entry : taskMetrics.getMetricContext().getGauges().entrySet()) {
        Metric gauge = new Metric();
        gauge.setGroup(MetricGroup.TASK.name());
        gauge.setName(entry.getKey());
        gauge.setType(MetricTypeEnum.valueOf(GobblinMetrics.MetricType.GAUGE.name()));
        gauge.setValue(((Gauge<?>) entry.getValue()).getValue().toString());
        metricArray.add(gauge);
    }
    taskExecutionInfo.setMetrics(metricArray);
    // Add task properties
    Map<String, String> taskProperties = Maps.newHashMap();
    for (String name : this.getPropertyNames()) {
        String value = this.getProp(name);
        if (!Strings.isNullOrEmpty(value))
            taskProperties.put(name, value);
    }
    taskExecutionInfo.setTaskProperties(new StringMap(taskProperties));
    return taskExecutionInfo;
}
Also used : StringMap(com.linkedin.data.template.StringMap) Table(org.apache.gobblin.rest.Table) Meter(com.codahale.metrics.Meter) Extract(org.apache.gobblin.source.workunit.Extract) Gauge(com.codahale.metrics.Gauge) Counter(com.codahale.metrics.Counter) TaskExecutionInfo(org.apache.gobblin.rest.TaskExecutionInfo) TaskMetrics(org.apache.gobblin.runtime.util.TaskMetrics) MetricArray(org.apache.gobblin.rest.MetricArray) Metric(org.apache.gobblin.rest.Metric) Map(java.util.Map) StringMap(com.linkedin.data.template.StringMap)
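
A hedged sketch of reading the conversion result back from a hypothetical caller; the getters are assumed to exist as counterparts of the setters used above, TaskExecutionInfo being a generated rest.li record.

// Hypothetical caller: `taskState` is a populated TaskState, `LOG` an slf4j logger.
TaskExecutionInfo info = taskState.toTaskExecutionInfo();
Table table = info.getTable();
LOG.info(String.format("Task %s wrote to %s.%s with %d metrics reported",
    info.getTaskId(), table.getNamespace(), table.getName(), info.getMetrics().size()));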

Aggregations

Extract (org.apache.gobblin.source.workunit.Extract): 29
WorkUnit (org.apache.gobblin.source.workunit.WorkUnit): 24
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 11
SourceState (org.apache.gobblin.configuration.SourceState): 8
Test (org.testng.annotations.Test): 7
Path (org.apache.hadoop.fs.Path): 6
MultiWorkUnit (org.apache.gobblin.source.workunit.MultiWorkUnit): 4
IOException (java.io.IOException): 3
ArrayList (java.util.ArrayList): 3
Configuration (org.apache.hadoop.conf.Configuration): 3
Gson (com.google.gson.Gson): 2
JsonObject (com.google.gson.JsonObject): 2
Config (com.typesafe.config.Config): 2
InputStreamReader (java.io.InputStreamReader): 2
Type (java.lang.reflect.Type): 2
Map (java.util.Map): 2
State (org.apache.gobblin.configuration.State): 2
WatermarkInterval (org.apache.gobblin.source.extractor.WatermarkInterval): 2
LongWatermark (org.apache.gobblin.source.extractor.extract.LongWatermark): 2
TableType (org.apache.gobblin.source.workunit.Extract.TableType): 2