Use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.
The class KafkaSimpleStreamingSource, method getWorkunits:
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  Config config = ConfigUtils.propertiesToConfig(state.getProperties());
  Consumer<String, byte[]> consumer = getKafkaConsumer(config);
  LOG.debug("Consumer is {}", consumer);
  // TODO: fix this to use the new API when KafkaWrapper is fixed
  String topic = ConfigUtils.getString(config, TOPIC_WHITELIST, StringUtils.EMPTY);
  List<WorkUnit> workUnits = new ArrayList<WorkUnit>();
  List<PartitionInfo> topicPartitions = consumer.partitionsFor(topic);
  LOG.info("Partition count is {}", topicPartitions.size());
  for (PartitionInfo topicPartition : topicPartitions) {
    Extract extract = this.createExtract(DEFAULT_TABLE_TYPE, DEFAULT_NAMESPACE_NAME, topicPartition.topic());
    LOG.info("Partition info is {}", topicPartition);
    WorkUnit workUnit = WorkUnit.create(extract);
    setTopicNameInState(workUnit, topicPartition.topic());
    workUnit.setProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, topicPartition.topic());
    setPartitionId(workUnit, topicPartition.partition());
    workUnits.add(workUnit);
  }
  return workUnits;
}
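For context, a minimal sketch of this one-WorkUnit-per-partition pattern, decoupled from the source class. The table type, namespace, and property keys below are illustrative stand-ins for DEFAULT_TABLE_TYPE, DEFAULT_NAMESPACE_NAME, and the source's own constants, not the actual values:

import java.util.ArrayList;
import java.util.List;
import org.apache.gobblin.source.workunit.Extract;
import org.apache.gobblin.source.workunit.WorkUnit;
import org.apache.kafka.clients.consumer.Consumer;
import org.apache.kafka.common.PartitionInfo;

public class PartitionWorkUnitSketch {
  // Hypothetical property keys; the real source uses its own constants.
  private static final String TOPIC_KEY = "kafka.topic.name";
  private static final String PARTITION_KEY = "kafka.partition.id";

  static List<WorkUnit> workUnitsForTopic(Consumer<String, byte[]> consumer, String topic) {
    List<WorkUnit> workUnits = new ArrayList<>();
    for (PartitionInfo partition : consumer.partitionsFor(topic)) {
      // One Extract per topic; SNAPSHOT_ONLY stands in for the source's default table type.
      Extract extract = new Extract(Extract.TableType.SNAPSHOT_ONLY, "kafka", partition.topic());
      WorkUnit workUnit = WorkUnit.create(extract);
      workUnit.setProp(TOPIC_KEY, partition.topic());
      workUnit.setProp(PARTITION_KEY, partition.partition());
      workUnits.add(workUnit);
    }
    return workUnits;
  }
}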
Use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.
The class PartitionedFileSourceBase, method getExtractForFile:
private Extract getExtractForFile(PartitionAwareFileRetriever.FileInfo file, String topicName,
    String namespace, Map<Long, Extract> extractMap) {
  Extract extract = extractMap.get(file.getWatermarkMsSinceEpoch());
  if (extract == null) {
    // Create an extract object for the dayPath
    extract = new Extract(this.tableType, namespace, topicName);
    LOG.info("Created extract: " + extract.getExtractId() + " for path " + topicName);
    extractMap.put(file.getWatermarkMsSinceEpoch(), extract);
  }
  return extract;
}
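Since the method only memoizes one Extract per watermark, the same logic can be expressed with Java 8's Map.computeIfAbsent; an equivalent sketch (dropping the log statement):

private Extract getExtractForFile(PartitionAwareFileRetriever.FileInfo file, String topicName,
    String namespace, Map<Long, Extract> extractMap) {
  // All files sharing a watermark reuse the same Extract instance.
  return extractMap.computeIfAbsent(file.getWatermarkMsSinceEpoch(),
      watermark -> new Extract(this.tableType, namespace, topicName));
}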
Use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.
The class CopySourceTest, method testCopySource:
@Test
public void testCopySource() throws Exception {
  SourceState state = new SourceState();
  state.setProp(ConfigurationKeys.SOURCE_FILEBASED_FS_URI, "file:///");
  state.setProp(ConfigurationKeys.WRITER_FILE_SYSTEM_URI, "file:///");
  state.setProp(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, "/target/dir");
  state.setProp(DatasetUtils.DATASET_PROFILE_CLASS_KEY, TestCopyableDatasetFinder.class.getName());
  CopySource source = new CopySource();
  List<WorkUnit> workunits = source.getWorkunits(state);
  workunits = JobLauncherUtils.flattenWorkUnits(workunits);
  Assert.assertEquals(workunits.size(), TestCopyableDataset.FILE_COUNT);
  Extract extract = workunits.get(0).getExtract();
  for (WorkUnit workUnit : workunits) {
    CopyableFile file = (CopyableFile) CopySource.deserializeCopyEntity(workUnit);
    Assert.assertTrue(file.getOrigin().getPath().toString().startsWith(TestCopyableDataset.ORIGIN_PREFIX));
    Assert.assertEquals(file.getDestinationOwnerAndPermission(), TestCopyableDataset.OWNER_AND_PERMISSION);
    Assert.assertEquals(workUnit.getExtract(), extract);
  }
}
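The final assertion pins down that CopySource groups every file of a dataset under one shared Extract. The same check can be written as a single stream expression; a drop-in sketch:

// Equivalent shared-Extract assertion for all flattened work units.
Extract first = workunits.get(0).getExtract();
Assert.assertTrue(workunits.stream()
    .map(WorkUnit::getExtract)
    .allMatch(first::equals));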
Use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.
The class KafkaSource, method getWorkUnitForTopicPartition:
private WorkUnit getWorkUnitForTopicPartition(KafkaPartition partition, Offsets offsets,
    Optional<State> topicSpecificState) {
  // Default to job-level configurations
  Extract.TableType currentTableType = tableType;
  String currentExtractNamespace = extractNamespace;
  String currentExtractTableName = partition.getTopicName();
  boolean isCurrentFullExtract = isFullExtract;
  // Apply topic-specific configurations, if any
  if (topicSpecificState.isPresent()) {
    State topicState = topicSpecificState.get();
    if (topicState.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)) {
      currentTableType = Extract.TableType.valueOf(topicState.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY));
    }
    currentExtractNamespace = topicState.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, extractNamespace);
    currentExtractTableName = topicState.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, partition.getTopicName());
    isCurrentFullExtract = topicState.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY, isFullExtract);
  }
  Extract extract = this.createExtract(currentTableType, currentExtractNamespace, currentExtractTableName);
  if (isCurrentFullExtract) {
    extract.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, true);
  }
  WorkUnit workUnit = WorkUnit.create(extract);
  workUnit.setProp(TOPIC_NAME, partition.getTopicName());
  addDatasetUrnOptionally(workUnit);
  workUnit.setProp(PARTITION_ID, partition.getId());
  workUnit.setProp(LEADER_ID, partition.getLeader().getId());
  workUnit.setProp(LEADER_HOSTANDPORT, partition.getLeader().getHostAndPort().toString());
  workUnit.setProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY, offsets.getStartOffset());
  workUnit.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, offsets.getLatestOffset());
  workUnit.setProp(PREVIOUS_OFFSET_FETCH_EPOCH_TIME, offsets.getPreviousOffsetFetchEpochTime());
  workUnit.setProp(OFFSET_FETCH_EPOCH_TIME, offsets.getOffsetFetchEpochTime());
  workUnit.setProp(PREVIOUS_LATEST_OFFSET, offsets.getPreviousLatestOffset());
  // Add lineage info
  DatasetDescriptor source = new DatasetDescriptor(DatasetConstants.PLATFORM_KAFKA, partition.getTopicName());
  source.addMetadata(DatasetConstants.BROKERS, kafkaBrokers);
  if (this.lineageInfo.isPresent()) {
    this.lineageInfo.get().setSource(source, workUnit);
  }
  LOG.info(String.format("Created workunit for partition %s: lowWatermark=%d, highWatermark=%d, range=%d",
      partition, offsets.getStartOffset(), offsets.getLatestOffset(),
      offsets.getLatestOffset() - offsets.getStartOffset()));
  return workUnit;
}
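The job-level-default-with-topic-override pattern at the top of the method generalizes; a hedged helper sketch (not part of KafkaSource) using the same Guava Optional:

import com.google.common.base.Optional;
import org.apache.gobblin.configuration.State;

// Illustrative helper: prefer the topic-level value when present,
// otherwise fall back to the job-level default.
static String resolveProp(Optional<State> topicState, String key, String jobLevelDefault) {
  return topicState.isPresent()
      ? topicState.get().getProp(key, jobLevelDefault)
      : jobLevelDefault;
}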
Use of org.apache.gobblin.source.workunit.Extract in project incubator-gobblin by apache.
The class TaskState, method toTaskExecutionInfo:
/**
* Convert this {@link TaskState} instance to a {@link TaskExecutionInfo} instance.
*
* @return a {@link TaskExecutionInfo} instance
*/
public TaskExecutionInfo toTaskExecutionInfo() {
  TaskExecutionInfo taskExecutionInfo = new TaskExecutionInfo();
  taskExecutionInfo.setJobId(this.jobId);
  taskExecutionInfo.setTaskId(this.taskId);
  if (this.startTime > 0) {
    taskExecutionInfo.setStartTime(this.startTime);
  }
  if (this.endTime > 0) {
    taskExecutionInfo.setEndTime(this.endTime);
  }
  taskExecutionInfo.setDuration(this.duration);
  taskExecutionInfo.setState(TaskStateEnum.valueOf(getWorkingState().name()));
  if (this.contains(ConfigurationKeys.TASK_FAILURE_EXCEPTION_KEY)) {
    taskExecutionInfo.setFailureException(this.getProp(ConfigurationKeys.TASK_FAILURE_EXCEPTION_KEY));
  }
  taskExecutionInfo.setHighWatermark(this.getHighWaterMark());
  // Add extract/table information
  Table table = new Table();
  Extract extract = this.getExtract();
  table.setNamespace(extract.getNamespace());
  table.setName(extract.getTable());
  if (extract.hasType()) {
    table.setType(TableTypeEnum.valueOf(extract.getType().name()));
  }
  taskExecutionInfo.setTable(table);
  // Add task metrics
  TaskMetrics taskMetrics = TaskMetrics.get(this);
  MetricArray metricArray = new MetricArray();
  for (Map.Entry<String, ? extends com.codahale.metrics.Metric> entry : taskMetrics.getMetricContext().getCounters().entrySet()) {
    Metric counter = new Metric();
    counter.setGroup(MetricGroup.TASK.name());
    counter.setName(entry.getKey());
    counter.setType(MetricTypeEnum.valueOf(GobblinMetrics.MetricType.COUNTER.name()));
    counter.setValue(Long.toString(((Counter) entry.getValue()).getCount()));
    metricArray.add(counter);
  }
  for (Map.Entry<String, ? extends com.codahale.metrics.Metric> entry : taskMetrics.getMetricContext().getMeters().entrySet()) {
    Metric meter = new Metric();
    meter.setGroup(MetricGroup.TASK.name());
    meter.setName(entry.getKey());
    meter.setType(MetricTypeEnum.valueOf(GobblinMetrics.MetricType.METER.name()));
    meter.setValue(Double.toString(((Meter) entry.getValue()).getMeanRate()));
    metricArray.add(meter);
  }
  for (Map.Entry<String, ? extends com.codahale.metrics.Metric> entry : taskMetrics.getMetricContext().getGauges().entrySet()) {
    Metric gauge = new Metric();
    gauge.setGroup(MetricGroup.TASK.name());
    gauge.setName(entry.getKey());
    gauge.setType(MetricTypeEnum.valueOf(GobblinMetrics.MetricType.GAUGE.name()));
    gauge.setValue(((Gauge<?>) entry.getValue()).getValue().toString());
    metricArray.add(gauge);
  }
  taskExecutionInfo.setMetrics(metricArray);
  // Add task properties
  Map<String, String> taskProperties = Maps.newHashMap();
  for (String name : this.getPropertyNames()) {
    String value = this.getProp(name);
    if (!Strings.isNullOrEmpty(value)) {
      taskProperties.put(name, value);
    }
  }
  taskExecutionInfo.setTaskProperties(new StringMap(taskProperties));
  return taskExecutionInfo;
}
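The three metric loops differ only in the type tag and how the value is rendered as a string; a possible refactoring sketch (the toMetric helper is illustrative, not part of TaskState):

// Illustrative helper collapsing the shared Metric-building boilerplate.
private Metric toMetric(String name, GobblinMetrics.MetricType type, String value) {
  Metric metric = new Metric();
  metric.setGroup(MetricGroup.TASK.name());
  metric.setName(name);
  metric.setType(MetricTypeEnum.valueOf(type.name()));
  metric.setValue(value);
  return metric;
}

Each loop body then reduces to a single call, e.g. metricArray.add(toMetric(entry.getKey(), GobblinMetrics.MetricType.COUNTER, Long.toString(((Counter) entry.getValue()).getCount()))).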