Use of org.apache.gobblin.dataset.DatasetDescriptor in the project incubator-gobblin by Apache.
From the class BaseDataPublisherTest, method testPublishSingleTask.
@Test
public void testPublishSingleTask() throws IOException {
  // Build a single-branch task state and register a Kafka topic as its lineage source.
  WorkUnitState taskState = buildTaskState(1);
  DatasetDescriptor kafkaSource = new DatasetDescriptor("kafka", "testTopic");
  LineageInfo.getLineageInfo(taskState.getTaskBroker()).get().setSource(kafkaSource, taskState);

  // Publishing must record a lineage destination for branch 0 and nothing for branch 1.
  new BaseDataPublisher(taskState).publishData(taskState);
  Assert.assertTrue(taskState.contains("gobblin.event.lineage.branch.0.destination"));
  Assert.assertFalse(taskState.contains("gobblin.event.lineage.branch.1.destination"));
}
Use of org.apache.gobblin.dataset.DatasetDescriptor in the project incubator-gobblin by Apache.
From the class CopyableFile, method setFsDatasets.
/**
 * Populates the file-system based source and destination datasets for this {@link CopyableFile}.
 *
 * @param originFs {@link FileSystem} this {@link CopyableFile} originates from
 * @param targetFs {@link FileSystem} this {@link CopyableFile} is copied to
 */
public void setFsDatasets(FileSystem originFs, FileSystem targetFs) {
  // By default the raw Gobblin dataset used for lineage is the entity's parent folder,
  // unless the copied entity is itself a directory, in which case the path is used as-is.
  boolean originIsDirectory = origin.isDirectory();

  Path sourcePath = Path.getPathWithoutSchemeAndAuthority(origin.getPath());
  String sourceName = originIsDirectory ? sourcePath.toString() : sourcePath.getParent().toString();
  sourceDataset = new DatasetDescriptor(originFs.getScheme(), sourceName);
  sourceDataset.addMetadata(DatasetConstants.FS_URI, originFs.getUri().toString());

  Path destinationPath = Path.getPathWithoutSchemeAndAuthority(destination);
  String destinationName =
      originIsDirectory ? destinationPath.toString() : destinationPath.getParent().toString();
  destinationDataset = new DatasetDescriptor(targetFs.getScheme(), destinationName);
  destinationDataset.addMetadata(DatasetConstants.FS_URI, targetFs.getUri().toString());
}
Use of org.apache.gobblin.dataset.DatasetDescriptor in the project incubator-gobblin by Apache.
From the class KafkaSource, method getWorkUnitForTopicPartition.
/**
 * Creates a {@link WorkUnit} for extracting a single Kafka topic partition over the given
 * offset range, annotated with partition/leader metadata and a lineage source descriptor.
 *
 * @param partition the Kafka partition this work unit will consume
 * @param offsets start/latest offsets (and their fetch epoch times) for the partition
 * @param topicSpecificState optional per-topic overrides for extract table type, namespace,
 *        table name and full-extract flag; job-level values are used when absent
 * @return the configured work unit
 */
private WorkUnit getWorkUnitForTopicPartition(KafkaPartition partition, Offsets offsets, Optional<State> topicSpecificState) {
// Default to job level configurations
Extract.TableType currentTableType = tableType;
String currentExtractNamespace = extractNamespace;
String currentExtractTableName = partition.getTopicName();
boolean isCurrentFullExtract = isFullExtract;
// Update to topic specific configurations if any
if (topicSpecificState.isPresent()) {
State topicState = topicSpecificState.get();
if (topicState.contains(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY)) {
currentTableType = Extract.TableType.valueOf(topicState.getProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY));
}
currentExtractNamespace = topicState.getProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, extractNamespace);
currentExtractTableName = topicState.getProp(ConfigurationKeys.EXTRACT_TABLE_NAME_KEY, partition.getTopicName());
isCurrentFullExtract = topicState.getPropAsBoolean(ConfigurationKeys.EXTRACT_IS_FULL_KEY, isFullExtract);
}
Extract extract = this.createExtract(currentTableType, currentExtractNamespace, currentExtractTableName);
if (isCurrentFullExtract) {
extract.setProp(ConfigurationKeys.EXTRACT_IS_FULL_KEY, true);
}
WorkUnit workUnit = WorkUnit.create(extract);
// Identify the topic/partition and its current leader broker on the work unit.
workUnit.setProp(TOPIC_NAME, partition.getTopicName());
addDatasetUrnOptionally(workUnit);
workUnit.setProp(PARTITION_ID, partition.getId());
workUnit.setProp(LEADER_ID, partition.getLeader().getId());
workUnit.setProp(LEADER_HOSTANDPORT, partition.getLeader().getHostAndPort().toString());
// Watermarks bound the offset range to pull: [startOffset, latestOffset).
workUnit.setProp(ConfigurationKeys.WORK_UNIT_LOW_WATER_MARK_KEY, offsets.getStartOffset());
workUnit.setProp(ConfigurationKeys.WORK_UNIT_HIGH_WATER_MARK_KEY, offsets.getLatestOffset());
// Record when offsets were fetched (current and previous run) for lag/debug reporting.
workUnit.setProp(PREVIOUS_OFFSET_FETCH_EPOCH_TIME, offsets.getPreviousOffsetFetchEpochTime());
workUnit.setProp(OFFSET_FETCH_EPOCH_TIME, offsets.getOffsetFetchEpochTime());
workUnit.setProp(PREVIOUS_LATEST_OFFSET, offsets.getPreviousLatestOffset());
// Add lineage info: the source dataset is the Kafka topic, tagged with the broker list.
DatasetDescriptor source = new DatasetDescriptor(DatasetConstants.PLATFORM_KAFKA, partition.getTopicName());
source.addMetadata(DatasetConstants.BROKERS, kafkaBrokers);
if (this.lineageInfo.isPresent()) {
this.lineageInfo.get().setSource(source, workUnit);
}
LOG.info(String.format("Created workunit for partition %s: lowWatermark=%d, highWatermark=%d, range=%d", partition, offsets.getStartOffset(), offsets.getLatestOffset(), offsets.getLatestOffset() - offsets.getStartOffset()));
return workUnit;
}
Use of org.apache.gobblin.dataset.DatasetDescriptor in the project incubator-gobblin by Apache.
From the class MysqlSource, method addLineageSourceInfo.
/**
 * Attaches MySQL lineage source information to the work unit: the source dataset is
 * {@code database.table} on the MySQL platform, annotated with its JDBC connection URL.
 *
 * @param sourceState source state supplying the connection host, port and schema properties
 * @param entity the source entity (table) being extracted
 * @param workUnit work unit the lineage source descriptor is attached to
 */
protected void addLineageSourceInfo(SourceState sourceState, SourceEntity entity, WorkUnit workUnit) {
  String host = sourceState.getProp(ConfigurationKeys.SOURCE_CONN_HOST_NAME);
  String port = sourceState.getProp(ConfigurationKeys.SOURCE_CONN_PORT);
  String database = sourceState.getProp(ConfigurationKeys.SOURCE_QUERYBASED_SCHEMA);
  // Trim all user-supplied URL components. Previously only host and database were trimmed,
  // so stray whitespace in the port property produced a malformed JDBC URL.
  String connectionUrl = "jdbc:mysql://" + host.trim() + ":" + port.trim() + "/" + database.trim();
  // NOTE(review): the dataset name deliberately keeps the raw (untrimmed) database value to
  // preserve existing lineage identifiers — confirm whether it should be trimmed as well.
  DatasetDescriptor source =
      new DatasetDescriptor(DatasetConstants.PLATFORM_MYSQL, database + "." + entity.getSourceEntityName());
  source.addMetadata(DatasetConstants.CONNECTION_URL, connectionUrl);
  if (lineageInfo.isPresent()) {
    lineageInfo.get().setSource(source, workUnit);
  }
}
Use of org.apache.gobblin.dataset.DatasetDescriptor in the project incubator-gobblin by Apache.
From the class SalesforceSourceTest, method testSourceLineageInfo.
@Test
void testSourceLineageInfo() {
  // Configure a snapshot extract of the "contacts" entity with user-specified partitions.
  SourceState state = new SourceState();
  state.setProp(ConfigurationKeys.EXTRACT_NAMESPACE_NAME_KEY, "salesforce");
  state.setProp(ConfigurationKeys.EXTRACT_TABLE_TYPE_KEY, "snapshot_append");
  state.setProp(ConfigurationKeys.SOURCE_QUERYBASED_EXTRACT_TYPE, "SNAPSHOT");
  state.setProp(Partitioner.HAS_USER_SPECIFIED_PARTITIONS, true);
  state.setProp(Partitioner.USER_SPECIFIED_PARTITIONS, "20140213000000,20170407152123");

  SalesforceSource salesforceSource = new SalesforceSource(new LineageInfo(ConfigFactory.empty()));
  QueryBasedSource.SourceEntity entity = QueryBasedSource.SourceEntity.fromSourceEntityName("contacts");
  List<WorkUnit> workUnits = salesforceSource.generateWorkUnits(entity, state, 20140213000000L);
  Assert.assertEquals(workUnits.size(), 1);

  // The lineage source descriptor and dataset name must be recorded on the work unit.
  String expectedSourceJson = new Gson().toJson(new DatasetDescriptor("salesforce", "contacts"));
  Assert.assertEquals(expectedSourceJson, workUnits.get(0).getProp("gobblin.event.lineage.source"));
  Assert.assertEquals(workUnits.get(0).getProp("gobblin.event.lineage.name"), "contacts");
}
Aggregations