Use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.
From class BaseDataPublisher, method addLineageInfo:
private void addLineageInfo(WorkUnitState state, int branchId) {
  if (!this.lineageInfo.isPresent()) {
    LOG.info("Will not add lineage info");
    return;
  }

  // Final dataset descriptor
  DatasetDescriptor datasetDescriptor = createDestinationDescriptor(state, branchId);

  List<PartitionDescriptor> partitions = PartitionedDataWriter.getPartitionInfoAndClean(state, branchId);
  List<Descriptor> descriptors = new ArrayList<>();
  if (partitions.size() == 0) {
    // Report as dataset level lineage
    descriptors.add(datasetDescriptor);
  } else {
    // Report as partition level lineage
    for (PartitionDescriptor partition : partitions) {
      descriptors.add(partition.copyWithNewDataset(datasetDescriptor));
    }
  }

  this.lineageInfo.get().putDestination(descriptors, branchId, state);
}
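As a quick illustration of the copyWithNewDataset call above, the standalone sketch below (with illustrative partition and dataset names) shows a writer-side PartitionDescriptor being re-anchored to the publisher's destination dataset; only the dataset changes, the partition name is kept.

import org.apache.gobblin.dataset.DatasetDescriptor;
import org.apache.gobblin.dataset.Descriptor;
import org.apache.gobblin.dataset.PartitionDescriptor;

public class CopyWithNewDatasetSketch {
  public static void main(String[] args) {
    // Writer-side descriptor: the attached dataset is a placeholder that the
    // publisher will replace with the real destination.
    DatasetDescriptor writerDataset = new DatasetDescriptor("dummy", "dummy");
    PartitionDescriptor writerPartition = new PartitionDescriptor("hourly/2018/08/15/15", writerDataset);

    // Destination descriptor built by the publisher (values are illustrative).
    DatasetDescriptor destination = new DatasetDescriptor("hdfs", "/data/output/namespace/table");

    // Same partition name, now anchored to the destination dataset.
    Descriptor reported = writerPartition.copyWithNewDataset(destination);
    System.out.println(reported);
  }
}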
Use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.
From class BaseDataPublisherTest, method testPublishedPartitionsLineage:
/**
 * Test that partition-level lineage is set
 */
@Test
public void testPublishedPartitionsLineage() throws IOException {
  int numBranches = 2;
  int numPartitionsPerBranch = 2;

  WorkUnitState state = buildTaskState(numBranches);
  LineageInfo lineageInfo = LineageInfo.getLineageInfo(state.getTaskBroker()).get();
  DatasetDescriptor source = new DatasetDescriptor("kafka", "testTopic");
  lineageInfo.setSource(source, state);
  BaseDataPublisher publisher = new BaseDataPublisher(state);

  // Set up writer partition descriptors
  DatasetDescriptor datasetAtWriter = new DatasetDescriptor("dummy", "dummy");
  for (int i = 0; i < numBranches; i++) {
    List<PartitionDescriptor> partitions = new ArrayList<>();
    for (int j = 0; j < numPartitionsPerBranch; j++) {
      // Dummy dataset descriptor will be discarded by the publisher
      partitions.add(new PartitionDescriptor("partition" + i + j, datasetAtWriter));
    }
    String partitionsKey = "writer." + i + ".partitions";
    state.setProp(partitionsKey, GSON.toJson(partitions, PARTITION_LIST_TYPE));
  }

  publisher.publish(ImmutableList.of(state));

  Assert.assertTrue(state.contains("gobblin.event.lineage.branch.0.destination"));
  Assert.assertTrue(state.contains("gobblin.event.lineage.branch.1.destination"));

  Collection<LineageEventBuilder> events = LineageInfo.load(ImmutableList.of(state));
  Assert.assertTrue(events.size() == 4);

  // Find the partition lineage and assert
  for (int i = 0; i < numBranches; i++) {
    String outputPath = String.format("/data/output/branch%d/namespace/table", i);
    DatasetDescriptor destinationDataset = new DatasetDescriptor("file", URI.create("file:///"), outputPath);
    destinationDataset.addMetadata("fsUri", "file:///");
    destinationDataset.addMetadata("branch", "" + i);
    for (int j = 0; j < numPartitionsPerBranch; j++) {
      LineageEventBuilder event = find(events, "partition" + i + j);
      Assert.assertTrue(null != event);
      Assert.assertEquals(event.getSource(), source);
      // Dataset written by the writer is discarded
      Assert.assertEquals(event.getDestination(),
          new PartitionDescriptor("partition" + i + j, destinationDataset));
    }
  }
}
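The writer.<branch>.partitions properties consumed above are JSON lists of PartitionDescriptor. A rough standalone sketch of producing such a value is below; it assumes GSON is a plain Gson instance and PARTITION_LIST_TYPE is a TypeToken for List<PartitionDescriptor>, which is how the test's constants appear to be used, not necessarily how they are defined.

import java.lang.reflect.Type;
import java.util.ArrayList;
import java.util.List;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;

import org.apache.gobblin.dataset.DatasetDescriptor;
import org.apache.gobblin.dataset.PartitionDescriptor;

public class WriterPartitionsJsonSketch {
  // Assumed stand-ins for the GSON and PARTITION_LIST_TYPE constants used by the test.
  private static final Gson GSON = new Gson();
  private static final Type PARTITION_LIST_TYPE = new TypeToken<List<PartitionDescriptor>>() {}.getType();

  public static void main(String[] args) {
    DatasetDescriptor writerDataset = new DatasetDescriptor("dummy", "dummy");
    List<PartitionDescriptor> partitions = new ArrayList<>();
    partitions.add(new PartitionDescriptor("partition00", writerDataset));
    partitions.add(new PartitionDescriptor("partition01", writerDataset));

    // The test stores this JSON under "writer.<branchId>.partitions"; branch 0 shown here.
    System.out.println("writer.0.partitions = " + GSON.toJson(partitions, PARTITION_LIST_TYPE));
  }
}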
Use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.
From class HivePartitionFileSet, method generateCopyEntities:
@Override
protected Collection<CopyEntity> generateCopyEntities() throws IOException {
  try (Closer closer = Closer.create()) {
    MultiTimingEvent multiTimer = closer.register(new MultiTimingEvent(this.eventSubmitter, "PartitionCopy", true));
    int stepPriority = 0;
    String fileSet = HiveCopyEntityHelper.gson.toJson(this.partition.getValues());
    List<CopyEntity> copyEntities = Lists.newArrayList();
    stepPriority = hiveCopyEntityHelper.addSharedSteps(copyEntities, fileSet, stepPriority);

    multiTimer.nextStage(HiveCopyEntityHelper.Stages.COMPUTE_TARGETS);
    Path targetPath = hiveCopyEntityHelper.getTargetLocation(hiveCopyEntityHelper.getTargetFs(),
        this.partition.getDataLocation(), Optional.of(this.partition));
    Partition targetPartition = getTargetPartition(this.partition, targetPath);

    multiTimer.nextStage(HiveCopyEntityHelper.Stages.EXISTING_PARTITION);
    if (this.existingTargetPartition.isPresent()) {
      hiveCopyEntityHelper.getTargetPartitions().remove(this.partition.getValues());
      try {
        checkPartitionCompatibility(targetPartition, this.existingTargetPartition.get());
      } catch (IOException ioe) {
        if (hiveCopyEntityHelper.getExistingEntityPolicy() != HiveCopyEntityHelper.ExistingEntityPolicy.REPLACE_PARTITIONS
            && hiveCopyEntityHelper.getExistingEntityPolicy() != HiveCopyEntityHelper.ExistingEntityPolicy.REPLACE_TABLE_AND_PARTITIONS) {
          log.error("Source and target partitions are not compatible. Aborting copy of partition " + this.partition, ioe);
          // Silence the error and continue processing work units if partial success is allowed
          if (ConfigUtils.getString(hiveCopyEntityHelper.getConfiguration().getConfig(),
              ConfigurationKeys.JOB_COMMIT_POLICY_KEY, JobCommitPolicy.COMMIT_ON_FULL_SUCCESS.toString())
              .equals(JobCommitPolicy.COMMIT_SUCCESSFUL_TASKS.toString())) {
            return Lists.newArrayList();
          } else {
            throw ioe;
          }
        }
        log.warn("Source and target partitions are not compatible. Will override target partition: " + ioe.getMessage());
        log.debug("Incompatibility details: ", ioe);
        stepPriority = hiveCopyEntityHelper.addPartitionDeregisterSteps(copyEntities, fileSet, stepPriority,
            hiveCopyEntityHelper.getTargetTable(), this.existingTargetPartition.get());
        this.existingTargetPartition = Optional.absent();
      }
    }

    multiTimer.nextStage(HiveCopyEntityHelper.Stages.PARTITION_SKIP_PREDICATE);
    if (hiveCopyEntityHelper.getFastPartitionSkip().isPresent()
        && hiveCopyEntityHelper.getFastPartitionSkip().get().apply(this)) {
      log.info(String.format("Skipping copy of partition %s due to fast partition skip predicate.",
          this.partition.getCompleteName()));
      return Lists.newArrayList();
    }

    // Register the target partition in the Hive metastore after publish
    HiveSpec partitionHiveSpec = new SimpleHiveSpec.Builder<>(targetPath)
        .withTable(HiveMetaStoreUtils.getHiveTable(hiveCopyEntityHelper.getTargetTable().getTTable()))
        .withPartition(Optional.of(HiveMetaStoreUtils.getHivePartition(targetPartition.getTPartition())))
        .build();
    HiveRegisterStep register = new HiveRegisterStep(hiveCopyEntityHelper.getTargetMetastoreURI(), partitionHiveSpec,
        hiveCopyEntityHelper.getHiveRegProps());
    copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), register, stepPriority++));

    multiTimer.nextStage(HiveCopyEntityHelper.Stages.CREATE_LOCATIONS);
    HiveLocationDescriptor sourceLocation =
        HiveLocationDescriptor.forPartition(this.partition, hiveCopyEntityHelper.getDataset().fs, this.properties);
    HiveLocationDescriptor desiredTargetLocation =
        HiveLocationDescriptor.forPartition(targetPartition, hiveCopyEntityHelper.getTargetFs(), this.properties);
    Optional<HiveLocationDescriptor> existingTargetLocation = this.existingTargetPartition.isPresent()
        ? Optional.of(HiveLocationDescriptor.forPartition(this.existingTargetPartition.get(),
            hiveCopyEntityHelper.getTargetFs(), this.properties))
        : Optional.<HiveLocationDescriptor>absent();

    multiTimer.nextStage(HiveCopyEntityHelper.Stages.FULL_PATH_DIFF);
    HiveCopyEntityHelper.DiffPathSet diffPathSet = HiveCopyEntityHelper.fullPathDiff(sourceLocation,
        desiredTargetLocation, existingTargetLocation, Optional.<Partition>absent(), multiTimer, hiveCopyEntityHelper);

    multiTimer.nextStage(HiveCopyEntityHelper.Stages.CREATE_DELETE_UNITS);
    if (diffPathSet.pathsToDelete.size() > 0) {
      DeleteFileCommitStep deleteStep = DeleteFileCommitStep.fromPaths(hiveCopyEntityHelper.getTargetFs(),
          diffPathSet.pathsToDelete, hiveCopyEntityHelper.getDataset().properties);
      copyEntities.add(new PrePublishStep(fileSet, Maps.<String, String>newHashMap(), deleteStep, stepPriority++));
    }

    multiTimer.nextStage(HiveCopyEntityHelper.Stages.CREATE_COPY_UNITS);
    for (CopyableFile.Builder builder : hiveCopyEntityHelper.getCopyableFilesFromPaths(diffPathSet.filesToCopy,
        hiveCopyEntityHelper.getConfiguration(), Optional.of(this.partition))) {
      CopyableFile fileEntity = builder.fileSet(fileSet).checksum(new byte[0])
          .datasetOutputPath(desiredTargetLocation.location.toString()).build();

      // Attach partition-level lineage: source and destination are recorded as PartitionDescriptors
      DatasetDescriptor sourceDataset = this.hiveCopyEntityHelper.getSourceDataset();
      PartitionDescriptor source = new PartitionDescriptor(partition.getName(), sourceDataset);
      fileEntity.setSourceData(source);

      DatasetDescriptor destinationDataset = this.hiveCopyEntityHelper.getDestinationDataset();
      Partition destinationPartition =
          this.existingTargetPartition.isPresent() ? this.existingTargetPartition.get() : partition;
      PartitionDescriptor destination = new PartitionDescriptor(destinationPartition.getName(), destinationDataset);
      fileEntity.setDestinationData(destination);

      copyEntities.add(fileEntity);
    }

    log.info("Created {} copy entities for partition {}", copyEntities.size(), this.partition.getCompleteName());
    return copyEntities;
  }
}
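The lineage attachment at the end of the copy-unit loop can be summarized in isolation. The sketch below is a simplified stand-in, not the helper's real logic: the dataset descriptors are hard-coded where HiveCopyEntityHelper would derive them from the source and target tables.

import org.apache.gobblin.data.management.copy.CopyableFile;
import org.apache.gobblin.dataset.DatasetDescriptor;
import org.apache.gobblin.dataset.PartitionDescriptor;

public class HivePartitionLineageSketch {

  /**
   * Record both ends of the copy as partition-level lineage on the copy entity.
   * The "hive"/"db.table" values are illustrative placeholders.
   */
  static void attachPartitionLineage(CopyableFile fileEntity, String partitionName) {
    DatasetDescriptor sourceDataset = new DatasetDescriptor("hive", "db.table");
    fileEntity.setSourceData(new PartitionDescriptor(partitionName, sourceDataset));

    DatasetDescriptor destinationDataset = new DatasetDescriptor("hive", "db.table");
    fileEntity.setDestinationData(new PartitionDescriptor(partitionName, destinationDataset));
  }
}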
Use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.
From class CopyableFileTest, method testSerializeDeserialze:
@Test
public void testSerializeDeserialze() throws Exception {
  CopyableFile copyableFile = new CopyableFile(
      new FileStatus(10, false, 12, 100, 12345, new Path("/path")),
      new Path("/destination"),
      new OwnerAndPermission("owner", "group", FsPermission.getDefault()),
      Lists.newArrayList(new OwnerAndPermission("owner2", "group2", FsPermission.getDefault())),
      "checksum".getBytes(),
      PreserveAttributes.fromMnemonicString(""),
      "", 0, 0, Maps.<String, String>newHashMap(), "", null);

  DatasetDescriptor dataset = new DatasetDescriptor("hive", "db.table");
  PartitionDescriptor descriptor = new PartitionDescriptor("datepartition=2018/09/05", dataset);
  copyableFile.setDestinationData(descriptor);

  String s = CopyEntity.serialize(copyableFile);
  CopyEntity de = CopyEntity.deserialize(s);
  Assert.assertEquals(de, copyableFile);
}
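Building on the test above, a hedged sketch that also attaches a source descriptor, as HivePartitionFileSet does, before the round trip; the constructor arguments mirror the test and all values are placeholders.

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;

import org.apache.gobblin.data.management.copy.CopyEntity;
import org.apache.gobblin.data.management.copy.CopyableFile;
import org.apache.gobblin.data.management.copy.OwnerAndPermission;
import org.apache.gobblin.data.management.copy.PreserveAttributes;
import org.apache.gobblin.dataset.DatasetDescriptor;
import org.apache.gobblin.dataset.PartitionDescriptor;

public class CopyableFileRoundTripSketch {
  public static void main(String[] args) throws Exception {
    // Same constructor shape as the test above; all values are placeholders.
    CopyableFile copyableFile = new CopyableFile(
        new FileStatus(10, false, 12, 100, 12345, new Path("/path")),
        new Path("/destination"),
        new OwnerAndPermission("owner", "group", FsPermission.getDefault()),
        Lists.newArrayList(new OwnerAndPermission("owner2", "group2", FsPermission.getDefault())),
        "checksum".getBytes(),
        PreserveAttributes.fromMnemonicString(""),
        "", 0, 0, Maps.<String, String>newHashMap(), "", null);

    DatasetDescriptor dataset = new DatasetDescriptor("hive", "db.table");
    copyableFile.setSourceData(new PartitionDescriptor("datepartition=2018/09/05", dataset));
    copyableFile.setDestinationData(new PartitionDescriptor("datepartition=2018/09/05", dataset));

    // Both descriptors should survive the serialize/deserialize round trip.
    CopyEntity restored = CopyEntity.deserialize(CopyEntity.serialize(copyableFile));
    System.out.println(restored.equals(copyableFile));
  }
}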
Use of org.apache.gobblin.dataset.PartitionDescriptor in project incubator-gobblin by apache.
From class LineageEventTest, method testMultiPuts:
@Test
public void testMultiPuts() {
  final String topic = "testTopic";
  final String kafka = "kafka";
  final String hdfs = "hdfs";
  final String path = "/data/tracking/PageViewEvent";
  final String partitionName = "hourly/2018/08/15/15";

  State state = new State();
  LineageInfo lineageInfo = getLineageInfo();
  DatasetDescriptor source = new DatasetDescriptor(kafka, topic);
  lineageInfo.setSource(source, state);

  DatasetDescriptor destinationDataset = new DatasetDescriptor(hdfs, path);
  PartitionDescriptor destination = new PartitionDescriptor(partitionName, destinationDataset);
  lineageInfo.putDestination(Lists.newArrayList(destination), 0, state);

  // Put another destination
  DatasetDescriptor destinationDataset2 = new DatasetDescriptor(kafka, "nextTopic");
  lineageInfo.putDestination(Lists.newArrayList(destinationDataset2), 0, state);

  Map<String, Set<LineageEventBuilder>> eventsMap = LineageInfo.load(state);
  Assert.assertEquals(eventsMap.size(), 1);

  Set<LineageEventBuilder> events = eventsMap.get("0");
  Assert.assertEquals(events.size(), 2);
  verifyOne(events, topic, source, destination);
  verifyOne(events, topic, source, destinationDataset2);
}
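Since DatasetDescriptor and PartitionDescriptor are both Descriptor subtypes, a single putDestination call can mix dataset-level and partition-level destinations for one branch. A sketch under that assumption, reusing a LineageInfo and State obtained as in the test above:

import java.util.List;

import com.google.common.collect.Lists;

import org.apache.gobblin.configuration.State;
import org.apache.gobblin.dataset.DatasetDescriptor;
import org.apache.gobblin.dataset.Descriptor;
import org.apache.gobblin.dataset.PartitionDescriptor;
import org.apache.gobblin.metrics.event.lineage.LineageInfo;

public class MixedDestinationsSketch {

  /** Report a partition-level and a dataset-level destination in one call on branch 0. */
  static void reportMixedDestinations(LineageInfo lineageInfo, State state) {
    DatasetDescriptor hdfsDataset = new DatasetDescriptor("hdfs", "/data/tracking/PageViewEvent");
    PartitionDescriptor hourlyPartition = new PartitionDescriptor("hourly/2018/08/15/15", hdfsDataset);
    DatasetDescriptor kafkaTopic = new DatasetDescriptor("kafka", "nextTopic");

    List<Descriptor> destinations = Lists.newArrayList(hourlyPartition, kafkaTopic);
    lineageInfo.putDestination(destinations, 0, state);
  }
}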