
Example 1 with PartitionDescriptor

Use of org.apache.gobblin.dataset.PartitionDescriptor in the apache/incubator-gobblin project.

From class BaseDataPublisher, method addLineageInfo.

private void addLineageInfo(WorkUnitState state, int branchId) {
    if (!this.lineageInfo.isPresent()) {
        LOG.info("Will not add lineage info");
        return;
    }
    // Final dataset descriptor
    DatasetDescriptor datasetDescriptor = createDestinationDescriptor(state, branchId);
    List<PartitionDescriptor> partitions = PartitionedDataWriter.getPartitionInfoAndClean(state, branchId);
    List<Descriptor> descriptors = new ArrayList<>();
    if (partitions.isEmpty()) {
        // Report as dataset level lineage
        descriptors.add(datasetDescriptor);
    } else {
        // Report as partition level lineage
        for (PartitionDescriptor partition : partitions) {
            descriptors.add(partition.copyWithNewDataset(datasetDescriptor));
        }
    }
    this.lineageInfo.get().putDestination(descriptors, branchId, state);
}
Also used: DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) ArrayList(java.util.ArrayList) PartitionDescriptor(org.apache.gobblin.dataset.PartitionDescriptor) Descriptor(org.apache.gobblin.dataset.Descriptor)
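
A minimal sketch of the copyWithNewDataset step used above, assuming only the constructors and the copy method shown in this example; the partition name and destination dataset are made-up values. The writer records each partition against a placeholder dataset, and at publish time the final destination descriptor is swapped in while the partition name is kept.

import org.apache.gobblin.dataset.DatasetDescriptor;
import org.apache.gobblin.dataset.Descriptor;
import org.apache.gobblin.dataset.PartitionDescriptor;

public class CopyWithNewDatasetSketch {
    public static void main(String[] args) {
        // Partition recorded at write time against a placeholder dataset
        // (the publisher discards this dataset, as the test in Example 2 confirms)
        DatasetDescriptor writerDataset = new DatasetDescriptor("dummy", "dummy");
        PartitionDescriptor recorded = new PartitionDescriptor("hourly/2018/08/15/15", writerDataset);

        // At publish time the real destination replaces the placeholder;
        // only the dataset part changes, the partition name is preserved
        DatasetDescriptor destination = new DatasetDescriptor("hdfs", "/data/output/namespace/table");
        Descriptor published = recorded.copyWithNewDataset(destination);

        System.out.println(published);
    }
}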

Example 2 with PartitionDescriptor

Use of org.apache.gobblin.dataset.PartitionDescriptor in the apache/incubator-gobblin project.

From class BaseDataPublisherTest, method testPublishedPartitionsLineage.

/**
 * Test that partition-level lineages are set
 */
@Test
public void testPublishedPartitionsLineage() throws IOException {
    int numBranches = 2;
    int numPartitionsPerBranch = 2;
    WorkUnitState state = buildTaskState(numBranches);
    LineageInfo lineageInfo = LineageInfo.getLineageInfo(state.getTaskBroker()).get();
    DatasetDescriptor source = new DatasetDescriptor("kafka", "testTopic");
    lineageInfo.setSource(source, state);
    BaseDataPublisher publisher = new BaseDataPublisher(state);
    // Set up writer partition descriptors
    DatasetDescriptor datasetAtWriter = new DatasetDescriptor("dummy", "dummy");
    for (int i = 0; i < numBranches; i++) {
        List<PartitionDescriptor> partitions = new ArrayList<>();
        for (int j = 0; j < numPartitionsPerBranch; j++) {
            // Dummy dataset descriptor will be discarded by publisher
            partitions.add(new PartitionDescriptor("partition" + i + j, datasetAtWriter));
        }
        String partitionsKey = "writer." + i + ".partitions";
        state.setProp(partitionsKey, GSON.toJson(partitions, PARTITION_LIST_TYPE));
    }
    publisher.publish(ImmutableList.of(state));
    Assert.assertTrue(state.contains("gobblin.event.lineage.branch.0.destination"));
    Assert.assertTrue(state.contains("gobblin.event.lineage.branch.1.destination"));
    Collection<LineageEventBuilder> events = LineageInfo.load(ImmutableList.of(state));
    Assert.assertEquals(events.size(), 4);
    // Find the partition lineage and assert
    for (int i = 0; i < numBranches; i++) {
        String outputPath = String.format("/data/output/branch%d/namespace/table", i);
        DatasetDescriptor destinationDataset = new DatasetDescriptor("file", URI.create("file:///"), outputPath);
        destinationDataset.addMetadata("fsUri", "file:///");
        destinationDataset.addMetadata("branch", "" + i);
        for (int j = 0; j < numPartitionsPerBranch; j++) {
            LineageEventBuilder event = find(events, "partition" + i + j);
            Assert.assertNotNull(event);
            Assert.assertEquals(event.getSource(), source);
            // Dataset written by the writer is discarded
            Assert.assertEquals(event.getDestination(),
                    new PartitionDescriptor("partition" + i + j, destinationDataset));
        }
    }
}
Also used: DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) WorkUnitState(org.apache.gobblin.configuration.WorkUnitState) ArrayList(java.util.ArrayList) PartitionDescriptor(org.apache.gobblin.dataset.PartitionDescriptor) LineageEventBuilder(org.apache.gobblin.metrics.event.lineage.LineageEventBuilder) LineageInfo(org.apache.gobblin.metrics.event.lineage.LineageInfo) Test(org.testng.annotations.Test)
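
The GSON and PARTITION_LIST_TYPE fields referenced above are defined elsewhere in the test class and are not shown here. A minimal sketch of an equivalent setup, assuming a plain Gson instance and a TypeToken for the partition list; the property key and partition names are illustrative only.

import java.lang.reflect.Type;
import java.util.Arrays;
import java.util.List;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;

import org.apache.gobblin.configuration.State;
import org.apache.gobblin.dataset.DatasetDescriptor;
import org.apache.gobblin.dataset.PartitionDescriptor;

public class WriterPartitionStateSketch {
    // Assumed equivalents of the test's GSON and PARTITION_LIST_TYPE fields
    private static final Gson GSON = new Gson();
    private static final Type PARTITION_LIST_TYPE =
            new TypeToken<List<PartitionDescriptor>>() {}.getType();

    public static void main(String[] args) {
        // The dataset recorded by the writer is a placeholder; the publisher replaces it
        DatasetDescriptor writerDataset = new DatasetDescriptor("dummy", "dummy");
        List<PartitionDescriptor> partitions = Arrays.asList(
                new PartitionDescriptor("partition00", writerDataset),
                new PartitionDescriptor("partition01", writerDataset));

        // Store the serialized partition list on the state, as the test does per branch
        State state = new State();
        state.setProp("writer.0.partitions", GSON.toJson(partitions, PARTITION_LIST_TYPE));
        System.out.println(state.getProp("writer.0.partitions"));
    }
}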

Example 3 with PartitionDescriptor

Use of org.apache.gobblin.dataset.PartitionDescriptor in the apache/incubator-gobblin project.

From class HivePartitionFileSet, method generateCopyEntities.

@Override
protected Collection<CopyEntity> generateCopyEntities() throws IOException {
    try (Closer closer = Closer.create()) {
        MultiTimingEvent multiTimer = closer.register(new MultiTimingEvent(this.eventSubmitter, "PartitionCopy", true));
        int stepPriority = 0;
        String fileSet = HiveCopyEntityHelper.gson.toJson(this.partition.getValues());
        List<CopyEntity> copyEntities = Lists.newArrayList();
        stepPriority = hiveCopyEntityHelper.addSharedSteps(copyEntities, fileSet, stepPriority);
        multiTimer.nextStage(HiveCopyEntityHelper.Stages.COMPUTE_TARGETS);
        Path targetPath = hiveCopyEntityHelper.getTargetLocation(hiveCopyEntityHelper.getTargetFs(), this.partition.getDataLocation(), Optional.of(this.partition));
        Partition targetPartition = getTargetPartition(this.partition, targetPath);
        multiTimer.nextStage(HiveCopyEntityHelper.Stages.EXISTING_PARTITION);
        if (this.existingTargetPartition.isPresent()) {
            hiveCopyEntityHelper.getTargetPartitions().remove(this.partition.getValues());
            try {
                checkPartitionCompatibility(targetPartition, this.existingTargetPartition.get());
            } catch (IOException ioe) {
                if (hiveCopyEntityHelper.getExistingEntityPolicy() != HiveCopyEntityHelper.ExistingEntityPolicy.REPLACE_PARTITIONS && hiveCopyEntityHelper.getExistingEntityPolicy() != HiveCopyEntityHelper.ExistingEntityPolicy.REPLACE_TABLE_AND_PARTITIONS) {
                    log.error("Source and target partitions are not compatible. Aborting copy of partition " + this.partition, ioe);
                    // Silence error and continue processing workunits if we allow partial success
                    if (ConfigUtils.getString(hiveCopyEntityHelper.getConfiguration().getConfig(), ConfigurationKeys.JOB_COMMIT_POLICY_KEY, JobCommitPolicy.COMMIT_ON_FULL_SUCCESS.toString()).equals(JobCommitPolicy.COMMIT_SUCCESSFUL_TASKS.toString())) {
                        return Lists.newArrayList();
                    } else {
                        throw ioe;
                    }
                }
                log.warn("Source and target partitions are not compatible. Will override target partition: " + ioe.getMessage());
                log.debug("Incompatibility details: ", ioe);
                stepPriority = hiveCopyEntityHelper.addPartitionDeregisterSteps(copyEntities, fileSet, stepPriority, hiveCopyEntityHelper.getTargetTable(), this.existingTargetPartition.get());
                this.existingTargetPartition = Optional.absent();
            }
        }
        multiTimer.nextStage(HiveCopyEntityHelper.Stages.PARTITION_SKIP_PREDICATE);
        if (hiveCopyEntityHelper.getFastPartitionSkip().isPresent() && hiveCopyEntityHelper.getFastPartitionSkip().get().apply(this)) {
            log.info(String.format("Skipping copy of partition %s due to fast partition skip predicate.", this.partition.getCompleteName()));
            return Lists.newArrayList();
        }
        HiveSpec partitionHiveSpec = new SimpleHiveSpec.Builder<>(targetPath)
                .withTable(HiveMetaStoreUtils.getHiveTable(hiveCopyEntityHelper.getTargetTable().getTTable()))
                .withPartition(Optional.of(HiveMetaStoreUtils.getHivePartition(targetPartition.getTPartition())))
                .build();
        HiveRegisterStep register = new HiveRegisterStep(hiveCopyEntityHelper.getTargetMetastoreURI(), partitionHiveSpec, hiveCopyEntityHelper.getHiveRegProps());
        copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), register, stepPriority++));
        multiTimer.nextStage(HiveCopyEntityHelper.Stages.CREATE_LOCATIONS);
        HiveLocationDescriptor sourceLocation = HiveLocationDescriptor.forPartition(this.partition, hiveCopyEntityHelper.getDataset().fs, this.properties);
        HiveLocationDescriptor desiredTargetLocation = HiveLocationDescriptor.forPartition(targetPartition, hiveCopyEntityHelper.getTargetFs(), this.properties);
        Optional<HiveLocationDescriptor> existingTargetLocation = this.existingTargetPartition.isPresent()
                ? Optional.of(HiveLocationDescriptor.forPartition(this.existingTargetPartition.get(), hiveCopyEntityHelper.getTargetFs(), this.properties))
                : Optional.<HiveLocationDescriptor>absent();
        multiTimer.nextStage(HiveCopyEntityHelper.Stages.FULL_PATH_DIFF);
        HiveCopyEntityHelper.DiffPathSet diffPathSet = HiveCopyEntityHelper.fullPathDiff(sourceLocation, desiredTargetLocation, existingTargetLocation, Optional.<Partition>absent(), multiTimer, hiveCopyEntityHelper);
        multiTimer.nextStage(HiveCopyEntityHelper.Stages.CREATE_DELETE_UNITS);
        if (!diffPathSet.pathsToDelete.isEmpty()) {
            DeleteFileCommitStep deleteStep = DeleteFileCommitStep.fromPaths(hiveCopyEntityHelper.getTargetFs(), diffPathSet.pathsToDelete, hiveCopyEntityHelper.getDataset().properties);
            copyEntities.add(new PrePublishStep(fileSet, Maps.<String, String>newHashMap(), deleteStep, stepPriority++));
        }
        multiTimer.nextStage(HiveCopyEntityHelper.Stages.CREATE_COPY_UNITS);
        for (CopyableFile.Builder builder : hiveCopyEntityHelper.getCopyableFilesFromPaths(diffPathSet.filesToCopy, hiveCopyEntityHelper.getConfiguration(), Optional.of(this.partition))) {
            CopyableFile fileEntity = builder.fileSet(fileSet).checksum(new byte[0]).datasetOutputPath(desiredTargetLocation.location.toString()).build();
            DatasetDescriptor sourceDataset = this.hiveCopyEntityHelper.getSourceDataset();
            PartitionDescriptor source = new PartitionDescriptor(partition.getName(), sourceDataset);
            fileEntity.setSourceData(source);
            DatasetDescriptor destinationDataset = this.hiveCopyEntityHelper.getDestinationDataset();
            Partition destinationPartition = this.existingTargetPartition.isPresent() ? this.existingTargetPartition.get() : partition;
            PartitionDescriptor destination = new PartitionDescriptor(destinationPartition.getName(), destinationDataset);
            fileEntity.setDestinationData(destination);
            copyEntities.add(fileEntity);
        }
        log.info("Created {} copy entities for partition {}", copyEntities.size(), this.partition.getCompleteName());
        return copyEntities;
    }
}
Also used: Closer(com.google.common.io.Closer) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) PostPublishStep(org.apache.gobblin.data.management.copy.entities.PostPublishStep) MultiTimingEvent(org.apache.gobblin.metrics.event.MultiTimingEvent) IOException(java.io.IOException) DeleteFileCommitStep(org.apache.gobblin.util.commit.DeleteFileCommitStep) HiveRegisterStep(org.apache.gobblin.hive.HiveRegisterStep) SimpleHiveSpec(org.apache.gobblin.hive.spec.SimpleHiveSpec) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) PartitionDescriptor(org.apache.gobblin.dataset.PartitionDescriptor) PrePublishStep(org.apache.gobblin.data.management.copy.entities.PrePublishStep) HiveSpec(org.apache.gobblin.hive.spec.HiveSpec)
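
A minimal sketch of the lineage-related part of the copy loop above: both sides of the copy reuse the Hive partition name, combined with the source and destination table descriptors obtained from the helper. The table and partition names below are made up for illustration.

import org.apache.gobblin.dataset.DatasetDescriptor;
import org.apache.gobblin.dataset.PartitionDescriptor;

public class HivePartitionLineageSketch {
    public static void main(String[] args) {
        // Stand-ins for hiveCopyEntityHelper.getSourceDataset() and getDestinationDataset()
        DatasetDescriptor sourceTable = new DatasetDescriptor("hive", "source_db.page_view");
        DatasetDescriptor destinationTable = new DatasetDescriptor("hive", "target_db.page_view");

        // The Hive partition name is reused on both sides of the copy
        String partitionName = "datepartition=2018/09/05";
        PartitionDescriptor source = new PartitionDescriptor(partitionName, sourceTable);
        PartitionDescriptor destination = new PartitionDescriptor(partitionName, destinationTable);

        // In generateCopyEntities these are attached to each CopyableFile via
        // setSourceData(source) and setDestinationData(destination)
        System.out.println(source + " -> " + destination);
    }
}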

Example 4 with PartitionDescriptor

Use of org.apache.gobblin.dataset.PartitionDescriptor in the apache/incubator-gobblin project.

From class CopyableFileTest, method testSerializeDeserialze.

@Test
public void testSerializeDeserialze() throws Exception {
    CopyableFile copyableFile = new CopyableFile(
            new FileStatus(10, false, 12, 100, 12345, new Path("/path")),
            new Path("/destination"),
            new OwnerAndPermission("owner", "group", FsPermission.getDefault()),
            Lists.newArrayList(new OwnerAndPermission("owner2", "group2", FsPermission.getDefault())),
            "checksum".getBytes(),
            PreserveAttributes.fromMnemonicString(""),
            "", 0, 0, Maps.<String, String>newHashMap(), "", null);
    DatasetDescriptor dataset = new DatasetDescriptor("hive", "db.table");
    PartitionDescriptor descriptor = new PartitionDescriptor("datepartition=2018/09/05", dataset);
    copyableFile.setDestinationData(descriptor);
    String s = CopyEntity.serialize(copyableFile);
    CopyEntity de = CopyEntity.deserialize(s);
    Assert.assertEquals(de, copyableFile);
}
Also used: Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) PartitionDescriptor(org.apache.gobblin.dataset.PartitionDescriptor) Test(org.testng.annotations.Test)
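
The round-trip assertion above depends on value-based equality of the copy entity and its descriptors. A small sketch of that property for PartitionDescriptor, using only the constructors shown in this example; the values are illustrative.

import org.apache.gobblin.dataset.DatasetDescriptor;
import org.apache.gobblin.dataset.PartitionDescriptor;

public class DescriptorEqualitySketch {
    public static void main(String[] args) {
        // Two independently built descriptors with the same content
        PartitionDescriptor a = new PartitionDescriptor("datepartition=2018/09/05",
                new DatasetDescriptor("hive", "db.table"));
        PartitionDescriptor b = new PartitionDescriptor("datepartition=2018/09/05",
                new DatasetDescriptor("hive", "db.table"));

        // Equality is by value, which is what makes assertEquals on the
        // deserialized CopyEntity meaningful
        System.out.println(a.equals(b)); // expected: true
    }
}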

Example 5 with PartitionDescriptor

Use of org.apache.gobblin.dataset.PartitionDescriptor in the apache/incubator-gobblin project.

From class LineageEventTest, method testMultiPuts.

@Test
public void testMultiPuts() {
    final String topic = "testTopic";
    final String kafka = "kafka";
    final String hdfs = "hdfs";
    final String path = "/data/tracking/PageViewEvent";
    final String partitionName = "hourly/2018/08/15/15";
    State state = new State();
    LineageInfo lineageInfo = getLineageInfo();
    DatasetDescriptor source = new DatasetDescriptor(kafka, topic);
    lineageInfo.setSource(source, state);
    DatasetDescriptor destinationDataset = new DatasetDescriptor(hdfs, path);
    PartitionDescriptor destination = new PartitionDescriptor(partitionName, destinationDataset);
    lineageInfo.putDestination(Lists.newArrayList(destination), 0, state);
    // Put another destination
    DatasetDescriptor destinationDataset2 = new DatasetDescriptor(kafka, "nextTopic");
    lineageInfo.putDestination(Lists.newArrayList(destinationDataset2), 0, state);
    Map<String, Set<LineageEventBuilder>> eventsMap = LineageInfo.load(state);
    Assert.assertEquals(eventsMap.size(), 1);
    Set<LineageEventBuilder> events = eventsMap.get("0");
    Assert.assertEquals(events.size(), 2);
    verifyOne(events, topic, source, destination);
    verifyOne(events, topic, source, destinationDataset2);
}
Also used: Set(java.util.Set) DatasetDescriptor(org.apache.gobblin.dataset.DatasetDescriptor) State(org.apache.gobblin.configuration.State) PartitionDescriptor(org.apache.gobblin.dataset.PartitionDescriptor) Test(org.testng.annotations.Test)
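
A small sketch of how the two putDestination calls above surface on the state, assuming a LineageInfo instance obtained from the job broker (as the tests do) and the per-branch property key asserted in Example 2; the descriptor values are illustrative.

import java.util.Map;
import java.util.Set;

import com.google.common.collect.Lists;

import org.apache.gobblin.configuration.State;
import org.apache.gobblin.dataset.DatasetDescriptor;
import org.apache.gobblin.dataset.PartitionDescriptor;
import org.apache.gobblin.metrics.event.lineage.LineageEventBuilder;
import org.apache.gobblin.metrics.event.lineage.LineageInfo;

public class MixedDestinationSketch {

    // lineageInfo is assumed to be provided by the caller, e.g. obtained from the job broker
    static void reportMixedDestinations(LineageInfo lineageInfo) {
        State state = new State();
        lineageInfo.setSource(new DatasetDescriptor("kafka", "testTopic"), state);

        // Branch 0 receives one partition-level and one dataset-level destination
        DatasetDescriptor hdfsDataset = new DatasetDescriptor("hdfs", "/data/tracking/PageViewEvent");
        lineageInfo.putDestination(
                Lists.newArrayList(new PartitionDescriptor("hourly/2018/08/15/15", hdfsDataset)), 0, state);
        lineageInfo.putDestination(
                Lists.newArrayList(new DatasetDescriptor("kafka", "nextTopic")), 0, state);

        // The destinations are serialized under a per-branch state property ...
        boolean stored = state.contains("gobblin.event.lineage.branch.0.destination");

        // ... and load groups the resulting events by branch id: "0" here, with two events
        Map<String, Set<LineageEventBuilder>> eventsMap = LineageInfo.load(state);
        System.out.println(stored + ", events: " + eventsMap.get("0").size());
    }
}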

Aggregations

PartitionDescriptor (org.apache.gobblin.dataset.PartitionDescriptor): 10
DatasetDescriptor (org.apache.gobblin.dataset.DatasetDescriptor): 8
Test (org.testng.annotations.Test): 5
ArrayList (java.util.ArrayList): 3
State (org.apache.gobblin.configuration.State): 3
Path (org.apache.hadoop.fs.Path): 3
Set (java.util.Set): 2
Descriptor (org.apache.gobblin.dataset.Descriptor): 2
Closer (com.google.common.io.Closer): 1
IOException (java.io.IOException): 1
URI (java.net.URI): 1
WorkUnitState (org.apache.gobblin.configuration.WorkUnitState): 1
CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity): 1
CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile): 1
PostPublishStep (org.apache.gobblin.data.management.copy.entities.PostPublishStep): 1
PrePublishStep (org.apache.gobblin.data.management.copy.entities.PrePublishStep): 1
HiveRegisterStep (org.apache.gobblin.hive.HiveRegisterStep): 1
HiveSpec (org.apache.gobblin.hive.spec.HiveSpec): 1
SimpleHiveSpec (org.apache.gobblin.hive.spec.SimpleHiveSpec): 1
GobblinTrackingEvent (org.apache.gobblin.metrics.GobblinTrackingEvent): 1