Search in sources:

Example 1 with NamespacedEntityId

use of io.cdap.cdap.proto.id.NamespacedEntityId in project cdap by caskdata.

the class MetadataSubscriberServiceTest method testSubscriber.

@Test
public void testSubscriber() throws InterruptedException, ExecutionException, TimeoutException {
    LineageStoreReader lineageReader = getInjector().getInstance(LineageStoreReader.class);
    ProgramRunId run1 = service1.run(RunIds.generate());
    // Lineage must be empty at this point because the MetadataSubscriberService has not been started yet.
    Assert.assertTrue(lineageReader.getEntitiesForRun(run1).isEmpty());

    // Publish dataset-level lineage for the run
    LineageWriter datasetLineageWriter = getInjector().getInstance(MessagingLineageWriter.class);
    datasetLineageWriter.addAccess(run1, dataset1, AccessType.READ);
    datasetLineageWriter.addAccess(run1, dataset2, AccessType.WRITE);

    // Publish field-level lineage: the same operations for two runs, then a different set for a third run
    FieldLineageWriter fieldWriter = getInjector().getInstance(MessagingLineageWriter.class);
    ProgramRunId spark1Run1 = spark1.run(RunIds.generate(100));
    ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns", "endpoint1"), "offset", "body");
    TransformOperation parse = new TransformOperation("parse", "parse body",
        Collections.singletonList(InputField.of("read", "body")), "name", "address");
    WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint2"),
        Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "address")));
    List<Operation> firstOperations = new ArrayList<>(Arrays.asList(read, write, parse));
    FieldLineageInfo firstInfo = new FieldLineageInfo(firstOperations);
    fieldWriter.write(spark1Run1, firstInfo);
    ProgramRunId spark1Run2 = spark1.run(RunIds.generate(200));
    fieldWriter.write(spark1Run2, firstInfo);

    TransformOperation normalize = new TransformOperation("normalize", "normalize address",
        Collections.singletonList(InputField.of("parse", "address")), "address");
    WriteOperation anotherWrite = new WriteOperation("anotherwrite", "write data", EndPoint.of("ns", "endpoint2"),
        Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("normalize", "address")));
    List<Operation> secondOperations = new ArrayList<>(Arrays.asList(read, parse, normalize, anotherWrite));
    FieldLineageInfo secondInfo = new FieldLineageInfo(secondOperations);
    ProgramRunId spark1Run3 = spark1.run(RunIds.generate(300));
    fieldWriter.write(spark1Run3, secondInfo);

    // Publish usage records
    UsageWriter usageWriter = getInjector().getInstance(MessagingUsageWriter.class);
    usageWriter.register(spark1, dataset1);
    usageWriter.registerAll(Collections.singleton(spark1), dataset3);

    // Dataset-level lineage: wait until the run resolves to its program plus both datasets
    Set<NamespacedEntityId> expectedLineage = new HashSet<>(Arrays.asList(run1.getParent(), dataset1, dataset2));
    Tasks.waitFor(true, () -> expectedLineage.equals(lineageReader.getEntitiesForRun(run1)),
        10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
    // Only usage was emitted for "spark1", so it must have no lineage relations.
    Assert.assertTrue(lineageReader.getRelations(spark1, 0L, Long.MAX_VALUE, relation -> true).isEmpty());

    // Field-level lineage: expected entries are listed in descending order of program execution
    FieldLineageReader fieldReader = getInjector().getInstance(FieldLineageReader.class);
    Set<Operation> latestRunOperations = new HashSet<>(Arrays.asList(read, anotherWrite));
    Set<Operation> earlierRunOperations = new HashSet<>(Arrays.asList(read, write));
    List<ProgramRunOperations> expectedIncoming = new ArrayList<>();
    expectedIncoming.add(new ProgramRunOperations(Collections.singleton(spark1Run3), latestRunOperations));
    expectedIncoming.add(new ProgramRunOperations(new HashSet<>(Arrays.asList(spark1Run1, spark1Run2)), earlierRunOperations));
    EndPointField offsetField = new EndPointField(EndPoint.of("ns", "endpoint2"), "offset");
    Tasks.waitFor(expectedIncoming, () -> fieldReader.getIncomingOperations(offsetField, 1L, Long.MAX_VALUE - 1),
        10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);

    // Usage: wait until both datasets are registered against "spark1"
    Set<EntityId> expectedUsage = new HashSet<>(Arrays.asList(dataset1, dataset3));
    UsageRegistry usageRegistry = getInjector().getInstance(UsageRegistry.class);
    Tasks.waitFor(true, () -> expectedUsage.equals(usageRegistry.getDatasets(spark1)),
        10, TimeUnit.SECONDS, 100, TimeUnit.MILLISECONDS);
}
Also used : ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) ProgramRunOperations(io.cdap.cdap.proto.metadata.lineage.ProgramRunOperations) UsageWriter(io.cdap.cdap.data2.registry.UsageWriter) MessagingUsageWriter(io.cdap.cdap.data2.registry.MessagingUsageWriter) FieldLineageReader(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageReader) EndPointField(io.cdap.cdap.data2.metadata.lineage.field.EndPointField) UsageRegistry(io.cdap.cdap.data2.registry.UsageRegistry) ArrayList(java.util.ArrayList) ReadOperation(io.cdap.cdap.api.lineage.field.ReadOperation) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) MetadataOperation(io.cdap.cdap.data2.metadata.writer.MetadataOperation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) Operation(io.cdap.cdap.api.lineage.field.Operation) TransformOperation(io.cdap.cdap.api.lineage.field.TransformOperation) NamespacedEntityId(io.cdap.cdap.proto.id.NamespacedEntityId) EntityId(io.cdap.cdap.proto.id.EntityId) NamespacedEntityId(io.cdap.cdap.proto.id.NamespacedEntityId) WriteOperation(io.cdap.cdap.api.lineage.field.WriteOperation) MessagingLineageWriter(io.cdap.cdap.data2.metadata.writer.MessagingLineageWriter) FieldLineageWriter(io.cdap.cdap.data2.metadata.writer.FieldLineageWriter) LineageWriter(io.cdap.cdap.data2.metadata.writer.LineageWriter) LineageStoreReader(io.cdap.cdap.data2.metadata.lineage.LineageStoreReader) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) FieldLineageInfo(io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo) FieldLineageWriter(io.cdap.cdap.data2.metadata.writer.FieldLineageWriter) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 2 with NamespacedEntityId

use of io.cdap.cdap.proto.id.NamespacedEntityId in project cdap by caskdata.

the class MetadataHttpHandlerTestRun method testSearchMetadata.

@Test
public void testSearchMetadata() throws Exception {
    appClient.deploy(NamespaceId.DEFAULT, createAppJarFile(AllProgramsApp.class));
    ApplicationId appId = NamespaceId.DEFAULT.app(AllProgramsApp.NAME);
    DatasetId datasetId = NamespaceId.DEFAULT.dataset(AllProgramsApp.DATASET_NAME);
    // Block until system metadata is present for both the app and the dataset
    Tasks.waitFor(false, () -> getProperties(appId, MetadataScope.SYSTEM).isEmpty(), 10, TimeUnit.SECONDS);
    Tasks.waitFor(false, () -> getProperties(datasetId, MetadataScope.SYSTEM).isEmpty(), 10, TimeUnit.SECONDS);

    // User metadata for the app
    Map<String, String> appProps = ImmutableMap.of("key1", "value1");
    Set<String> appTags = ImmutableSet.of("tag1", "tag2");
    addProperties(appId, appProps);
    addTags(appId, appTags);

    // User metadata for the dataset
    Map<String, String> datasetProps = ImmutableMap.of("key10", "value10", "key11", "value11");
    Set<String> datasetTags = ImmutableSet.of("tag11");
    addProperties(datasetId, datasetProps);
    addTags(datasetId, datasetTags);

    Map<NamespacedEntityId, Metadata> expectedUserMetadata = new HashMap<>();
    expectedUserMetadata.put(appId, new Metadata(appProps, appTags));
    expectedUserMetadata.put(datasetId, new Metadata(datasetProps, datasetTags));

    Set<MetadataSearchResultRecord> results = searchMetadata(NamespaceId.DEFAULT, "value*").getResults();
    Assert.assertEquals(expectedUserMetadata.keySet(), extractEntityIds(results));
    for (MetadataSearchResultRecord record : results) {
        // The user scope was set by this test, so it must match exactly
        Assert.assertEquals(expectedUserMetadata.get(record.getEntityId()), record.getMetadata().get(MetadataScope.USER));
        // The system scope was not set by this test; only check that it is present and non-empty
        Metadata systemMetadata = record.getMetadata().get(MetadataScope.SYSTEM);
        Assert.assertNotNull(systemMetadata);
        Assert.assertFalse(systemMetadata.getProperties().isEmpty());
        Assert.assertFalse(systemMetadata.getTags().isEmpty());
    }

    // User metadata for a custom entity: a field underneath the dataset
    Map<String, String> fieldProps = ImmutableMap.of("fKey1", "fValue1", "fKey2", "fValue2");
    Set<String> fieldTags = ImmutableSet.of("fTag1");
    MetadataEntity fieldEntity = MetadataEntity.builder(datasetId.toMetadataEntity())
        .appendAsType("field", "someField")
        .build();
    addProperties(fieldEntity, fieldProps);
    addTags(fieldEntity, fieldTags);
    Map<MetadataEntity, Metadata> expectedUserMetadataV2 = new HashMap<>();
    expectedUserMetadataV2.put(fieldEntity, new Metadata(fieldProps, fieldTags));

    Set<MetadataSearchResultRecord> resultsV2 = super.searchMetadata(ImmutableList.of(NamespaceId.DEFAULT), "fValue*", ImmutableSet.of(), null, 0, Integer.MAX_VALUE, 0, null, false).getResults();
    Assert.assertEquals(expectedUserMetadataV2.keySet(), ImmutableSet.copyOf(extractMetadataEntities(resultsV2)));
    for (MetadataSearchResultRecord record : resultsV2) {
        // The user scope was set by this test, so it must match exactly
        Assert.assertEquals(expectedUserMetadataV2.get(record.getMetadataEntity()), record.getMetadata().get(MetadataScope.USER));
        // A custom entity should not have any system metadata
        Assert.assertNull(record.getMetadata().get(MetadataScope.SYSTEM));
    }
}
Also used : MetadataEntity(io.cdap.cdap.api.metadata.MetadataEntity) HashMap(java.util.HashMap) Metadata(io.cdap.cdap.api.metadata.Metadata) AllProgramsApp(io.cdap.cdap.client.app.AllProgramsApp) DatasetId(io.cdap.cdap.proto.id.DatasetId) NamespacedEntityId(io.cdap.cdap.proto.id.NamespacedEntityId) MetadataSearchResultRecord(io.cdap.cdap.proto.metadata.MetadataSearchResultRecord) ApplicationId(io.cdap.cdap.proto.id.ApplicationId) Test(org.junit.Test)

Example 3 with NamespacedEntityId

use of io.cdap.cdap.proto.id.NamespacedEntityId in project cdap by caskdata.

the class LineageWriterDatasetFramework method doWriteLineage.

/**
 * Records a lineage access of the given dataset for the current program context, if one is set.
 * Any failure raised by the lineage writer is logged and not propagated to the caller.
 *
 * @param datasetInstanceId the dataset that was accessed
 * @param accessType the kind of access performed on the dataset
 */
private void doWriteLineage(DatasetId datasetInstanceId, AccessType accessType) {
    // Read the field once; the null check and the calls below all use this same reference.
    ProgramContext context = this.programContext;
    if (context == null) {
        return;
    }

    ProgramRunId runId = context.getProgramRunId();
    NamespacedEntityId component = context.getComponentId();
    try {
        lineageWriter.addAccess(runId, datasetInstanceId, accessType, component);
    } catch (Throwable cause) {
        // A lineage write failure must not fail the dataset operation itself
        LOG.warn("Failed to write lineage information for dataset {} with access type {} from {},{}", datasetInstanceId, accessType, runId, component);
        // The stack trace goes to debug level only, to avoid polluting the log
        LOG.debug("Cause for lineage writing failure for {} {} {} {}", datasetInstanceId, accessType, runId, component, cause);
    }
}
Also used : NamespacedEntityId(io.cdap.cdap.proto.id.NamespacedEntityId) ProgramRunId(io.cdap.cdap.proto.id.ProgramRunId) ProgramContext(io.cdap.cdap.data.ProgramContext)

Example 4 with NamespacedEntityId

use of io.cdap.cdap.proto.id.NamespacedEntityId in project cdap by caskdata.

the class LineageTable method getEntitiesForRun.

/**
 * Collects the entities recorded for a program run.
 *
 * <p>Scans the program table with the scan key derived from the run and, for every row whose
 * run field equals the id of the given run, adds the program and the dataset read from that row.
 *
 * @param run the program run to look up
 * @return a set of entities (program and data it accesses) associated with the program run
 * @throws IOException declared by this method; propagated from the table access
 */
public Set<NamespacedEntityId> getEntitiesForRun(ProgramRunId run) throws IOException {
    List<Field<?>> scanKey = getRunScanStartKey(run);
    ImmutableSet.Builder<NamespacedEntityId> entities = ImmutableSet.builder();
    try (CloseableIterator<StructuredRow> rows = getProgramTable().scan(Range.singleton(scanKey), Integer.MAX_VALUE)) {
        while (rows.hasNext()) {
            StructuredRow current = rows.next();
            String rowRun = current.getString(StoreDefinition.LineageStore.RUN_FIELD);
            if (!run.getRun().equals(rowRun)) {
                // Row belongs to a different run; skip it
                continue;
            }
            entities.add(getProgramFromRow(current)).add(getDatasetFromRow(current));
        }
    }
    return entities.build();
}
Also used : Field(io.cdap.cdap.spi.data.table.field.Field) NamespacedEntityId(io.cdap.cdap.proto.id.NamespacedEntityId) ImmutableSet(com.google.common.collect.ImmutableSet) StructuredRow(io.cdap.cdap.spi.data.StructuredRow)

Example 5 with NamespacedEntityId

use of io.cdap.cdap.proto.id.NamespacedEntityId in project cdap by caskdata.

the class LineageCollapser method collapseRelations.

/**
 * Collapses {@link Relation}s that share the same collapse key into single {@link CollapsedRelation}s.
 *
 * <p>Relations are first grouped by the key computed from each relation and the given
 * {@link CollapseType}s. For each group, the access types, runs and components of all its
 * relations are gathered into sets and turned into one collapsed relation.
 *
 * @param relations lineage relations
 * @param collapseTypes fields to collapse relations on
 * @return collapsed relations
 */
public static Set<CollapsedRelation> collapseRelations(Iterable<Relation> relations, Set<CollapseType> collapseTypes) {
    // Bucket every relation under its collapse key
    Multimap<CollapseKey, Relation> grouped = HashMultimap.create();
    for (Relation relation : relations) {
        grouped.put(getCollapseKey(relation, collapseTypes), relation);
    }
    LOG.trace("Collapsed relations: {}", grouped.asMap());

    // Merge each bucket into a single collapsed relation
    Set<CollapsedRelation> result = new HashSet<>();
    for (Map.Entry<CollapseKey, Collection<Relation>> group : grouped.asMap().entrySet()) {
        CollapseKey key = group.getKey();
        NamespacedEntityId data = key.data;
        ProgramId program = key.program;

        Set<AccessType> accessTypes = new HashSet<>();
        Set<RunId> runs = new HashSet<>();
        Set<NamespacedEntityId> components = new HashSet<>();
        for (Relation member : group.getValue()) {
            accessTypes.add(member.getAccess());
            runs.add(member.getRun());
            components.addAll(member.getComponents());
        }
        result.add(toCollapsedRelation(data, program, accessTypes, runs, components));
    }
    return result;
}
Also used : ProgramId(io.cdap.cdap.proto.id.ProgramId) NamespacedEntityId(io.cdap.cdap.proto.id.NamespacedEntityId) Collection(java.util.Collection) RunId(org.apache.twill.api.RunId) Map(java.util.Map) HashSet(java.util.HashSet)

Aggregations

NamespacedEntityId (io.cdap.cdap.proto.id.NamespacedEntityId): 10; Test (org.junit.Test): 5; HashSet (java.util.HashSet): 4; ApplicationId (io.cdap.cdap.proto.id.ApplicationId): 3; EntityId (io.cdap.cdap.proto.id.EntityId): 3; ProgramRunId (io.cdap.cdap.proto.id.ProgramRunId): 3; ArrayList (java.util.ArrayList): 3; ApplicationSpecification (io.cdap.cdap.api.app.ApplicationSpecification): 2; Operation (io.cdap.cdap.api.lineage.field.Operation): 2; ReadOperation (io.cdap.cdap.api.lineage.field.ReadOperation): 2; TransformOperation (io.cdap.cdap.api.lineage.field.TransformOperation): 2; WriteOperation (io.cdap.cdap.api.lineage.field.WriteOperation): 2; MetadataEntity (io.cdap.cdap.api.metadata.MetadataEntity): 2; LineageStoreReader (io.cdap.cdap.data2.metadata.lineage.LineageStoreReader): 2; EndPointField (io.cdap.cdap.data2.metadata.lineage.field.EndPointField): 2; FieldLineageInfo (io.cdap.cdap.data2.metadata.lineage.field.FieldLineageInfo): 2; FieldLineageReader (io.cdap.cdap.data2.metadata.lineage.field.FieldLineageReader): 2; FieldLineageWriter (io.cdap.cdap.data2.metadata.writer.FieldLineageWriter): 2; LineageWriter (io.cdap.cdap.data2.metadata.writer.LineageWriter): 2; MessagingLineageWriter (io.cdap.cdap.data2.metadata.writer.MessagingLineageWriter): 2