Search in sources :

Example 16 with DataSetRefs

use of org.apache.nifi.atlas.provenance.DataSetRefs in project nifi by apache.

the class CompleteFlowPathLineage method createCompleteFlowPath.

/**
 * Builds a NiFiFlowPath for the given LineagePath and buffers it, together with its DataSetRefs,
 * in {@code createdFlowPaths}. The created path is constructed from the first component id and a CRC32 hash.
 *
 * <p>Parent LineagePaths are processed recursively before the hash of this path is computed, because
 * each parent's hash is one of the ingredients of the child's hash. As a consequence, two lineage paths
 * with identical component ids, inputs and outputs still produce different FlowPaths
 * when their parents differ.</p>
 *
 * <p>Note that the event list returned by {@code lineagePath.getEvents()} is reversed in place.</p>
 *
 * @param nifiFlow A reference to current NiFiFlow
 * @param lineagePath LineagePath from which a NiFiFlowPath and DataSet refs are derived and added to {@code createdFlowPaths}.
 * @param createdFlowPaths A buffer of created NiFiFlowPaths; buffering lets the caller defer
 *                         sending notifications until all parent FlowPaths have been analyzed.
 */
private void createCompleteFlowPath(NiFiFlow nifiFlow, LineagePath lineagePath, List<Tuple<NiFiFlowPath, DataSetRefs>> createdFlowPaths) {
    final List<ProvenanceEventRecord> events = lineagePath.getEvents();
    Collections.reverse(events);

    final List<String> componentIds = new ArrayList<>();
    for (ProvenanceEventRecord event : events) {
        componentIds.add(event.getComponentId());
    }
    final String firstComponentId = events.get(0).getComponentId();
    final DataSetRefs dataSetRefs = lineagePath.getRefs();

    // Parents go first: their hashes must be available before this path's hash is calculated.
    final boolean hasParents = !lineagePath.getParents().isEmpty();
    final Referenceable parentQueue = hasParents ? new Referenceable(TYPE_NIFI_QUEUE) : null;
    if (hasParents) {
        // The queue sits between this lineage path and its parents, and is named after the type
        // of the first event, which knows why this lineage has parents (e.g. FORK or JOIN).
        parentQueue.set(ATTR_NAME, events.get(0).getEventType().name());
        dataSetRefs.addInput(parentQueue);
        for (LineagePath parentPath : lineagePath.getParents()) {
            parentPath.getRefs().addOutput(parentQueue);
            createCompleteFlowPath(nifiFlow, parentPath, createdFlowPaths);
        }
    }

    // Create a variant path: the hash covers component ids, input/output resource ids and parent hashes.
    final Stream<String> ioIds = Stream.concat(dataSetRefs.getInputs().stream(), dataSetRefs.getOutputs().stream())
            .map(ref -> toTypedQualifiedName(ref.getTypeName(), toStr(ref.get(ATTR_QUALIFIED_NAME))));
    final Stream<String> parentHashes = lineagePath.getParents().stream()
            .map(parentPath -> String.valueOf(parentPath.getLineagePathHash()));
    final String hashSource = Stream.of(componentIds.stream(), ioIds, parentHashes)
            .flatMap(ids -> ids)
            .sorted()
            .distinct()
            .collect(Collectors.joining(","));
    final CRC32 checksum = new CRC32();
    checksum.update(hashSource.getBytes(StandardCharsets.UTF_8));
    final long hash = checksum.getValue();
    lineagePath.setLineagePathHash(hash);

    final NiFiFlowPath flowPath = new NiFiFlowPath(firstComponentId, hash);

    // The queue name includes the hash so that, e.g., "FF1 and FF2 read from dirA then merged" and
    // "FF3 and FF4 read from dirB then merged" end up with different queues even when passed to the same component.
    if (parentQueue != null) {
        parentQueue.set(ATTR_QUALIFIED_NAME, toQualifiedName(nifiFlow.getClusterName(), firstComponentId + "::" + hash));
    }

    // Collapse consecutive events emitted by the same component, to come up with a simpler name.
    final List<ProvenanceEventRecord> eventsForName = new ArrayList<>();
    String lastComponentId = null;
    for (ProvenanceEventRecord event : events) {
        final boolean sameAsPrevious = event.getComponentId().equals(lastComponentId);
        if (!sameAsPrevious) {
            eventsForName.add(event);
        }
        lastComponentId = event.getComponentId();
    }
    final List<String> componentNames = new ArrayList<>();
    for (ProvenanceEventRecord event : eventsForName) {
        componentNames.add(nifiFlow.getProcessComponentName(event.getComponentId(), event::getComponentType));
    }
    flowPath.setName(String.join(", ", componentNames));

    // Use the group of the static flow path for the first component, or the root process group if there is none.
    final NiFiFlowPath staticFlowPath = nifiFlow.findPath(firstComponentId);
    if (staticFlowPath != null) {
        flowPath.setGroupId(staticFlowPath.getGroupId());
    } else {
        flowPath.setGroupId(nifiFlow.getRootProcessGroupId());
    }

    // Only buffer the result here; notifications are deferred until the entire lineagePath analysis is finished.
    createdFlowPaths.add(new Tuple<>(flowPath, dataSetRefs));
}
Also used : ComputeLineageResult(org.apache.nifi.provenance.lineage.ComputeLineageResult) HashMap(java.util.HashMap) ATTR_QUALIFIED_NAME(org.apache.nifi.atlas.NiFiTypes.ATTR_QUALIFIED_NAME) ArrayList(java.util.ArrayList) NiFiFlowPath(org.apache.nifi.atlas.NiFiFlowPath) ProvenanceEventRecord(org.apache.nifi.provenance.ProvenanceEventRecord) Map(java.util.Map) LineageNode(org.apache.nifi.provenance.lineage.LineageNode) DataSetRefs(org.apache.nifi.atlas.provenance.DataSetRefs) NiFiFlow(org.apache.nifi.atlas.NiFiFlow) ATTR_NAME(org.apache.nifi.atlas.NiFiTypes.ATTR_NAME) AtlasUtils.toStr(org.apache.nifi.atlas.AtlasUtils.toStr) AtlasUtils.toTypedQualifiedName(org.apache.nifi.atlas.AtlasUtils.toTypedQualifiedName) ProvenanceEventType(org.apache.nifi.provenance.ProvenanceEventType) AtlasUtils.toQualifiedName(org.apache.nifi.atlas.AtlasUtils.toQualifiedName) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) AnalysisContext(org.apache.nifi.atlas.provenance.AnalysisContext) Objects(java.util.Objects) List(java.util.List) Stream(java.util.stream.Stream) Tuple(org.apache.nifi.util.Tuple) DROP(org.apache.nifi.provenance.ProvenanceEventType.DROP) TYPE_NIFI_QUEUE(org.apache.nifi.atlas.NiFiTypes.TYPE_NIFI_QUEUE) CRC32(java.util.zip.CRC32) Referenceable(org.apache.atlas.typesystem.Referenceable) Collections(java.util.Collections) LineageNodeType(org.apache.nifi.provenance.lineage.LineageNodeType) CRC32(java.util.zip.CRC32) ArrayList(java.util.ArrayList) Referenceable(org.apache.atlas.typesystem.Referenceable) NiFiFlowPath(org.apache.nifi.atlas.NiFiFlowPath) ProvenanceEventRecord(org.apache.nifi.provenance.ProvenanceEventRecord) DataSetRefs(org.apache.nifi.atlas.provenance.DataSetRefs) Stream(java.util.stream.Stream)

Example 17 with DataSetRefs

use of org.apache.nifi.atlas.provenance.DataSetRefs in project nifi by apache.

the class SimpleFlowPathLineage method processRemotePortEvent.

/**
 * Create a flow_path entity corresponding to the target RemoteGroupPort when a SEND/RECEIVE event is received.
 * Such an entity can not be created in advance while analyzing the flow statically,
 * because a ReportingTask can not determine whether a component id is a RemoteGroupPort:
 * connectionStatus is the only available information in ReportingContext, and
 * ConnectionStatus only knows the component id, not the component type.
 * For example, there is no way to tell whether a connected component is a funnel or a RemoteGroupPort.
 *
 * <p>For a "Remote Input Port" event the lineage
 * {@code static flow_path -> queue -> port process -> port dataSet} is registered.
 * For any other component type (treated as a Remote Output Port) the lineage
 * {@code port dataSet -> port process -> queue -> static flow_path} is registered
 * for every outgoing connection whose destination has a known flow path.</p>
 *
 * @param analysisContext context passed to {@code findPreviousProvenanceEvent} to look up the preceding event
 * @param nifiFlow the flow used to resolve connections, flow paths and qualified names
 * @param event the provenance event emitted by the remote port
 * @param analyzedRefs the DataSet refs analyzed from {@code event}; the first output (input port)
 *                     or the first input (output port) is used as the remote port DataSet
 */
private void processRemotePortEvent(AnalysisContext analysisContext, NiFiFlow nifiFlow, ProvenanceEventRecord event, DataSetRefs analyzedRefs) {
    final boolean isRemoteInputPort = "Remote Input Port".equals(event.getComponentType());
    // Create a RemoteInputPort Process.
    // event.getComponentId returns UUID for RemoteGroupPort as a client of S2S, and it's different from a remote port UUID (portDataSetid).
    // See NIFI-4571 for detail.
    // NOTE(review): iterator().next() assumes the relevant side of analyzedRefs is non-empty — confirm the caller guarantees this.
    final Referenceable remotePortDataSet = isRemoteInputPort ? analyzedRefs.getOutputs().iterator().next() : analyzedRefs.getInputs().iterator().next();
    final String portProcessId = event.getComponentId();
    final NiFiFlowPath remotePortProcess = new NiFiFlowPath(portProcessId);
    remotePortProcess.setName(event.getComponentType());
    remotePortProcess.addProcessor(portProcessId);
    if (isRemoteInputPort) {
        // For a RemoteInputPort, the event that preceded this one is needed to find the static flow path
        // feeding the port. That is only possible by calling lineage API.
        final ProvenanceEventRecord previousEvent = findPreviousProvenanceEvent(analysisContext, event);
        if (previousEvent == null) {
            logger.warn("Previous event was not found: {}", new Object[] { event });
            return;
        }
        // Set groupId from incoming connection if available.
        final List<ConnectionStatus> incomingConnections = nifiFlow.getIncomingConnections(portProcessId);
        if (incomingConnections == null || incomingConnections.isEmpty()) {
            logger.warn("Incoming relationship was not found: {}", new Object[] { event });
            return;
        }
        final ConnectionStatus connection = incomingConnections.get(0);
        remotePortProcess.setGroupId(connection.getGroupId());
        final Referenceable remotePortProcessRef = toReferenceable(remotePortProcess, nifiFlow);
        createEntity(remotePortProcessRef);
        // Create a queue.
        Referenceable queueFromStaticFlowPathToRemotePortProcess = new Referenceable(TYPE_NIFI_QUEUE);
        queueFromStaticFlowPathToRemotePortProcess.set(ATTR_NAME, "queue");
        queueFromStaticFlowPathToRemotePortProcess.set(ATTR_QUALIFIED_NAME, nifiFlow.toQualifiedName(portProcessId));
        // Create lineage: Static flow_path -> queue
        DataSetRefs staticFlowPathRefs = new DataSetRefs(previousEvent.getComponentId());
        staticFlowPathRefs.addOutput(queueFromStaticFlowPathToRemotePortProcess);
        addDataSetRefs(nifiFlow, staticFlowPathRefs);
        // Create lineage: Queue -> RemoteInputPort process -> RemoteInputPort dataSet
        DataSetRefs remotePortRefs = new DataSetRefs(portProcessId);
        remotePortRefs.addInput(queueFromStaticFlowPathToRemotePortProcess);
        remotePortRefs.addOutput(remotePortDataSet);
        addDataSetRefs(remotePortRefs, remotePortProcessRef);
    } else {
        // For RemoteOutputPort, it's possible that multiple processors are connected.
        // In that case, the received FlowFile is cloned and passed to each connection.
        // So we need to create multiple DataSetRefs.
        final List<ConnectionStatus> connections = nifiFlow.getOutgoingConnections(portProcessId);
        if (connections == null || connections.isEmpty()) {
            // These are the port's outgoing connections, so report them as such.
            logger.warn("Outgoing connection was not found: {}", new Object[] { event });
            return;
        }
        // Set group id from outgoing connection if available.
        remotePortProcess.setGroupId(connections.get(0).getGroupId());
        final Referenceable remotePortProcessRef = toReferenceable(remotePortProcess, nifiFlow);
        createEntity(remotePortProcessRef);
        // Create lineage: RemoteOutputPort dataSet -> RemoteOutputPort process
        DataSetRefs remotePortRefs = new DataSetRefs(portProcessId);
        remotePortRefs.addInput(remotePortDataSet);
        addDataSetRefs(remotePortRefs, remotePortProcessRef);
        for (ConnectionStatus connection : connections) {
            final String destinationId = connection.getDestinationId();
            final NiFiFlowPath destFlowPath = nifiFlow.findPath(destinationId);
            if (destFlowPath == null) {
                // No flow path is known for this destination, so skip it here,
                // as a queue will be created by the connected RemoteInputPort to connect this RemoteOutputPort.
                continue;
            }
            // Create a queue.
            Referenceable queueFromRemotePortProcessToStaticFlowPath = new Referenceable(TYPE_NIFI_QUEUE);
            queueFromRemotePortProcessToStaticFlowPath.set(ATTR_NAME, "queue");
            queueFromRemotePortProcessToStaticFlowPath.set(ATTR_QUALIFIED_NAME, nifiFlow.toQualifiedName(destinationId));
            // Create lineage: Queue -> Static flow_path
            DataSetRefs staticFlowPathRefs = new DataSetRefs(destinationId);
            staticFlowPathRefs.addInput(queueFromRemotePortProcessToStaticFlowPath);
            addDataSetRefs(nifiFlow, staticFlowPathRefs);
            // Create lineage: RemoteOutputPort dataSet -> RemoteOutputPort process -> Queue
            remotePortRefs.addOutput(queueFromRemotePortProcessToStaticFlowPath);
            addDataSetRefs(remotePortRefs, remotePortProcessRef);
        }
        // Add RemoteOutputPort process, so that it can be found even if it is connected to RemoteInputPort directly without any processor in between.
        nifiFlow.getFlowPaths().put(remotePortProcess.getId(), remotePortProcess);
    }
}
Also used : Referenceable(org.apache.atlas.typesystem.Referenceable) NiFiFlowPath(org.apache.nifi.atlas.NiFiFlowPath) ProvenanceEventRecord(org.apache.nifi.provenance.ProvenanceEventRecord) DataSetRefs(org.apache.nifi.atlas.provenance.DataSetRefs) ConnectionStatus(org.apache.nifi.controller.status.ConnectionStatus)

Example 18 with DataSetRefs

use of org.apache.nifi.atlas.provenance.DataSetRefs in project nifi by apache.

the class TestHBaseTable method testHBaseTable.

/**
 * A FETCH event from FetchHBaseRow with an hbase:// transit URI should be analyzed
 * into a single input hbase_table DataSet and no outputs.
 */
@Test
public void testHBaseTable() {
    final String componentType = "FetchHBaseRow";
    final String uri = "hbase://0.example.com/tableA/rowB";

    // Any host under example.com resolves to cluster1.
    final ClusterResolvers resolvers = Mockito.mock(ClusterResolvers.class);
    when(resolvers.fromHostNames(matches(".+\\.example\\.com"))).thenReturn("cluster1");
    final AnalysisContext analysisContext = Mockito.mock(AnalysisContext.class);
    when(analysisContext.getClusterResolver()).thenReturn(resolvers);

    // The provenance event under test.
    final ProvenanceEventRecord event = Mockito.mock(ProvenanceEventRecord.class);
    when(event.getComponentType()).thenReturn(componentType);
    when(event.getTransitUri()).thenReturn(uri);
    when(event.getEventType()).thenReturn(ProvenanceEventType.FETCH);

    final NiFiProvenanceEventAnalyzer analyzer =
            NiFiProvenanceEventAnalyzerFactory.getAnalyzer(componentType, uri, event.getEventType());
    assertNotNull(analyzer);

    final DataSetRefs analyzed = analyzer.analyze(analysisContext, event);
    assertEquals(1, analyzed.getInputs().size());
    assertEquals(0, analyzed.getOutputs().size());

    final Referenceable table = analyzed.getInputs().iterator().next();
    assertEquals("hbase_table", table.getTypeName());
    assertEquals("tableA", table.get(ATTR_NAME));
    assertEquals("tableA@cluster1", table.get(ATTR_QUALIFIED_NAME));
}
Also used : Referenceable(org.apache.atlas.typesystem.Referenceable) ClusterResolvers(org.apache.nifi.atlas.resolver.ClusterResolvers) ProvenanceEventRecord(org.apache.nifi.provenance.ProvenanceEventRecord) DataSetRefs(org.apache.nifi.atlas.provenance.DataSetRefs) AnalysisContext(org.apache.nifi.atlas.provenance.AnalysisContext) NiFiProvenanceEventAnalyzer(org.apache.nifi.atlas.provenance.NiFiProvenanceEventAnalyzer) Test(org.junit.Test)

Example 19 with DataSetRefs

use of org.apache.nifi.atlas.provenance.DataSetRefs in project nifi by apache.

the class TestHDFSPath method testHDFSPath.

/**
 * A SEND event from PutHDFS with an hdfs:// transit URI should be analyzed
 * into a single output hdfs_path DataSet and no inputs.
 */
@Test
public void testHDFSPath() {
    final String componentType = "PutHDFS";
    // TODO: what if with HA namenode?
    final String uri = "hdfs://0.example.com:8020/user/nifi/fileA";

    // Any host under example.com resolves to cluster1.
    final ClusterResolvers resolvers = Mockito.mock(ClusterResolvers.class);
    when(resolvers.fromHostNames(matches(".+\\.example\\.com"))).thenReturn("cluster1");
    final AnalysisContext analysisContext = Mockito.mock(AnalysisContext.class);
    when(analysisContext.getClusterResolver()).thenReturn(resolvers);

    // The provenance event under test.
    final ProvenanceEventRecord event = Mockito.mock(ProvenanceEventRecord.class);
    when(event.getComponentType()).thenReturn(componentType);
    when(event.getTransitUri()).thenReturn(uri);
    when(event.getEventType()).thenReturn(ProvenanceEventType.SEND);

    final NiFiProvenanceEventAnalyzer analyzer =
            NiFiProvenanceEventAnalyzerFactory.getAnalyzer(componentType, uri, event.getEventType());
    assertNotNull(analyzer);

    final DataSetRefs analyzed = analyzer.analyze(analysisContext, event);
    assertEquals(0, analyzed.getInputs().size());
    assertEquals(1, analyzed.getOutputs().size());

    final Referenceable path = analyzed.getOutputs().iterator().next();
    assertEquals("hdfs_path", path.getTypeName());
    assertEquals("/user/nifi/fileA", path.get(ATTR_NAME));
    assertEquals("/user/nifi/fileA@cluster1", path.get(ATTR_QUALIFIED_NAME));
}
Also used : Referenceable(org.apache.atlas.typesystem.Referenceable) ClusterResolvers(org.apache.nifi.atlas.resolver.ClusterResolvers) ProvenanceEventRecord(org.apache.nifi.provenance.ProvenanceEventRecord) DataSetRefs(org.apache.nifi.atlas.provenance.DataSetRefs) AnalysisContext(org.apache.nifi.atlas.provenance.AnalysisContext) NiFiProvenanceEventAnalyzer(org.apache.nifi.atlas.provenance.NiFiProvenanceEventAnalyzer) Test(org.junit.Test)

Example 20 with DataSetRefs

use of org.apache.nifi.atlas.provenance.DataSetRefs in project nifi by apache.

the class TestHive2JDBC method testTableLineage.

/**
 * When a provenance event carries input/output table name attributes,
 * table-level lineage can be derived from it: tables without a database prefix are
 * expected under the database of the transit URI, prefixed ones under their own database.
 */
@Test
public void testTableLineage() {
    final String componentType = "PutHiveQL";
    final String uri = "jdbc:hive2://0.example.com:10000/databaseA";

    // Any host under example.com resolves to cluster1.
    final ClusterResolvers resolvers = Mockito.mock(ClusterResolvers.class);
    when(resolvers.fromHostNames(matches(".+\\.example\\.com"))).thenReturn("cluster1");
    final AnalysisContext analysisContext = Mockito.mock(AnalysisContext.class);
    when(analysisContext.getClusterResolver()).thenReturn(resolvers);

    final ProvenanceEventRecord event = Mockito.mock(ProvenanceEventRecord.class);
    when(event.getComponentType()).thenReturn(componentType);
    when(event.getTransitUri()).thenReturn(uri);
    when(event.getEventType()).thenReturn(ProvenanceEventType.SEND);
    // E.g. insert into databaseB.tableB1 select something from tableA1 a1 inner join tableA2 a2 where a1.id = a2.id
    when(event.getAttribute(ATTR_INPUT_TABLES)).thenReturn("tableA1, tableA2");
    when(event.getAttribute(ATTR_OUTPUT_TABLES)).thenReturn("databaseB.tableB1");

    final NiFiProvenanceEventAnalyzer analyzer =
            NiFiProvenanceEventAnalyzerFactory.getAnalyzer(componentType, uri, event.getEventType());
    assertNotNull(analyzer);

    final DataSetRefs analyzed = analyzer.analyze(analysisContext, event);

    // Inputs: expected name, keyed by expected qualified name.
    assertEquals(2, analyzed.getInputs().size());
    final Map<String, String> expectedNameByQualifiedName = new HashMap<>();
    expectedNameByQualifiedName.put("databaseA.tableA1@cluster1", "tableA1");
    expectedNameByQualifiedName.put("databaseA.tableA2@cluster1", "tableA2");
    for (Referenceable input : analyzed.getInputs()) {
        final String qualifiedName = (String) input.get(ATTR_QUALIFIED_NAME);
        assertTrue(expectedNameByQualifiedName.containsKey(qualifiedName));
        assertEquals(expectedNameByQualifiedName.get(qualifiedName), input.get(ATTR_NAME));
    }

    // Output: the single table named with an explicit database.
    assertEquals(1, analyzed.getOutputs().size());
    final Referenceable output = analyzed.getOutputs().iterator().next();
    assertEquals("hive_table", output.getTypeName());
    assertEquals("tableB1", output.get(ATTR_NAME));
    assertEquals("databaseB.tableB1@cluster1", output.get(ATTR_QUALIFIED_NAME));
}
Also used : Referenceable(org.apache.atlas.typesystem.Referenceable) ClusterResolvers(org.apache.nifi.atlas.resolver.ClusterResolvers) HashMap(java.util.HashMap) ProvenanceEventRecord(org.apache.nifi.provenance.ProvenanceEventRecord) DataSetRefs(org.apache.nifi.atlas.provenance.DataSetRefs) AnalysisContext(org.apache.nifi.atlas.provenance.AnalysisContext) NiFiProvenanceEventAnalyzer(org.apache.nifi.atlas.provenance.NiFiProvenanceEventAnalyzer) Test(org.junit.Test)

Aggregations

DataSetRefs (org.apache.nifi.atlas.provenance.DataSetRefs)26 Referenceable (org.apache.atlas.typesystem.Referenceable)22 ProvenanceEventRecord (org.apache.nifi.provenance.ProvenanceEventRecord)21 AnalysisContext (org.apache.nifi.atlas.provenance.AnalysisContext)19 NiFiProvenanceEventAnalyzer (org.apache.nifi.atlas.provenance.NiFiProvenanceEventAnalyzer)18 ClusterResolvers (org.apache.nifi.atlas.resolver.ClusterResolvers)18 Test (org.junit.Test)18 HashMap (java.util.HashMap)7 ArrayList (java.util.ArrayList)6 ConnectionStatus (org.apache.nifi.controller.status.ConnectionStatus)5 Tuple (org.apache.nifi.util.Tuple)4 NiFiFlowPath (org.apache.nifi.atlas.NiFiFlowPath)3 List (java.util.List)2 ComputeLineageResult (org.apache.nifi.provenance.lineage.ComputeLineageResult)2 URI (java.net.URI)1 StandardCharsets (java.nio.charset.StandardCharsets)1 Collections (java.util.Collections)1 Map (java.util.Map)1 Objects (java.util.Objects)1 Matcher (java.util.regex.Matcher)1