Use of org.apache.nifi.atlas.provenance.DataSetRefs in project nifi by apache.
The class CompleteFlowPathLineage, method createCompleteFlowPath.
/**
 * Create a new FlowPath from a LineagePath. FlowPaths created by this method have a hash in their qualified names.
 *
 * <p>This method processes parents first to generate a hash, as parent LineagePath hashes contribute to the child hash,
 * in order to distinguish FlowPaths based on the complete path for a given FlowFile.
 * For example, even if two LineagePaths have identical componentIds/inputs/outputs,
 * they should be treated as different paths if their parents have different inputs.</p>
 *
 * @param nifiFlow A reference to the current NiFiFlow
 * @param lineagePath LineagePath from which NiFiFlowPath and DataSet refs are created and added to {@code createdFlowPaths}.
 * @param createdFlowPaths A list to buffer created NiFiFlowPaths,
 * in order to defer sending notifications to Kafka until all parent FlowPaths are analyzed.
 */
private void createCompleteFlowPath(NiFiFlow nifiFlow, LineagePath lineagePath, List<Tuple<NiFiFlowPath, DataSetRefs>> createdFlowPaths) {
    final List<ProvenanceEventRecord> events = lineagePath.getEvents();
    Collections.reverse(events);

    final List<String> componentIds = events.stream().map(ProvenanceEventRecord::getComponentId).collect(Collectors.toList());
    final String firstComponentId = events.get(0).getComponentId();
    final DataSetRefs dataSetRefs = lineagePath.getRefs();

    // Process parents first.
    Referenceable queueBetweenParent = null;
    if (!lineagePath.getParents().isEmpty()) {
        // Add a queue between this lineage path and its parents.
        queueBetweenParent = new Referenceable(TYPE_NIFI_QUEUE);
        // The first event knows why this lineage has parents, e.g. FORK or JOIN.
        final String firstEventType = events.get(0).getEventType().name();
        queueBetweenParent.set(ATTR_NAME, firstEventType);
        dataSetRefs.addInput(queueBetweenParent);

        for (LineagePath parent : lineagePath.getParents()) {
            parent.getRefs().addOutput(queueBetweenParent);
            createCompleteFlowPath(nifiFlow, parent, createdFlowPaths);
        }
    }

    // Create a variant path.
    // Calculate a hash from component ids and input/output resource ids.
    final Stream<String> ioIds = Stream.concat(dataSetRefs.getInputs().stream(), dataSetRefs.getOutputs().stream())
            .map(ref -> toTypedQualifiedName(ref.getTypeName(), toStr(ref.get(ATTR_QUALIFIED_NAME))));
    final Stream<String> parentHashes = lineagePath.getParents().stream().map(p -> String.valueOf(p.getLineagePathHash()));
    final CRC32 crc32 = new CRC32();
    crc32.update(Stream.of(componentIds.stream(), ioIds, parentHashes)
            .reduce(Stream::concat).orElseGet(Stream::empty)
            .sorted().distinct()
            .collect(Collectors.joining(",")).getBytes(StandardCharsets.UTF_8));
    final long hash = crc32.getValue();
    lineagePath.setLineagePathHash(hash);
    final NiFiFlowPath flowPath = new NiFiFlowPath(firstComponentId, hash);

    // E.g. if FF1 and FF2 read from dirA were merged, while FF3 and FF4 read from dirB were merged and then passed here,
    // these two should be different queues.
    if (queueBetweenParent != null) {
        queueBetweenParent.set(ATTR_QUALIFIED_NAME, toQualifiedName(nifiFlow.getClusterName(), firstComponentId + "::" + hash));
    }

    // If the same component emitted multiple provenance events consecutively, merge them to produce a simpler name.
    String previousComponentId = null;
    List<ProvenanceEventRecord> uniqueEventsForName = new ArrayList<>();
    for (ProvenanceEventRecord event : events) {
        if (!event.getComponentId().equals(previousComponentId)) {
            uniqueEventsForName.add(event);
        }
        previousComponentId = event.getComponentId();
    }
    final String pathName = uniqueEventsForName.stream()
            .map(event -> nifiFlow.getProcessComponentName(event.getComponentId(), event::getComponentType))
            .collect(Collectors.joining(", "));
    flowPath.setName(pathName);

    final NiFiFlowPath staticFlowPath = nifiFlow.findPath(firstComponentId);
    flowPath.setGroupId(staticFlowPath != null ? staticFlowPath.getGroupId() : nifiFlow.getRootProcessGroupId());

    // To defer sending notifications until the entire lineagePath analysis is finished, just add the instance to a buffer.
    createdFlowPaths.add(new Tuple<>(flowPath, dataSetRefs));
}
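To make the hash composition above concrete, here is a minimal, self-contained sketch, not the NiFi implementation, of combining component ids, dataset ids, and parent hashes into a CRC32 value. All ids and cluster names below are hypothetical; the point is that identical components and datasets still produce different path hashes when the parent hashes differ.

import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.CRC32;

public class LineagePathHashSketch {
    // Hypothetical helper mirroring the sorted/distinct/joined CRC32 composition above.
    static long hashPath(List<String> componentIds, List<String> ioIds, List<Long> parentHashes) {
        final String joined = Stream.of(componentIds.stream(), ioIds.stream(), parentHashes.stream().map(String::valueOf))
                .reduce(Stream::concat).orElseGet(Stream::empty)
                .sorted().distinct()
                .collect(Collectors.joining(","));
        final CRC32 crc32 = new CRC32();
        crc32.update(joined.getBytes(StandardCharsets.UTF_8));
        return crc32.getValue();
    }

    public static void main(String[] args) {
        // Same child components and datasets, different parent hashes -> different path hashes.
        final long parentA = hashPath(List.of("gen1"), List.of("hdfs_path::/dirA@c1"), List.of());
        final long parentB = hashPath(List.of("gen2"), List.of("hdfs_path::/dirB@c1"), List.of());
        final long childA = hashPath(List.of("merge", "put"), List.of("nifi_queue::MERGE@c1"), List.of(parentA));
        final long childB = hashPath(List.of("merge", "put"), List.of("nifi_queue::MERGE@c1"), List.of(parentB));
        System.out.println(childA != childB); // true: the parents' inputs distinguish the children
    }
}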
Use of org.apache.nifi.atlas.provenance.DataSetRefs in project nifi by apache.
The class SimpleFlowPathLineage, method processRemotePortEvent.
/**
 * Create a flow_path entity corresponding to the target RemoteGroupPort when a SEND/RECEIVE event is received.
 *
 * <p>Such an entity cannot be created in advance while analyzing the flow statically,
 * because a ReportingTask cannot determine whether a component id belongs to a RemoteGroupPort:
 * ConnectionStatus is the only information available in the ReportingContext,
 * and it only knows component ids, not component types.
 * For example, there is no way to tell whether a connected component is a funnel or a RemoteGroupPort.</p>
 */
private void processRemotePortEvent(AnalysisContext analysisContext, NiFiFlow nifiFlow, ProvenanceEventRecord event, DataSetRefs analyzedRefs) {
    final boolean isRemoteInputPort = "Remote Input Port".equals(event.getComponentType());

    // Create a remote port process.
    // event.getComponentId returns the UUID of the RemoteGroupPort acting as an S2S client,
    // which is different from the remote port UUID (portDataSetId). See NIFI-4571 for details.
    final Referenceable remotePortDataSet = isRemoteInputPort ? analyzedRefs.getOutputs().iterator().next() : analyzedRefs.getInputs().iterator().next();
    final String portProcessId = event.getComponentId();

    final NiFiFlowPath remotePortProcess = new NiFiFlowPath(portProcessId);
    remotePortProcess.setName(event.getComponentType());
    remotePortProcess.addProcessor(portProcessId);

    if (isRemoteInputPort) {
        // Finding the previous component is only possible by calling the lineage API.
        final ProvenanceEventRecord previousEvent = findPreviousProvenanceEvent(analysisContext, event);
        if (previousEvent == null) {
            logger.warn("Previous event was not found: {}", new Object[] { event });
            return;
        }

        // Set groupId from the incoming connection if available.
        final List<ConnectionStatus> incomingConnections = nifiFlow.getIncomingConnections(portProcessId);
        if (incomingConnections == null || incomingConnections.isEmpty()) {
            logger.warn("Incoming connection was not found: {}", new Object[] { event });
            return;
        }
        final ConnectionStatus connection = incomingConnections.get(0);
        remotePortProcess.setGroupId(connection.getGroupId());

        final Referenceable remotePortProcessRef = toReferenceable(remotePortProcess, nifiFlow);
        createEntity(remotePortProcessRef);

        // Create a queue.
        Referenceable queueFromStaticFlowPathToRemotePortProcess = new Referenceable(TYPE_NIFI_QUEUE);
        queueFromStaticFlowPathToRemotePortProcess.set(ATTR_NAME, "queue");
        queueFromStaticFlowPathToRemotePortProcess.set(ATTR_QUALIFIED_NAME, nifiFlow.toQualifiedName(portProcessId));

        // Create lineage: static flow_path -> queue
        DataSetRefs staticFlowPathRefs = new DataSetRefs(previousEvent.getComponentId());
        staticFlowPathRefs.addOutput(queueFromStaticFlowPathToRemotePortProcess);
        addDataSetRefs(nifiFlow, staticFlowPathRefs);

        // Create lineage: queue -> RemoteInputPort process -> RemoteInputPort dataSet
        DataSetRefs remotePortRefs = new DataSetRefs(portProcessId);
        remotePortRefs.addInput(queueFromStaticFlowPathToRemotePortProcess);
        remotePortRefs.addOutput(remotePortDataSet);
        addDataSetRefs(remotePortRefs, remotePortProcessRef);
    } else {
        // For a RemoteOutputPort, multiple processors may be connected.
        // In that case, the received FlowFile is cloned and passed to each connection,
        // so multiple DataSetRefs need to be created.
        final List<ConnectionStatus> connections = nifiFlow.getOutgoingConnections(portProcessId);
        if (connections == null || connections.isEmpty()) {
            logger.warn("Outgoing connection was not found: {}", new Object[] { event });
            return;
        }

        // Set groupId from the outgoing connection if available.
        remotePortProcess.setGroupId(connections.get(0).getGroupId());

        final Referenceable remotePortProcessRef = toReferenceable(remotePortProcess, nifiFlow);
        createEntity(remotePortProcessRef);

        // Create lineage: RemoteOutputPort dataSet -> RemoteOutputPort process
        DataSetRefs remotePortRefs = new DataSetRefs(portProcessId);
        remotePortRefs.addInput(remotePortDataSet);
        addDataSetRefs(remotePortRefs, remotePortProcessRef);

        for (ConnectionStatus connection : connections) {
            final String destinationId = connection.getDestinationId();
            final NiFiFlowPath destFlowPath = nifiFlow.findPath(destinationId);
            if (destFlowPath == null) {
                // Skip, as a queue will be created by the connected RemoteInputPort to connect this RemoteOutputPort.
                continue;
            }

            // Create a queue.
            Referenceable queueFromRemotePortProcessToStaticFlowPath = new Referenceable(TYPE_NIFI_QUEUE);
            queueFromRemotePortProcessToStaticFlowPath.set(ATTR_NAME, "queue");
            queueFromRemotePortProcessToStaticFlowPath.set(ATTR_QUALIFIED_NAME, nifiFlow.toQualifiedName(destinationId));

            // Create lineage: queue -> static flow_path
            DataSetRefs staticFlowPathRefs = new DataSetRefs(destinationId);
            staticFlowPathRefs.addInput(queueFromRemotePortProcessToStaticFlowPath);
            addDataSetRefs(nifiFlow, staticFlowPathRefs);

            // Create lineage: RemoteOutputPort dataSet -> RemoteOutputPort process -> queue
            remotePortRefs.addOutput(queueFromRemotePortProcessToStaticFlowPath);
            addDataSetRefs(remotePortRefs, remotePortProcessRef);
        }

        // Add the RemoteOutputPort process so that it can be found even if it is connected to a RemoteInputPort directly,
        // without any processor in between.
        nifiFlow.getFlowPaths().put(remotePortProcess.getId(), remotePortProcess);
    }
}
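To illustrate the static-analysis limitation described in the Javadoc above: ConnectionStatus exposes ids and names but no component type. The loop below is illustrative only; getConnectionStatus, getDestinationId, and getDestinationName are existing accessors in NiFi's status API, while the surrounding variable names and logic are a sketch.

// Illustrative sketch: at ReportingTask time, only ConnectionStatus is available,
// and it carries no component type, so a funnel and a RemoteGroupPort are indistinguishable.
for (ConnectionStatus cs : processGroupStatus.getConnectionStatus()) {
    final String destId = cs.getDestinationId();     // a UUID; processor, funnel, or RemoteGroupPort?
    final String destName = cs.getDestinationName(); // a display name, still no type information
    // There is no destination-type accessor, hence the runtime check on
    // event.getComponentType() ("Remote Input Port") in processRemotePortEvent above.
}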
Use of org.apache.nifi.atlas.provenance.DataSetRefs in project nifi by apache.
The class TestHBaseTable, method testHBaseTable.
@Test
public void testHBaseTable() {
    final String processorName = "FetchHBaseRow";
    final String transitUri = "hbase://0.example.com/tableA/rowB";
    final ProvenanceEventRecord record = Mockito.mock(ProvenanceEventRecord.class);
    when(record.getComponentType()).thenReturn(processorName);
    when(record.getTransitUri()).thenReturn(transitUri);
    when(record.getEventType()).thenReturn(ProvenanceEventType.FETCH);

    final ClusterResolvers clusterResolvers = Mockito.mock(ClusterResolvers.class);
    when(clusterResolvers.fromHostNames(matches(".+\\.example\\.com"))).thenReturn("cluster1");

    final AnalysisContext context = Mockito.mock(AnalysisContext.class);
    when(context.getClusterResolver()).thenReturn(clusterResolvers);

    final NiFiProvenanceEventAnalyzer analyzer = NiFiProvenanceEventAnalyzerFactory.getAnalyzer(processorName, transitUri, record.getEventType());
    assertNotNull(analyzer);

    final DataSetRefs refs = analyzer.analyze(context, record);
    assertEquals(1, refs.getInputs().size());
    assertEquals(0, refs.getOutputs().size());

    Referenceable ref = refs.getInputs().iterator().next();
    assertEquals("hbase_table", ref.getTypeName());
    assertEquals("tableA", ref.get(ATTR_NAME));
    assertEquals("tableA@cluster1", ref.get(ATTR_QUALIFIED_NAME));
}
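The assertions above expect the table name to come from the transit URI path and the qualified name to follow the table@cluster pattern. A minimal sketch of that derivation, using hypothetical parsing rather than the analyzer's actual code:

// Hypothetical parsing of the transit URI "hbase://0.example.com/tableA/rowB":
java.net.URI uri = java.net.URI.create("hbase://0.example.com/tableA/rowB");
String host = uri.getHost();                     // "0.example.com", resolved to "cluster1" by ClusterResolvers
String table = uri.getPath().split("/")[1];      // "tableA" (the row id "rowB" is ignored)
String qualifiedName = table + "@" + "cluster1"; // "tableA@cluster1"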
Use of org.apache.nifi.atlas.provenance.DataSetRefs in project nifi by apache.
The class TestHDFSPath, method testHDFSPath.
@Test
public void testHDFSPath() {
    final String processorName = "PutHDFS";
    // TODO: what if with HA namenode?
    final String transitUri = "hdfs://0.example.com:8020/user/nifi/fileA";
    final ProvenanceEventRecord record = Mockito.mock(ProvenanceEventRecord.class);
    when(record.getComponentType()).thenReturn(processorName);
    when(record.getTransitUri()).thenReturn(transitUri);
    when(record.getEventType()).thenReturn(ProvenanceEventType.SEND);

    final ClusterResolvers clusterResolvers = Mockito.mock(ClusterResolvers.class);
    when(clusterResolvers.fromHostNames(matches(".+\\.example\\.com"))).thenReturn("cluster1");

    final AnalysisContext context = Mockito.mock(AnalysisContext.class);
    when(context.getClusterResolver()).thenReturn(clusterResolvers);

    final NiFiProvenanceEventAnalyzer analyzer = NiFiProvenanceEventAnalyzerFactory.getAnalyzer(processorName, transitUri, record.getEventType());
    assertNotNull(analyzer);

    final DataSetRefs refs = analyzer.analyze(context, record);
    assertEquals(0, refs.getInputs().size());
    assertEquals(1, refs.getOutputs().size());

    Referenceable ref = refs.getOutputs().iterator().next();
    assertEquals("hdfs_path", ref.getTypeName());
    assertEquals("/user/nifi/fileA", ref.get(ATTR_NAME));
    assertEquals("/user/nifi/fileA@cluster1", ref.get(ATTR_QUALIFIED_NAME));
}
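Here the whole URI path becomes both the name and the prefix of the qualified name. A sketch under the same hypothetical-parsing caveat as above:

// Hypothetical parsing of "hdfs://0.example.com:8020/user/nifi/fileA":
java.net.URI uri = java.net.URI.create("hdfs://0.example.com:8020/user/nifi/fileA");
String path = uri.getPath();                    // "/user/nifi/fileA" -> ATTR_NAME
String qualifiedName = path + "@" + "cluster1"; // "/user/nifi/fileA@cluster1" -> ATTR_QUALIFIED_NAME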
Use of org.apache.nifi.atlas.provenance.DataSetRefs in project nifi by apache.
The class TestHive2JDBC, method testTableLineage.
/**
* If a provenance event has table name attributes,
* then table lineages can be created.
*/
@Test
public void testTableLineage() {
    final String processorName = "PutHiveQL";
    final String transitUri = "jdbc:hive2://0.example.com:10000/databaseA";
    final ProvenanceEventRecord record = Mockito.mock(ProvenanceEventRecord.class);
    when(record.getComponentType()).thenReturn(processorName);
    when(record.getTransitUri()).thenReturn(transitUri);
    when(record.getEventType()).thenReturn(ProvenanceEventType.SEND);
    // E.g. insert into databaseB.tableB1 select something from tableA1 a1 inner join tableA2 a2 where a1.id = a2.id
    when(record.getAttribute(ATTR_INPUT_TABLES)).thenReturn("tableA1, tableA2");
    when(record.getAttribute(ATTR_OUTPUT_TABLES)).thenReturn("databaseB.tableB1");

    final ClusterResolvers clusterResolvers = Mockito.mock(ClusterResolvers.class);
    when(clusterResolvers.fromHostNames(matches(".+\\.example\\.com"))).thenReturn("cluster1");

    final AnalysisContext context = Mockito.mock(AnalysisContext.class);
    when(context.getClusterResolver()).thenReturn(clusterResolvers);

    final NiFiProvenanceEventAnalyzer analyzer = NiFiProvenanceEventAnalyzerFactory.getAnalyzer(processorName, transitUri, record.getEventType());
    assertNotNull(analyzer);

    final DataSetRefs refs = analyzer.analyze(context, record);
    assertEquals(2, refs.getInputs().size());

    // QualifiedName : Name
    final Map<String, String> expectedInputRefs = new HashMap<>();
    expectedInputRefs.put("databaseA.tableA1@cluster1", "tableA1");
    expectedInputRefs.put("databaseA.tableA2@cluster1", "tableA2");
    for (Referenceable ref : refs.getInputs()) {
        final String qName = (String) ref.get(ATTR_QUALIFIED_NAME);
        assertTrue(expectedInputRefs.containsKey(qName));
        assertEquals(expectedInputRefs.get(qName), ref.get(ATTR_NAME));
    }

    assertEquals(1, refs.getOutputs().size());
    Referenceable ref = refs.getOutputs().iterator().next();
    assertEquals("hive_table", ref.getTypeName());
    assertEquals("tableB1", ref.get(ATTR_NAME));
    assertEquals("databaseB.tableB1@cluster1", ref.get(ATTR_QUALIFIED_NAME));
}
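The expected values imply that unqualified input tables are prefixed with the database from the JDBC URL (databaseA), while the explicitly qualified databaseB.tableB1 keeps its own database. A minimal sketch of that resolution, with the splitting and defaulting logic assumed rather than taken from the analyzer:

// Hypothetical resolution of the comma-separated table attribute "tableA1, tableA2":
String connectedDatabase = "databaseA"; // from jdbc:hive2://0.example.com:10000/databaseA
for (String name : "tableA1, tableA2".split(",")) {
    String tableName = name.trim();
    String qualifiedName = (tableName.contains(".") ? tableName : connectedDatabase + "." + tableName) + "@cluster1";
    // -> "databaseA.tableA1@cluster1", then "databaseA.tableA2@cluster1"
}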