Search in sources :

Example 51 with ProvenanceEventRecord

use of org.apache.nifi.provenance.ProvenanceEventRecord in project nifi by apache.

the class LineageQuery method computeLineageForFlowFiles.

/**
 * Searches a single provenance index directory for the events that belong to the given FlowFile UUIDs.
 * <p>
 * The visible code builds a Lucene BooleanQuery containing one SHOULD TermQuery per UUID (on the
 * FlowFileUUID searchable field), runs it on a searcher borrowed from the IndexManager with a cap of
 * MAX_QUERY_RESULTS hits, and hands the resulting TopDocs to docsToEventConverter.
 * <p>
 * NOTE(review): this listing appears to have lost lines during extraction. Several blocks are opened
 * but never closed, and the finally block has no body. Comments below describe only what is visible.
 *
 * @param indexManager         provides the EventIndexSearcher for indexDirectory
 * @param indexDirectory       the index directory to search
 * @param lineageIdentifier    only consulted by the argument check; the visible code does not use it to build the query
 * @param flowFileUuids        UUIDs to search for; passed through requireNonNull and limited to MAX_LINEAGE_UUIDS entries
 * @param docsToEventConverter converts the matching Lucene documents into ProvenanceEventRecords
 * @return the converted events, or an empty set from the FileNotFoundException handler
 * @throws IllegalArgumentException if more than MAX_LINEAGE_UUIDS UUIDs are supplied, or if neither a
 *                                  lineage identifier nor any UUIDs are supplied
 * @throws IOException              declared by the signature; presumably raised by the index search or conversion
 */
public static Set<ProvenanceEventRecord> computeLineageForFlowFiles(final IndexManager indexManager, final File indexDirectory, final String lineageIdentifier, final Collection<String> flowFileUuids, final DocumentToEventConverter docsToEventConverter) throws IOException {
    // Reject oversized requests up front. requireNonNull also rejects a null collection here
    // (presumably java.util.Objects.requireNonNull via static import - TODO confirm).
    if (requireNonNull(flowFileUuids).size() > MAX_LINEAGE_UUIDS) {
        throw new IllegalArgumentException(String.format("Cannot compute lineage for more than %s FlowFiles. This lineage contains %s.", MAX_LINEAGE_UUIDS, flowFileUuids.size()));
    // NOTE(review): flowFileUuids already went through requireNonNull above, so the null half of this
    // check looks unreachable; only the isEmpty() half can fire.
    if (lineageIdentifier == null && (flowFileUuids == null || flowFileUuids.isEmpty())) {
        throw new IllegalArgumentException("Must specify either Lineage Identifier or FlowFile UUIDs to compute lineage");
    final EventIndexSearcher searcher;
    try {
        searcher = indexManager.borrowIndexSearcher(indexDirectory);
        try {
            // Create a query for all Events related to the FlowFiles of interest. We do this by adding all ID's as
            // "SHOULD" clauses and then setting the minimum required to 1.
            // NOTE(review): no call that sets the minimum-should-match value is visible in this listing;
            // it may have been lost in extraction - TODO confirm against the real source.
            final BooleanQuery flowFileIdQuery;
            if (flowFileUuids == null || flowFileUuids.isEmpty()) {
                flowFileIdQuery = null;
            } else {
                flowFileIdQuery = new BooleanQuery();
                for (final String flowFileUuid : flowFileUuids) {
                    flowFileIdQuery.add(new TermQuery(new Term(SearchableFields.FlowFileUUID.getSearchableFieldName(), flowFileUuid)), Occur.SHOULD);
            // Time the Lucene search and the document-to-event conversion separately for the debug log below.
            final long searchStart = System.nanoTime();
            logger.debug("Searching {} for {}", indexDirectory, flowFileIdQuery);
            // NOTE(review): flowFileIdQuery is null when flowFileUuids is empty (reachable when a lineage
            // identifier is given without UUIDs); confirm that search() tolerates a null query.
            final TopDocs uuidQueryTopDocs = searcher.getIndexSearcher().search(flowFileIdQuery, MAX_QUERY_RESULTS);
            final long searchEnd = System.nanoTime();
            final Set<ProvenanceEventRecord> recs = docsToEventConverter.convert(uuidQueryTopDocs, searcher.getIndexSearcher().getIndexReader());
            final long readDocsEnd = System.nanoTime();
            logger.debug("Finished Lineage Query against {}; Lucene search took {} millis, reading records took {} millis", indexDirectory, TimeUnit.NANOSECONDS.toMillis(searchEnd - searchStart), TimeUnit.NANOSECONDS.toMillis(readDocsEnd - searchEnd));
            return recs;
        // NOTE(review): the finally body is not visible here; presumably it hands the borrowed searcher
        // back to indexManager - TODO confirm.
        } finally {
    } catch (final FileNotFoundException fnfe) {
        // nothing has been indexed yet, or the data has already aged off
        logger.warn("Attempted to search Provenance Index {} but could not find the file due to {}", indexDirectory, fnfe);
        // NOTE(review): the guard tests the DEBUG level but the stack trace is emitted at WARN level.
        if (logger.isDebugEnabled()) {
            logger.warn("", fnfe);
        return Collections.emptySet();
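Also used : TopDocs(org.apache.lucene.search.TopDocs) BooleanQuery(org.apache.lucene.search.BooleanQuery) TermQuery(org.apache.lucene.search.TermQuery) EventIndexSearcher(org.apache.nifi.provenance.index.EventIndexSearcher) ProvenanceEventRecord(org.apache.nifi.provenance.ProvenanceEventRecord) FileNotFoundException(java.io.FileNotFoundException) Term(org.apache.lucene.index.Term)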

Example 52 with ProvenanceEventRecord

use of org.apache.nifi.provenance.ProvenanceEventRecord in project nifi by apache.

the class WriteAheadStorePartition method reindexLatestEvents.

/**
 * Re-indexes the most recently written events of this partition into the given EventIndex.
 * <p>
 * Visible steps: list the event files sorted smallest-ID-first, ask the index for the minimum event ID
 * that needs re-indexing, pick the sub-list of files starting at the file chosen by the backward scan,
 * and create one re-index task per file on a fixed thread pool of at most 4 threads. Each task reads
 * events from its file and collects them, with a StorageSummary, into a map.
 * <p>
 * NOTE(review): this listing appears to have lost lines during extraction (unclosed blocks, empty
 * try bodies, logging calls reduced to their argument lists). Comments below describe only what is
 * visible and mark the gaps.
 *
 * @param eventIndex the index that reports the minimum event ID to re-index for this partition
 */
void reindexLatestEvents(final EventIndex eventIndex) {
    final List<File> eventFiles = getEventFilesFromDisk().sorted(DirectoryUtils.SMALLEST_ID_FIRST).collect(Collectors.toList());
    // NOTE(review): the body of this check is not visible; presumably nothing to do, so return - TODO confirm.
    if (eventFiles.isEmpty()) {
    final long minEventIdToReindex = eventIndex.getMinimumEventIdToReindex(partitionName);
    final long maxEventId = getMaxEventId();
    // NOTE(review): the text after the semicolon on the next line looks like the argument list of a
    // logging call whose prefix was stripped in extraction - TODO confirm.
    final long eventsToReindex = maxEventId - minEventIdToReindex;"The last Provenance Event indexed for partition {} is {}, but the last event written to partition has ID {}. " + "Re-indexing up to the last {} events to ensure that the Event Index is accurate and up-to-date", partitionName, minEventIdToReindex, maxEventId, eventsToReindex, partitionDirectory);
    // Find the first event file that we care about.
    // The scan walks from the newest file backwards and records a file whose minimum ID is at or below
    // the re-index threshold. NOTE(review): no break is visible after the assignment - TODO confirm.
    int firstEventFileIndex = 0;
    for (int i = eventFiles.size() - 1; i >= 0; i--) {
        final File eventFile = eventFiles.get(i);
        final long minIdInFile = DirectoryUtils.getMinId(eventFile);
        if (minIdInFile <= minEventIdToReindex) {
            firstEventFileIndex = i;
    // Create a subList that contains the files of interest
    final List<File> eventFilesToReindex = eventFiles.subList(firstEventFileIndex, eventFiles.size());
    final ExecutorService executor = Executors.newFixedThreadPool(Math.min(4, eventFilesToReindex.size()), new NamedThreadFactory("Re-Index Provenance Events", true));
    final List<Future<?>> futures = new ArrayList<>(eventFilesToReindex.size());
    final AtomicLong reindexedCount = new AtomicLong(0L);
    // Re-Index the last bunch of events.
    // We don't use an Event Iterator here because it's possible that one of the event files could be corrupt (for example, if NiFi dies while
    // writing to the file, a record may be incomplete). We don't want to prevent us from moving on and continuing to index the rest of the
    // un-indexed events. So we just use a List of files and create a reader for each one.
    final long start = System.nanoTime();
    int fileCount = 0;
    for (final File eventFile : eventFilesToReindex) {
        // Only the first file in the sub-list is flagged for skipping ahead to minEventIdToReindex.
        final boolean skipToEvent;
        if (fileCount++ == 0) {
            skipToEvent = true;
        } else {
            skipToEvent = false;
        final Runnable reindexTask = new Runnable() {

            public void run() {
                // Events read from this file, keyed to where they are stored.
                final Map<ProvenanceEventRecord, StorageSummary> storageMap = new HashMap<>(1000);
                try (final RecordReader recordReader = recordReaderFactory.newRecordReader(eventFile, Collections.emptyList(), Integer.MAX_VALUE)) {
                    if (skipToEvent) {
                        final Optional<ProvenanceEventRecord> eventOption = recordReader.skipToEvent(minEventIdToReindex);
                        // NOTE(review): handling for a missing event is not visible in this listing.
                        if (!eventOption.isPresent()) {
                    StandardProvenanceEventRecord event = null;
                    while (true) {
                        // Event size is measured as the number of bytes the reader consumed for this record.
                        final long startBytesConsumed = recordReader.getBytesConsumed();
                        event = recordReader.nextRecord();
                        if (event == null) {
                            // stop reading from this file
                        } else {
                            final long eventSize = recordReader.getBytesConsumed() - startBytesConsumed;
                            storageMap.put(event, new StorageSummary(event.getEventId(), eventFile.getName(), partitionName, recordReader.getBlockIndex(), eventSize, 0L));
                            // NOTE(review): the body for a full batch of 1000 is not visible; presumably the
                            // batch is handed to the index and the map is cleared - TODO confirm.
                            if (storageMap.size() == 1000) {
                } catch (final EOFException eof) {
                    // Ran out of data. Continue on.
                    logger.warn("Failed to find event with ID {} in Event File {} due to {}", minEventIdToReindex, eventFile, eof.toString());
                } catch (final Exception e) {
                    logger.error("Failed to index Provenance Events found in {}", eventFile, e);
    // NOTE(review): the try body is not visible; presumably it waits on each future - TODO confirm.
    for (final Future<?> future : futures) {
        try {
        } catch (final ExecutionException ee) {
            logger.error("Failed to re-index some Provenance events. These events may not be query-able via the Provenance interface", ee.getCause());
        } catch (final InterruptedException e) {
            // NOTE(review): only a log call is visible here; confirm whether the interrupt flag is restored.
            logger.error("Interrupted while waiting for Provenance events to be re-indexed", e);
    // NOTE(review): the try body is not visible in this listing.
    try {
    } catch (final IOException e) {
        logger.error("Failed to re-index Provenance Events for partition " + partitionName, e);
    // Elapsed time is split into whole seconds and the millisecond remainder for the final message.
    final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
    final long seconds = millis / 1000L;
    final long millisRemainder = millis % 1000L;"Finished re-indexing {} events across {} files for {} in {}.{} seconds", reindexedCount.get(), eventFilesToReindex.size(), partitionDirectory, seconds, millisRemainder);
Also used : HashMap(java.util.HashMap) RecordReader(org.apache.nifi.provenance.serialization.RecordReader) ArrayList(java.util.ArrayList) StandardProvenanceEventRecord(org.apache.nifi.provenance.StandardProvenanceEventRecord) StorageSummary(org.apache.nifi.provenance.serialization.StorageSummary) ProvenanceEventRecord(org.apache.nifi.provenance.ProvenanceEventRecord) StandardProvenanceEventRecord(org.apache.nifi.provenance.StandardProvenanceEventRecord) EOFException(java.io.EOFException) ExecutionException(java.util.concurrent.ExecutionException) NamedThreadFactory(org.apache.nifi.provenance.util.NamedThreadFactory) IOException(java.io.IOException) IOException(java.io.IOException) EOFException(java.io.EOFException) ExecutionException(java.util.concurrent.ExecutionException) AtomicLong(java.util.concurrent.atomic.AtomicLong) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) File(java.io.File)

Example 53 with ProvenanceEventRecord

use of org.apache.nifi.provenance.ProvenanceEventRecord in project nifi by apache.

the class SelectiveRecordReaderEventIterator method nextEvent.

/**
 * Returns the next event whose ID is supplied by idIterator.
 * <p>
 * For each ID the visible code resolves the file that should contain it, switches the record reader
 * when that file differs from the one currently open, and asks the reader to skip to the event. The
 * event is returned only when it is present and its ID equals the requested ID. When the ID iterator
 * is exhausted, an empty Optional is returned. The elapsed time is logged at TRACE level in all cases.
 * <p>
 * NOTE(review): this listing appears to have lost lines during extraction (unclosed blocks, an empty
 * try body, a truncated initializer). Comments below describe only what is visible.
 *
 * @return the next matching event, or Optional.empty() once idIterator has no more IDs
 * @throws IOException if this iterator has already been closed; the signature also allows it to
 *                     propagate from the reader calls
 */
public Optional<ProvenanceEventRecord> nextEvent() throws IOException {
    if (closed) {
        throw new IOException("EventIterator is already closed");
    final long start = System.nanoTime();
    try {
        while (idIterator.hasNext()) {
            // Determine the next event ID to fetch
            // NOTE(review): the initializer on the next line was truncated in extraction; presumably it
            // takes the next value from idIterator - TODO confirm.
            final long eventId =;
            // Determine which file the event should be in.
            final File fileForEvent = getFileForEventId(eventId);
            // NOTE(review): handling for an unknown file is not visible; presumably the ID is skipped.
            if (fileForEvent == null) {
            try {
                // we are currently reading from, rotate the reader to the appropriate one.
                if (!fileForEvent.equals(currentFile)) {
                    if (reader != null) {
                        // NOTE(review): the try body is not visible; presumably it closes the old reader.
                        try {
                        } catch (final Exception e) {
                            logger.warn("Failed to close {}; some resources may not be cleaned up appropriately", reader);
                    reader = readerFactory.newRecordReader(fileForEvent, Collections.emptyList(), maxAttributeChars);
                    this.currentFile = fileForEvent;
                final Optional<ProvenanceEventRecord> eventOption = reader.skipToEvent(eventId);
                // Accept the result only when it is exactly the requested event.
                if (eventOption.isPresent() && eventOption.get().getEventId() == eventId) {
                    // consume the event from the stream.
                    return eventOption;
            } catch (final FileNotFoundException | EOFException e) {
                // A missing or truncated file is logged at WARN level here rather than rethrown.
                logger.warn("Failed to retrieve Event with ID {}", eventId, e);
        return Optional.empty();
    } finally {
        final long ms = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
        logger.trace("Took {} ms to read next event", ms);
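Also used : ProvenanceEventRecord(org.apache.nifi.provenance.ProvenanceEventRecord) FileNotFoundException(java.io.FileNotFoundException) EOFException(java.io.EOFException) IOException(java.io.IOException) File(java.io.File) IOException(java.io.IOException) EOFException(java.io.EOFException) FileNotFoundException(java.io.FileNotFoundException)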

Example 54 with ProvenanceEventRecord

use of org.apache.nifi.provenance.ProvenanceEventRecord in project nifi by apache.

the class PutParquetTest method testWriteAvroParquetWithDefaults.

/**
 * Checks the PutParquet success path with default settings.
 * <p>
 * The visible assertions cover: exactly one FlowFile routed to REL_SUCCESS; the absolute path,
 * filename and record-count attributes on that FlowFile; a single SEND provenance event whose transit
 * URI ends with the target path; and the written file content via verifyAvroParquetUsers.
 * <p>
 * NOTE(review): this listing appears to have lost lines during extraction; the stray semicolon after
 * enqueue and the two unused File locals at the end suggest that calls were dropped - TODO confirm.
 *
 * @throws IOException             declared by the signature
 * @throws InitializationException declared by the signature
 */
public void testWriteAvroParquetWithDefaults() throws IOException, InitializationException {
    configure(proc, 100);
    // The timestamp suffix keeps the target filename distinct between runs.
    final String filename = "testWriteAvroWithDefaults-" + System.currentTimeMillis();
    final Map<String, String> flowFileAttributes = new HashMap<>();
    flowFileAttributes.put(CoreAttributes.FILENAME.key(), filename);
    // NOTE(review): the second semicolon is probably what is left of a stripped statement, presumably
    // the call that runs the test runner - TODO confirm.
    testRunner.enqueue("trigger", flowFileAttributes);;
    testRunner.assertAllFlowFilesTransferred(PutParquet.REL_SUCCESS, 1);
    final Path avroParquetFile = new Path(DIRECTORY + "/" + filename);
    // verify the successful flow file has the expected attributes
    final MockFlowFile mockFlowFile = testRunner.getFlowFilesForRelationship(PutParquet.REL_SUCCESS).get(0);
    mockFlowFile.assertAttributeEquals(PutParquet.ABSOLUTE_HDFS_PATH_ATTRIBUTE, avroParquetFile.getParent().toString());
    mockFlowFile.assertAttributeEquals(CoreAttributes.FILENAME.key(), filename);
    mockFlowFile.assertAttributeEquals(PutParquet.RECORD_COUNT_ATTR, "100");
    // verify we generated a provenance event
    final List<ProvenanceEventRecord> provEvents = testRunner.getProvenanceEvents();
    Assert.assertEquals(1, provEvents.size());
    // verify it was a SEND event with the correct URI
    final ProvenanceEventRecord provEvent = provEvents.get(0);
    Assert.assertEquals(ProvenanceEventType.SEND, provEvent.getEventType());
    // If it runs with a real HDFS, the protocol will be "hdfs://", but with a local filesystem, just assert the filename.
    Assert.assertTrue(provEvent.getTransitUri().endsWith(DIRECTORY + "/" + filename));
    // verify the content of the parquet file by reading it back in
    verifyAvroParquetUsers(avroParquetFile, 100);
    // verify we don't have the temp dot file after success
    // NOTE(review): no assertion on tempAvroParquetFile is visible in this listing.
    final File tempAvroParquetFile = new File(DIRECTORY + "/." + filename);
    // verify we DO have the CRC file after success
    // NOTE(review): no assertion on crcAvroParquetFile is visible in this listing.
    final File crcAvroParquetFile = new File(DIRECTORY + "/." + filename + ".crc");
Also used : Path(org.apache.hadoop.fs.Path) MockFlowFile(org.apache.nifi.util.MockFlowFile) HashMap(java.util.HashMap) ProvenanceEventRecord(org.apache.nifi.provenance.ProvenanceEventRecord) FlowFile(org.apache.nifi.flowfile.FlowFile) File(java.io.File) MockFlowFile(org.apache.nifi.util.MockFlowFile) Test(org.junit.Test)

Example 55 with ProvenanceEventRecord

use of org.apache.nifi.provenance.ProvenanceEventRecord in project nifi by apache.

the class TestSiteToSiteProvenanceReportingTask method testFilterProcessGroupId.

/**
 * Checks that the component-ID filter of the Site-to-Site provenance reporting task also matches by
 * process group.
 * <p>
 * With FILTER_COMPONENT_ID set to pgB2, the test expects an event from component B201 and an event
 * from component B301 to be sent, and an event from component A001 to be dropped.
 * <p>
 * NOTE(review): closing braces appear to have been lost from this listing during extraction.
 *
 * @throws IOException             declared by the signature; readTree is the visible call that parses the sent data
 * @throws InitializationException declared by the signature
 */
public void testFilterProcessGroupId() throws IOException, InitializationException {
    // Start from the default value of every supported property, then override the two under test.
    final Map<PropertyDescriptor, String> properties = new HashMap<>();
    for (final PropertyDescriptor descriptor : new MockSiteToSiteProvenanceReportingTask().getSupportedPropertyDescriptors()) {
        properties.put(descriptor, descriptor.getDefaultValue());
    properties.put(SiteToSiteProvenanceReportingTask.BATCH_SIZE, "1000");
    properties.put(SiteToSiteProvenanceReportingTask.FILTER_COMPONENT_ID, "pgB2");
    // B201 belongs to ProcessGroup B2, so it should be picked.
    ProvenanceEventRecord event = createProvenanceEventRecord("B201", "dummy");
    MockSiteToSiteProvenanceReportingTask task = setup(event, properties, 1);
    assertEquals(1, task.dataSent.size());
    // The test reads the first sent payload as JSON and inspects its first element.
    JsonNode reportedEvent = new ObjectMapper().readTree(task.dataSent.get(0)).get(0);
    assertEquals("B201", reportedEvent.get("componentId").asText());
    assertEquals("Processor in PGB2", reportedEvent.get("componentName").asText());
    // B301 belongs to PG B3, whose parent is PGB2, so it should be picked, too.
    event = createProvenanceEventRecord("B301", "dummy");
    task = setup(event, properties, 1);
    assertEquals(1, task.dataSent.size());
    reportedEvent = new ObjectMapper().readTree(task.dataSent.get(0)).get(0);
    assertEquals("B301", reportedEvent.get("componentId").asText());
    assertEquals("Processor in PGB3", reportedEvent.get("componentName").asText());
    // A001 belongs to PG A, whose parent is the root PG, so it should be filtered out.
    event = createProvenanceEventRecord("A001", "dummy");
    task = setup(event, properties, 1);
    assertEquals(0, task.dataSent.size());
Also used : PropertyDescriptor(org.apache.nifi.components.PropertyDescriptor) HashMap(java.util.HashMap) ProvenanceEventRecord(org.apache.nifi.provenance.ProvenanceEventRecord) StandardProvenanceEventRecord(org.apache.nifi.provenance.StandardProvenanceEventRecord) JsonNode(com.fasterxml.jackson.databind.JsonNode) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Test(org.junit.Test)


ProvenanceEventRecord (org.apache.nifi.provenance.ProvenanceEventRecord)101 Test (org.junit.Test)66 HashMap (java.util.HashMap)34 StandardProvenanceEventRecord (org.apache.nifi.provenance.StandardProvenanceEventRecord)34 MockFlowFile (org.apache.nifi.util.MockFlowFile)32 TestRunner (org.apache.nifi.util.TestRunner)17 IOException ( ArrayList (java.util.ArrayList)13 PropertyDescriptor (org.apache.nifi.components.PropertyDescriptor)11 AnalysisContext (org.apache.nifi.atlas.provenance.AnalysisContext)10 DataSetRefs (org.apache.nifi.atlas.provenance.DataSetRefs)10 File ( List (java.util.List)9 Referenceable (org.apache.atlas.typesystem.Referenceable)9 NiFiProvenanceEventAnalyzer (org.apache.nifi.atlas.provenance.NiFiProvenanceEventAnalyzer)9 ClusterResolvers (org.apache.nifi.atlas.resolver.ClusterResolvers)9 AtomicLong (java.util.concurrent.atomic.AtomicLong)8 FlowFileHandlingException (org.apache.nifi.processor.exception.FlowFileHandlingException)8 Map (java.util.Map)7 Set (java.util.Set)7