Use of org.apache.nifi.provenance.ProvenanceEventRecord in project nifi by apache.
In the class LineageQuery, the method computeLineageForFlowFiles:
public static Set<ProvenanceEventRecord> computeLineageForFlowFiles(final IndexManager indexManager, final File indexDirectory, final String lineageIdentifier,
        final Collection<String> flowFileUuids, final DocumentToEventConverter docsToEventConverter) throws IOException {
    if (requireNonNull(flowFileUuids).size() > MAX_LINEAGE_UUIDS) {
        throw new IllegalArgumentException(String.format("Cannot compute lineage for more than %s FlowFiles. This lineage contains %s.", MAX_LINEAGE_UUIDS, flowFileUuids.size()));
    }
    if (lineageIdentifier == null && (flowFileUuids == null || flowFileUuids.isEmpty())) {
        throw new IllegalArgumentException("Must specify either Lineage Identifier or FlowFile UUIDs to compute lineage");
    }
    final EventIndexSearcher searcher;
    try {
        searcher = indexManager.borrowIndexSearcher(indexDirectory);
        try {
            // Create a query for all Events related to the FlowFiles of interest. We do this by adding all IDs as
            // "SHOULD" clauses and then setting the minimum required to 1.
            final BooleanQuery flowFileIdQuery;
            if (flowFileUuids == null || flowFileUuids.isEmpty()) {
                flowFileIdQuery = null;
            } else {
                flowFileIdQuery = new BooleanQuery();
                for (final String flowFileUuid : flowFileUuids) {
                    flowFileIdQuery.add(new TermQuery(new Term(SearchableFields.FlowFileUUID.getSearchableFieldName(), flowFileUuid)), Occur.SHOULD);
                }
                flowFileIdQuery.setMinimumNumberShouldMatch(1);
            }
            final long searchStart = System.nanoTime();
            logger.debug("Searching {} for {}", indexDirectory, flowFileIdQuery);
            final TopDocs uuidQueryTopDocs = searcher.getIndexSearcher().search(flowFileIdQuery, MAX_QUERY_RESULTS);
            final long searchEnd = System.nanoTime();
            final Set<ProvenanceEventRecord> recs = docsToEventConverter.convert(uuidQueryTopDocs, searcher.getIndexSearcher().getIndexReader());
            final long readDocsEnd = System.nanoTime();
            logger.debug("Finished Lineage Query against {}; Lucene search took {} millis, reading records took {} millis",
                    indexDirectory, TimeUnit.NANOSECONDS.toMillis(searchEnd - searchStart), TimeUnit.NANOSECONDS.toMillis(readDocsEnd - searchEnd));
            return recs;
        } finally {
            indexManager.returnIndexSearcher(searcher);
        }
    } catch (final FileNotFoundException fnfe) {
        // Nothing has been indexed yet, or the data has already aged off.
        logger.warn("Attempted to search Provenance Index {} but could not find the file due to {}", indexDirectory, fnfe);
        if (logger.isDebugEnabled()) {
            logger.warn("", fnfe);
        }
        return Collections.emptySet();
    }
}
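For orientation, a minimal caller sketch; indexManager, indexDir, and converter would come from the surrounding Lucene-backed provenance repository, and the UUID values are hypothetical:

    // Hypothetical usage sketch, not NiFi source: query lineage for two FlowFile UUIDs.
    final Set<ProvenanceEventRecord> lineageEvents = LineageQuery.computeLineageForFlowFiles(
            indexManager,                        // pooled Lucene searcher manager (assumed in scope)
            indexDir,                            // one index directory of the provenance repository (assumed)
            null,                                // no lineage identifier; query by UUID instead
            Arrays.asList("uuid-1", "uuid-2"),   // FlowFile UUIDs of interest (hypothetical values)
            converter);                          // DocumentToEventConverter turning Lucene docs into events (assumed)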
Use of org.apache.nifi.provenance.ProvenanceEventRecord in project nifi by apache.
In the class WriteAheadStorePartition, the method reindexLatestEvents:
void reindexLatestEvents(final EventIndex eventIndex) {
    final List<File> eventFiles = getEventFilesFromDisk().sorted(DirectoryUtils.SMALLEST_ID_FIRST).collect(Collectors.toList());
    if (eventFiles.isEmpty()) {
        return;
    }
    final long minEventIdToReindex = eventIndex.getMinimumEventIdToReindex(partitionName);
    final long maxEventId = getMaxEventId();
    final long eventsToReindex = maxEventId - minEventIdToReindex;
    logger.info("The last Provenance Event indexed for partition {} is {}, but the last event written to partition has ID {}. "
            + "Re-indexing up to the last {} events in {} to ensure that the Event Index is accurate and up-to-date",
            partitionName, minEventIdToReindex, maxEventId, eventsToReindex, partitionDirectory);
    // Find the first event file that we care about.
    int firstEventFileIndex = 0;
    for (int i = eventFiles.size() - 1; i >= 0; i--) {
        final File eventFile = eventFiles.get(i);
        final long minIdInFile = DirectoryUtils.getMinId(eventFile);
        if (minIdInFile <= minEventIdToReindex) {
            firstEventFileIndex = i;
            break;
        }
    }
    // Create a subList that contains the files of interest.
    final List<File> eventFilesToReindex = eventFiles.subList(firstEventFileIndex, eventFiles.size());
    final ExecutorService executor = Executors.newFixedThreadPool(Math.min(4, eventFilesToReindex.size()), new NamedThreadFactory("Re-Index Provenance Events", true));
    final List<Future<?>> futures = new ArrayList<>(eventFilesToReindex.size());
    final AtomicLong reindexedCount = new AtomicLong(0L);
    // Re-Index the last bunch of events.
    // We don't use an Event Iterator here because it's possible that one of the event files could be corrupt (for example, if NiFi dies while
    // writing to the file, a record may be incomplete). We don't want that to prevent us from moving on and continuing to index the rest of the
    // un-indexed events. So we just use a List of files and create a reader for each one.
    final long start = System.nanoTime();
    int fileCount = 0;
    for (final File eventFile : eventFilesToReindex) {
        // Only the first file needs to skip ahead to minEventIdToReindex; later files are read from their first record.
        final boolean skipToEvent = fileCount++ == 0;
        final Runnable reindexTask = new Runnable() {

            @Override
            public void run() {
                final Map<ProvenanceEventRecord, StorageSummary> storageMap = new HashMap<>(1000);
                try (final RecordReader recordReader = recordReaderFactory.newRecordReader(eventFile, Collections.emptyList(), Integer.MAX_VALUE)) {
                    if (skipToEvent) {
                        final Optional<ProvenanceEventRecord> eventOption = recordReader.skipToEvent(minEventIdToReindex);
                        if (!eventOption.isPresent()) {
                            return;
                        }
                    }
                    StandardProvenanceEventRecord event = null;
                    while (true) {
                        final long startBytesConsumed = recordReader.getBytesConsumed();
                        event = recordReader.nextRecord();
                        if (event == null) {
                            // End of file: flush whatever remains in the batch, then stop reading from this file.
                            eventIndex.reindexEvents(storageMap);
                            reindexedCount.addAndGet(storageMap.size());
                            storageMap.clear();
                            break;
                        } else {
                            final long eventSize = recordReader.getBytesConsumed() - startBytesConsumed;
                            storageMap.put(event, new StorageSummary(event.getEventId(), eventFile.getName(), partitionName, recordReader.getBlockIndex(), eventSize, 0L));
                            if (storageMap.size() == 1000) {
                                // Flush in batches of 1,000 so the map never grows unbounded.
                                eventIndex.reindexEvents(storageMap);
                                reindexedCount.addAndGet(storageMap.size());
                                storageMap.clear();
                            }
                        }
                    }
                } catch (final EOFException eof) {
                    // Ran out of data. Continue on.
                    logger.warn("Failed to find event with ID {} in Event File {} due to {}", minEventIdToReindex, eventFile, eof.toString());
                } catch (final Exception e) {
                    logger.error("Failed to index Provenance Events found in {}", eventFile, e);
                }
            }
        };
        futures.add(executor.submit(reindexTask));
    }
    for (final Future<?> future : futures) {
        try {
            future.get();
        } catch (final ExecutionException ee) {
            logger.error("Failed to re-index some Provenance events. These events may not be query-able via the Provenance interface", ee.getCause());
        } catch (final InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.error("Interrupted while waiting for Provenance events to be re-indexed", e);
            break;
        }
    }
    try {
        eventIndex.commitChanges(partitionName);
    } catch (final IOException e) {
        logger.error("Failed to re-index Provenance Events for partition " + partitionName, e);
    }
    executor.shutdown();
    final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
    final long seconds = millis / 1000L;
    final long millisRemainder = millis % 1000L;
    // Zero-pad the millisecond remainder so that, e.g., 5.007 seconds does not print as "5.7 seconds".
    logger.info("Finished re-indexing {} events across {} files for {} in {} seconds",
            reindexedCount.get(), eventFilesToReindex.size(), partitionDirectory, String.format("%d.%03d", seconds, millisRemainder));
}
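The flush-every-N pattern above (accumulate up to 1,000 events, index them, clear the map) is the heart of the loop. A condensed, self-standing sketch, where reader, eventIndex, and the summarize helper are stand-ins for the real collaborators:

    // Illustrative sketch of the batch-and-flush pattern (N = 1000); summarize(...) is a
    // hypothetical helper that builds the StorageSummary recorded for each event.
    final Map<ProvenanceEventRecord, StorageSummary> batch = new HashMap<>(1000);
    ProvenanceEventRecord event;
    while ((event = reader.nextRecord()) != null) {
        batch.put(event, summarize(event));
        if (batch.size() == 1000) {
            eventIndex.reindexEvents(batch);   // flush a full batch so the map never grows unbounded
            batch.clear();
        }
    }
    if (!batch.isEmpty()) {
        eventIndex.reindexEvents(batch);       // flush the final, partial batch
    }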
Use of org.apache.nifi.provenance.ProvenanceEventRecord in project nifi by apache.
In the class SelectiveRecordReaderEventIterator, the method nextEvent:
@Override
public Optional<ProvenanceEventRecord> nextEvent() throws IOException {
    if (closed) {
        throw new IOException("EventIterator is already closed");
    }
    final long start = System.nanoTime();
    try {
        while (idIterator.hasNext()) {
            // Determine the next event ID to fetch.
            final long eventId = idIterator.next();
            // Determine which file the event should be in.
            final File fileForEvent = getFileForEventId(eventId);
            if (fileForEvent == null) {
                continue;
            }
            try {
                // If the event is in a different file than the one we are currently reading from,
                // rotate the reader to the appropriate one.
                if (!fileForEvent.equals(currentFile)) {
                    if (reader != null) {
                        try {
                            reader.close();
                        } catch (final Exception e) {
                            logger.warn("Failed to close {}; some resources may not be cleaned up appropriately", reader);
                        }
                    }
                    reader = readerFactory.newRecordReader(fileForEvent, Collections.emptyList(), maxAttributeChars);
                    this.currentFile = fileForEvent;
                }
                final Optional<ProvenanceEventRecord> eventOption = reader.skipToEvent(eventId);
                if (eventOption.isPresent() && eventOption.get().getEventId() == eventId) {
                    // Consume the event from the stream.
                    reader.nextRecord();
                    return eventOption;
                }
            } catch (final FileNotFoundException | EOFException e) {
                logger.warn("Failed to retrieve Event with ID {}", eventId, e);
            }
        }
        return Optional.empty();
    } finally {
        final long ms = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
        logger.trace("Took {} ms to read next event", ms);
    }
}
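A usage sketch for the iterator; construction details are omitted, and iterator is assumed to be a SelectiveRecordReaderEventIterator built over a set of event files and requested IDs:

    // Hypothetical: drain the iterator, collecting whichever requested events still exist on disk.
    final List<ProvenanceEventRecord> found = new ArrayList<>();
    Optional<ProvenanceEventRecord> next;
    while ((next = iterator.nextEvent()).isPresent()) {   // empty Optional means the ID list is exhausted
        found.add(next.get());
    }
    iterator.close();   // releases the underlying RecordReader; nextEvent() throws IOException once closed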
Use of org.apache.nifi.provenance.ProvenanceEventRecord in project nifi by apache.
In the class PutParquetTest, the method testWriteAvroParquetWithDefaults:
@Test
public void testWriteAvroParquetWithDefaults() throws IOException, InitializationException {
    configure(proc, 100);
    final String filename = "testWriteAvroWithDefaults-" + System.currentTimeMillis();
    final Map<String, String> flowFileAttributes = new HashMap<>();
    flowFileAttributes.put(CoreAttributes.FILENAME.key(), filename);
    testRunner.enqueue("trigger", flowFileAttributes);
    testRunner.run();
    testRunner.assertAllFlowFilesTransferred(PutParquet.REL_SUCCESS, 1);
    final Path avroParquetFile = new Path(DIRECTORY + "/" + filename);
    // Verify the successful flow file has the expected attributes.
    final MockFlowFile mockFlowFile = testRunner.getFlowFilesForRelationship(PutParquet.REL_SUCCESS).get(0);
    mockFlowFile.assertAttributeEquals(PutParquet.ABSOLUTE_HDFS_PATH_ATTRIBUTE, avroParquetFile.getParent().toString());
    mockFlowFile.assertAttributeEquals(CoreAttributes.FILENAME.key(), filename);
    mockFlowFile.assertAttributeEquals(PutParquet.RECORD_COUNT_ATTR, "100");
    // Verify we generated a provenance event.
    final List<ProvenanceEventRecord> provEvents = testRunner.getProvenanceEvents();
    Assert.assertEquals(1, provEvents.size());
    // Verify it was a SEND event with the correct URI.
    final ProvenanceEventRecord provEvent = provEvents.get(0);
    Assert.assertEquals(ProvenanceEventType.SEND, provEvent.getEventType());
    // If it runs with a real HDFS, the protocol will be "hdfs://", but with a local filesystem, just assert the filename.
    Assert.assertTrue(provEvent.getTransitUri().endsWith(DIRECTORY + "/" + filename));
    // Verify the content of the parquet file by reading it back in.
    verifyAvroParquetUsers(avroParquetFile, 100);
    // Verify we don't have the temp dot file after success.
    final File tempAvroParquetFile = new File(DIRECTORY + "/." + filename);
    Assert.assertFalse(tempAvroParquetFile.exists());
    // Verify we DO have the CRC file after success.
    final File crcAvroParquetFile = new File(DIRECTORY + "/." + filename + ".crc");
    Assert.assertTrue(crcAvroParquetFile.exists());
}
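The provenance assertions in this test follow a pattern that carries over to other processor tests. A condensed sketch, where runner is any configured TestRunner and expectedPath is the path the processor is expected to have written:

    // Illustrative SEND-event assertion pattern (not a NiFi test utility).
    final List<ProvenanceEventRecord> events = runner.getProvenanceEvents();
    Assert.assertEquals(1, events.size());
    final ProvenanceEventRecord send = events.get(0);
    Assert.assertEquals(ProvenanceEventType.SEND, send.getEventType());
    // Match on the path suffix rather than the scheme, since "hdfs://" only appears against a real cluster.
    Assert.assertTrue(send.getTransitUri().endsWith(expectedPath));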
Use of org.apache.nifi.provenance.ProvenanceEventRecord in project nifi by apache.
In the class TestSiteToSiteProvenanceReportingTask, the method testFilterProcessGroupId:
@Test
public void testFilterProcessGroupId() throws IOException, InitializationException {
    final Map<PropertyDescriptor, String> properties = new HashMap<>();
    for (final PropertyDescriptor descriptor : new MockSiteToSiteProvenanceReportingTask().getSupportedPropertyDescriptors()) {
        properties.put(descriptor, descriptor.getDefaultValue());
    }
    properties.put(SiteToSiteProvenanceReportingTask.BATCH_SIZE, "1000");
    properties.put(SiteToSiteProvenanceReportingTask.FILTER_COMPONENT_ID, "pgB2");
    // B201 belongs to Process Group B2, so it should be picked.
    ProvenanceEventRecord event = createProvenanceEventRecord("B201", "dummy");
    MockSiteToSiteProvenanceReportingTask task = setup(event, properties, 1);
    task.initialize(initContext);
    task.onScheduled(confContext);
    task.onTrigger(context);
    assertEquals(1, task.dataSent.size());
    JsonNode reportedEvent = new ObjectMapper().readTree(task.dataSent.get(0)).get(0);
    assertEquals("B201", reportedEvent.get("componentId").asText());
    assertEquals("Processor in PGB2", reportedEvent.get("componentName").asText());
    // B301 belongs to PG B3, whose parent is PG B2, so it should be picked, too.
    event = createProvenanceEventRecord("B301", "dummy");
    task = setup(event, properties, 1);
    task.initialize(initContext);
    task.onScheduled(confContext);
    task.onTrigger(context);
    assertEquals(1, task.dataSent.size());
    reportedEvent = new ObjectMapper().readTree(task.dataSent.get(0)).get(0);
    assertEquals("B301", reportedEvent.get("componentId").asText());
    assertEquals("Processor in PGB3", reportedEvent.get("componentName").asText());
    // A001 belongs to PG A, whose parent is the root PG, so it should be filtered out.
    event = createProvenanceEventRecord("A001", "dummy");
    task = setup(event, properties, 1);
    task.initialize(initContext);
    task.onScheduled(confContext);
    task.onTrigger(context);
    assertEquals(0, task.dataSent.size());
}
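The createProvenanceEventRecord helper is not shown in this excerpt. A plausible reconstruction using NiFi's StandardProvenanceEventRecord.Builder follows; field values are illustrative, and the real helper in the test class may set different fields:

    // Hypothetical reconstruction of the helper; not the actual test source.
    private ProvenanceEventRecord createProvenanceEventRecord(final String componentId, final String componentType) {
        final String uuid = UUID.randomUUID().toString();
        return new StandardProvenanceEventRecord.Builder()
                .setEventType(ProvenanceEventType.RECEIVE)
                .setComponentId(componentId)
                .setComponentType(componentType)
                .setFlowFileUUID(uuid)
                .setEventTime(System.currentTimeMillis())
                .setFlowFileEntryDate(System.currentTimeMillis())
                .setLineageStartDate(System.currentTimeMillis())
                .setCurrentContentClaim("container", "section", "identifier", 0L, 0L)
                .build();
    }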