Use of org.apache.nifi.provenance.serialization.RecordReader in project nifi by apache.
Class PersistentProvenanceRepository, method recover().
private void recover() throws IOException {
long maxId = -1L;
long maxIndexedId = -1L;
long minIndexedId = Long.MAX_VALUE;
final List<File> filesToRecover = new ArrayList<>();
for (final File file : configuration.getStorageDirectories().values()) {
final File[] matchingFiles = file.listFiles(new FileFilter() {
@Override
public boolean accept(final File pathname) {
final String filename = pathname.getName();
if (!filename.contains(FILE_EXTENSION) || filename.endsWith(TEMP_FILE_SUFFIX)) {
return false;
}
final String baseFilename = filename.substring(0, filename.indexOf("."));
return NUMBER_PATTERN.matcher(baseFilename).matches();
}
});
for (final File matchingFile : matchingFiles) {
filesToRecover.add(matchingFile);
}
}
final SortedMap<Long, Path> sortedPathMap = new TreeMap<>(new Comparator<Long>() {
@Override
public int compare(final Long o1, final Long o2) {
return Long.compare(o1, o2);
}
});
File maxIdFile = null;
for (final File file : filesToRecover) {
final String filename = file.getName();
final String baseName = filename.substring(0, filename.indexOf("."));
final long firstId = Long.parseLong(baseName);
sortedPathMap.put(firstId, file.toPath());
if (firstId > maxId) {
maxId = firstId;
maxIdFile = file;
}
if (firstId > maxIndexedId) {
maxIndexedId = firstId - 1;
}
if (firstId < minIndexedId) {
minIndexedId = firstId;
}
}
if (maxIdFile != null) {
// Determine the max ID in the last file.
try (final RecordReader reader = RecordReaders.newRecordReader(maxIdFile, getAllLogFiles(), maxAttributeChars)) {
final long eventId = reader.getMaxEventId();
if (eventId > maxId) {
maxId = eventId;
}
// update the max indexed id
if (eventId > maxIndexedId) {
maxIndexedId = eventId;
}
} catch (final IOException ioe) {
logger.error("Failed to read Provenance Event File {} due to {}", maxIdFile, ioe);
logger.error("", ioe);
}
}
if (maxIndexedId > -1L) {
// If we have indexed anything, then set the min/max IDs indexed.
indexConfig.setMaxIdIndexed(maxIndexedId);
}
if (minIndexedId < Long.MAX_VALUE) {
indexConfig.setMinIdIndexed(minIndexedId);
}
idGenerator.set(maxId + 1);
try {
final Set<File> recoveredJournals = recoverJournalFiles();
filesToRecover.addAll(recoveredJournals);
// Find the file that has the greatest ID
File greatestMinIdFile = null;
long greatestMinId = 0L;
for (final File recoveredJournal : recoveredJournals) {
// if the file was removed because the journals were empty, don't count it
if (!recoveredJournal.exists()) {
continue;
}
final String basename = LuceneUtil.substringBefore(recoveredJournal.getName(), ".");
try {
final long minId = Long.parseLong(basename);
sortedPathMap.put(minId, recoveredJournal.toPath());
if (greatestMinIdFile == null || minId > greatestMinId) {
greatestMinId = minId;
greatestMinIdFile = recoveredJournal;
}
} catch (final NumberFormatException nfe) {
// not a file we care about...
}
}
// Read the records in the last file to find its max id
if (greatestMinIdFile != null) {
try (final RecordReader recordReader = RecordReaders.newRecordReader(greatestMinIdFile, Collections.<Path>emptyList(), maxAttributeChars)) {
maxId = recordReader.getMaxEventId();
}
}
// set the ID Generator 1 greater than the max id
idGenerator.set(maxId + 1);
} catch (final IOException ioe) {
logger.error("Failed to recover Journal Files due to {}", ioe.toString());
logger.error("", ioe);
}
idToPathMap.set(Collections.unmodifiableSortedMap(sortedPathMap));
logger.trace("In recovery, path map: {}", sortedPathMap);
final long recordsRecovered;
if (minIndexedId < Long.MAX_VALUE) {
recordsRecovered = idGenerator.get() - minIndexedId;
} else {
recordsRecovered = idGenerator.get();
}
logger.info("Recovered {} records", recordsRecovered);
recoveryFinished.set(true);
}
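The core of the recovery step above is asking a RecordReader for the largest event ID stored in a rolled-over event file, which recover() then uses to seed idGenerator. The helper below is a minimal sketch of just that call, not code from the repository: the class and method names are made up, and it only assumes the RecordReaders.newRecordReader(file, logFiles, maxChars) factory and getMaxEventId() accessor already used in recover().

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.util.Collections;

import org.apache.nifi.provenance.serialization.RecordReader;
import org.apache.nifi.provenance.serialization.RecordReaders;

class MaxEventIdSketch {

    // Returns the highest event ID stored in the given Provenance Event Log File,
    // or -1 if the file cannot be read, mirroring how recover() treats a corrupt file.
    // getMaxEventId() is assumed to be non-null here, as it is in recover() above.
    static long findMaxEventId(final File provenanceLogFile, final int maxAttributeChars) {
        try (final RecordReader reader = RecordReaders.newRecordReader(provenanceLogFile, Collections.<Path>emptyList(), maxAttributeChars)) {
            return reader.getMaxEventId();
        } catch (final IOException ioe) {
            return -1L;
        }
    }
}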
Use of org.apache.nifi.provenance.serialization.RecordReader in project nifi by apache.
Class PersistentProvenanceRepository, method queryLucene().
/**
* This method exists for testing only and is not used outside of debugging
*
* @param luceneQuery the lucene query to execute
* @return an Iterator of ProvenanceEventRecord that match the query
* @throws IOException if unable to perform the query
*/
Iterator<ProvenanceEventRecord> queryLucene(final org.apache.lucene.search.Query luceneQuery) throws IOException {
final List<File> indexFiles = indexConfig.getIndexDirectories();
final AtomicLong hits = new AtomicLong(0L);
final List<Future<List<Document>>> futures = new ArrayList<>();
for (final File indexDirectory : indexFiles) {
final Callable<List<Document>> callable = new Callable<List<Document>>() {
@Override
public List<Document> call() {
final List<Document> localScoreDocs = new ArrayList<>();
try (final DirectoryReader directoryReader = DirectoryReader.open(FSDirectory.open(indexDirectory))) {
final IndexSearcher searcher = new IndexSearcher(directoryReader);
final TopDocs topDocs = searcher.search(luceneQuery, 10000000);
logger.info("For {}, Top Docs has {} hits; reading Lucene results", indexDirectory, topDocs.scoreDocs.length);
if (topDocs.totalHits > 0) {
for (final ScoreDoc scoreDoc : topDocs.scoreDocs) {
final int docId = scoreDoc.doc;
final Document d = directoryReader.document(docId);
localScoreDocs.add(d);
}
}
hits.addAndGet(localScoreDocs.size());
} catch (final IndexNotFoundException e) {
// Nothing has been indexed for this directory yet, so there are no results to add.
} catch (final IOException ioe) {
throw new RuntimeException(ioe);
}
return localScoreDocs;
}
};
final Future<List<Document>> future = queryExecService.submit(callable);
futures.add(future);
}
logger.info("Merging results of Lucene query ({} hits)", hits.get());
List<Document> scoreDocs = null;
int idx = 0;
for (final Future<List<Document>> future : futures) {
try {
final List<Document> docs = future.get();
if (idx++ == 0) {
scoreDocs = docs;
} else {
scoreDocs.addAll(docs);
docs.clear();
}
} catch (final ExecutionException | InterruptedException ee) {
throw new RuntimeException(ee);
}
}
logger.info("Finished querying Lucene; there are {} docs; sorting for retrieval", scoreDocs.size());
LuceneUtil.sortDocsForRetrieval(scoreDocs);
logger.info("Finished sorting for retrieval. Returning Iterator.");
final Iterator<Document> docItr = scoreDocs.iterator();
final Collection<Path> allLogFiles = getAllLogFiles();
return new Iterator<ProvenanceEventRecord>() {
int count = 0;
RecordReader reader = null;
String lastStorageFilename = null;
long lastByteOffset = 0L;
@Override
public boolean hasNext() {
return docItr.hasNext();
}
@Override
public ProvenanceEventRecord next() {
if (count++ > 0) {
// remove last document so that we don't hold everything in memory.
docItr.remove();
}
final Document doc = docItr.next();
final String storageFilename = doc.getField(FieldNames.STORAGE_FILENAME).stringValue();
final long byteOffset = doc.getField(FieldNames.STORAGE_FILE_OFFSET).numericValue().longValue();
try {
if (reader != null && storageFilename.equals(lastStorageFilename) && byteOffset > lastByteOffset) {
// Still the same file and the offset is downstream.
try {
reader.skipTo(byteOffset);
final StandardProvenanceEventRecord record = reader.nextRecord();
return record;
} catch (final IOException e) {
if (hasNext()) {
return next();
} else {
return null;
}
}
} else {
if (reader != null) {
try {
reader.close();
} catch (final IOException ioe) {
}
}
final List<File> potentialFiles = LuceneUtil.getProvenanceLogFiles(storageFilename, allLogFiles);
if (potentialFiles.isEmpty()) {
if (hasNext()) {
return next();
} else {
return null;
}
}
if (potentialFiles.size() > 1) {
if (hasNext()) {
return next();
} else {
return null;
}
}
for (final File file : potentialFiles) {
try {
reader = RecordReaders.newRecordReader(file, allLogFiles, maxAttributeChars);
} catch (final IOException ioe) {
continue;
}
try {
reader.skip(byteOffset);
final StandardProvenanceEventRecord record = reader.nextRecord();
return record;
} catch (final IOException e) {
continue;
}
}
}
} finally {
lastStorageFilename = storageFilename;
lastByteOffset = byteOffset;
}
return null;
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
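Because queryLucene is package-private and intended only for tests and debugging, any caller has to live in the org.apache.nifi.provenance package. The sketch below is a hypothetical usage, not code from the project: the repository instance, the field name "eventType", and the value "RECEIVE" are assumptions chosen purely to show the call shape.

import java.io.IOException;
import java.util.Iterator;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
import org.apache.nifi.provenance.ProvenanceEventRecord;

// Hypothetical test helper; must be declared in the org.apache.nifi.provenance package.
class QueryLuceneSketch {

    static void printMatchingEvents(final PersistentProvenanceRepository repository) throws IOException {
        // Any org.apache.lucene.search.Query works here; a TermQuery keeps the example small.
        final TermQuery query = new TermQuery(new Term("eventType", "RECEIVE"));
        final Iterator<ProvenanceEventRecord> events = repository.queryLucene(query);
        while (events.hasNext()) {
            final ProvenanceEventRecord event = events.next();
            // next() can return null if the backing event file has been removed; skip those entries.
            if (event != null) {
                System.out.println(event.getEventId() + " " + event.getEventType());
            }
        }
    }
}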
Use of org.apache.nifi.provenance.serialization.RecordReader in project nifi by apache.
Class PersistentProvenanceRepository, method mergeJournals().
/**
* <p>
* Merges all of the given Journal Files into a single, merged Provenance
* Event Log File. As these records are merged, they will be compressed, if
* the repository is configured to compress records, and will be indexed.
* </p>
*
* <p>
* If the repository is configured to compress the data, the file written to
* may not be the same as the <code>suggestedMergeFile</code>, as a filename
* extension of '.gz' may be appended. If the journals are successfully
* merged, the file that they were merged into will be returned. If unable
* to merge the records (for instance, because the repository has been
* closed or because the list of journal files was empty), this method will
* return <code>null</code>.
* </p>
*
* @param journalFiles the journal files to merge
* @param suggestedMergeFile the file to write the merged records to
* @param eventReporter the event reporter to report any warnings or errors
* to; may be null.
*
* @return the file that the given journals were merged into, or
* <code>null</code> if no records were merged.
*
* @throws IOException if a problem occurs writing to the mergedFile,
* reading from a journal, or updating the Lucene Index.
*/
File mergeJournals(final List<File> journalFiles, final File suggestedMergeFile, final EventReporter eventReporter) throws IOException {
logger.debug("Merging {} to {}", journalFiles, suggestedMergeFile);
if (this.closed.get()) {
logger.info("Provenance Repository has been closed; will not merge journal files to {}", suggestedMergeFile);
return null;
}
if (journalFiles.isEmpty()) {
logger.debug("Couldn't merge journals: Journal Files is empty; won't merge journals");
return null;
}
Collections.sort(journalFiles, new Comparator<File>() {
@Override
public int compare(final File o1, final File o2) {
final String suffix1 = LuceneUtil.substringAfterLast(o1.getName(), ".");
final String suffix2 = LuceneUtil.substringAfterLast(o2.getName(), ".");
try {
final int journalIndex1 = Integer.parseInt(suffix1);
final int journalIndex2 = Integer.parseInt(suffix2);
return Integer.compare(journalIndex1, journalIndex2);
} catch (final NumberFormatException nfe) {
return o1.getName().compareTo(o2.getName());
}
}
});
// Search for any missing files. At this point they should have been written to disk; otherwise, we cannot continue.
// Missing files are most likely due to incomplete cleanup of files post merge
final List<File> availableFiles = filterUnavailableFiles(journalFiles);
final int numAvailableFiles = availableFiles.size();
// check if we have all of the "partial" files for the journal.
if (numAvailableFiles > 0) {
if (suggestedMergeFile.exists()) {
// we have all "partial" files and there is already a merged file. Delete the data from the index
// because the merge file may not be fully merged. We will re-merge.
logger.warn("Merged Journal File {} already exists; however, all partial journal files also exist " + "so assuming that the merge did not finish. Repeating procedure in order to ensure consistency.");
final DeleteIndexAction deleteAction = new DeleteIndexAction(this, indexConfig, getIndexManager());
try {
deleteAction.execute(suggestedMergeFile);
} catch (final Exception e) {
logger.warn("Failed to delete records from Journal File {} from the index; this could potentially result in duplicates. Failure was due to {}", suggestedMergeFile, e.toString());
if (logger.isDebugEnabled()) {
logger.warn("", e);
}
}
// Delete both the partially written merge file and its TOC file. Otherwise, we could get the wrong copy and have issues retrieving events.
if (!suggestedMergeFile.delete()) {
logger.error("Failed to delete partially written Provenance Journal File {}. This may result in events from this journal " + "file not being able to be displayed. This file should be deleted manually.", suggestedMergeFile);
}
final File tocFile = TocUtil.getTocFile(suggestedMergeFile);
if (tocFile.exists() && !tocFile.delete()) {
logger.error("Failed to delete .toc file {}; this may result in not being able to read the Provenance Events from the {} Journal File. " + "This can be corrected by manually deleting the {} file", tocFile, suggestedMergeFile, tocFile);
}
}
} else {
logger.warn("Cannot merge journal files {} because they do not exist on disk", journalFiles);
return null;
}
final long startNanos = System.nanoTime();
// Map each journal to a RecordReader
final List<RecordReader> readers = new ArrayList<>();
int records = 0;
final boolean isCompress = configuration.isCompressOnRollover();
final File writerFile = isCompress ? new File(suggestedMergeFile.getParentFile(), suggestedMergeFile.getName() + ".gz") : suggestedMergeFile;
try {
for (final File journalFile : availableFiles) {
try {
// Use MAX_VALUE for number of chars because we don't want to truncate the value as we write it
// out. This allows us to later decide that we want more characters and still be able to retrieve
// the entire event.
readers.add(RecordReaders.newRecordReader(journalFile, null, Integer.MAX_VALUE));
} catch (final EOFException eof) {
// there's nothing here. Skip over it.
} catch (final IOException ioe) {
logger.warn("Unable to merge {} with other Journal Files due to {}", journalFile, ioe.toString());
if (logger.isDebugEnabled()) {
logger.warn("", ioe);
}
if (eventReporter != null) {
eventReporter.reportEvent(Severity.ERROR, EVENT_CATEGORY, "Failed to merge Journal Files due to " + ioe.toString());
}
}
}
// Create a Map so that the key is the next record available from a reader and the value is the Reader from which
// the record came. This sorted map is then used so that we are able to always get the first entry, which is the next
// lowest record id
final SortedMap<StandardProvenanceEventRecord, RecordReader> recordToReaderMap = new TreeMap<>(new Comparator<StandardProvenanceEventRecord>() {
@Override
public int compare(final StandardProvenanceEventRecord o1, final StandardProvenanceEventRecord o2) {
return Long.compare(o1.getEventId(), o2.getEventId());
}
});
long minEventId = 0L;
long earliestTimestamp = System.currentTimeMillis();
for (final RecordReader reader : readers) {
StandardProvenanceEventRecord record = null;
try {
record = reader.nextRecord();
} catch (final EOFException eof) {
// record will be null and reader can no longer be used
} catch (final Exception e) {
logger.warn("Failed to generate Provenance Event Record from Journal due to " + e + "; it's " + "possible that the record wasn't completely written to the file. This journal will be " + "skipped.");
if (logger.isDebugEnabled()) {
logger.warn("", e);
}
if (eventReporter != null) {
eventReporter.reportEvent(Severity.WARNING, EVENT_CATEGORY, "Failed to read Provenance Event " + "Record from Journal due to " + e + "; it's possible that the record wasn't " + "completely written to the file. This journal will be skipped.");
}
}
if (record == null) {
continue;
}
if (record.getEventTime() < earliestTimestamp) {
earliestTimestamp = record.getEventTime();
}
if (record.getEventId() < minEventId) {
minEventId = record.getEventId();
}
recordToReaderMap.put(record, reader);
}
// We want to keep track of the last 1000 events in the files so that we can add them to 'ringBuffer'.
// However, we don't want to add them directly to ringBuffer, because once they are added to ringBuffer, they are
// available in query results. As a result, we can have the issue where we've not finished indexing the file
// but we try to create the lineage for events in that file. In order to avoid this, we will add the records
// to a temporary RingBuffer and after we finish merging the records will then copy the data to the
// ringBuffer provided as a method argument.
final RingBuffer<ProvenanceEventRecord> latestRecords = new RingBuffer<>(1000);
// Loop over the map, writing the lowest-ID record to the merged file and then re-populating the map
// with the next entry from the journal file from which the previous record was written.
try (final RecordWriter writer = RecordWriters.newSchemaRecordWriter(writerFile, idGenerator, configuration.isCompressOnRollover(), true)) {
writer.writeHeader(minEventId);
final IndexingAction indexingAction = createIndexingAction();
final File indexingDirectory = indexConfig.getWritableIndexDirectory(writerFile, earliestTimestamp);
long maxId = 0L;
final BlockingQueue<Tuple<StandardProvenanceEventRecord, Integer>> eventQueue = new LinkedBlockingQueue<>(100);
final AtomicBoolean finishedAdding = new AtomicBoolean(false);
final List<Future<?>> futures = new ArrayList<>();
final EventIndexWriter indexWriter = getIndexManager().borrowIndexWriter(indexingDirectory);
try {
final ExecutorService exec = Executors.newFixedThreadPool(configuration.getIndexThreadPoolSize(), new ThreadFactory() {
@Override
public Thread newThread(final Runnable r) {
final Thread t = Executors.defaultThreadFactory().newThread(r);
t.setName("Index Provenance Events");
return t;
}
});
final AtomicInteger indexingFailureCount = new AtomicInteger(0);
try {
for (int i = 0; i < configuration.getIndexThreadPoolSize(); i++) {
final Callable<Object> callable = new Callable<Object>() {
@Override
public Object call() throws IOException {
while (!eventQueue.isEmpty() || !finishedAdding.get()) {
try {
final Tuple<StandardProvenanceEventRecord, Integer> tuple;
try {
tuple = eventQueue.poll(10, TimeUnit.MILLISECONDS);
} catch (final InterruptedException ie) {
Thread.currentThread().interrupt();
continue;
}
if (tuple == null) {
continue;
}
indexingAction.index(tuple.getKey(), indexWriter.getIndexWriter(), tuple.getValue());
} catch (final Throwable t) {
logger.error("Failed to index Provenance Event for " + writerFile + " to " + indexingDirectory, t);
if (indexingFailureCount.incrementAndGet() >= MAX_INDEXING_FAILURE_COUNT) {
return null;
}
}
}
return null;
}
};
final Future<?> future = exec.submit(callable);
futures.add(future);
}
boolean indexEvents = true;
while (!recordToReaderMap.isEmpty()) {
final StandardProvenanceEventRecord record = recordToReaderMap.firstKey();
final RecordReader reader = recordToReaderMap.get(record);
writer.writeRecord(record);
final int blockIndex = writer.getTocWriter().getCurrentBlockIndex();
boolean accepted = false;
while (!accepted && indexEvents) {
try {
accepted = eventQueue.offer(new Tuple<>(record, blockIndex), 10, TimeUnit.MILLISECONDS);
} catch (final InterruptedException ie) {
Thread.currentThread().interrupt();
}
// If the offer failed, the indexing threads may have stopped draining the queue after repeated failures.
// So, if the queue is filled, we will check if this is the case.
if (!accepted && indexingFailureCount.get() >= MAX_INDEXING_FAILURE_COUNT) {
// don't add anything else to the queue.
indexEvents = false;
eventQueue.clear();
final String warning = String.format("Indexing Provenance Events for %s has failed %s times. This exceeds the maximum threshold of %s failures, " + "so no more Provenance Events will be indexed for this Provenance file.", writerFile, indexingFailureCount.get(), MAX_INDEXING_FAILURE_COUNT);
logger.warn(warning);
if (eventReporter != null) {
eventReporter.reportEvent(Severity.WARNING, EVENT_CATEGORY, warning);
}
}
}
maxId = record.getEventId();
latestRecords.add(truncateAttributes(record));
records++;
// Remove this entry from the map
recordToReaderMap.remove(record);
// Get the next entry from this reader and add it to the map
StandardProvenanceEventRecord nextRecord = null;
try {
nextRecord = reader.nextRecord();
} catch (final EOFException eof) {
// record will be null and reader can no longer be used
} catch (final Exception e) {
logger.warn("Failed to generate Provenance Event Record from Journal due to " + e + "; it's possible that the record wasn't completely written to the file. " + "The remainder of this journal will be skipped.");
if (logger.isDebugEnabled()) {
logger.warn("", e);
}
if (eventReporter != null) {
eventReporter.reportEvent(Severity.WARNING, EVENT_CATEGORY, "Failed to read " + "Provenance Event Record from Journal due to " + e + "; it's possible " + "that the record wasn't completely written to the file. The remainder " + "of this journal will be skipped.");
}
}
if (nextRecord != null) {
recordToReaderMap.put(nextRecord, reader);
}
}
} finally {
finishedAdding.set(true);
exec.shutdown();
}
for (final Future<?> future : futures) {
try {
future.get();
} catch (final ExecutionException ee) {
final Throwable t = ee.getCause();
if (t instanceof RuntimeException) {
throw (RuntimeException) t;
}
throw new RuntimeException(t);
} catch (final InterruptedException e) {
Thread.currentThread().interrupt();
throw new RuntimeException("Thread interrupted");
}
}
} finally {
getIndexManager().returnIndexWriter(indexWriter);
}
indexConfig.setMaxIdIndexed(maxId);
}
// The merged file has been written and closed, so every record should now be available in the repository. We can copy the values from latestRecords to ringBuffer.
final RingBuffer<ProvenanceEventRecord> latestRecordBuffer = this.latestRecords;
latestRecords.forEach(new ForEachEvaluator<ProvenanceEventRecord>() {
@Override
public boolean evaluate(final ProvenanceEventRecord event) {
latestRecordBuffer.add(event);
return true;
}
});
} finally {
for (final RecordReader reader : readers) {
try {
reader.close();
} catch (final IOException ioe) {
}
}
}
// Success. Remove all of the journal files, as they're no longer needed, now that they've been merged.
for (final File journalFile : availableFiles) {
if (!journalFile.delete() && journalFile.exists()) {
logger.warn("Failed to remove temporary journal file {}; this file should be cleaned up manually", journalFile.getAbsolutePath());
if (eventReporter != null) {
eventReporter.reportEvent(Severity.WARNING, EVENT_CATEGORY, "Failed to remove temporary journal file " + journalFile.getAbsolutePath() + "; this file should be cleaned up manually");
}
}
final File tocFile = TocUtil.getTocFile(journalFile);
if (!tocFile.delete() && tocFile.exists()) {
logger.warn("Failed to remove temporary journal TOC file {}; this file should be cleaned up manually", tocFile.getAbsolutePath());
if (eventReporter != null) {
eventReporter.reportEvent(Severity.WARNING, EVENT_CATEGORY, "Failed to remove temporary journal TOC file " + tocFile.getAbsolutePath() + "; this file should be cleaned up manually");
}
}
}
if (records == 0) {
writerFile.delete();
logger.debug("Couldn't merge journals: No Records to merge");
return null;
} else {
final long nanos = System.nanoTime() - startNanos;
final long millis = TimeUnit.MILLISECONDS.convert(nanos, TimeUnit.NANOSECONDS);
logger.info("Successfully merged {} journal files ({} records) into single Provenance Log File {} in {} milliseconds", numAvailableFiles, records, suggestedMergeFile, millis);
}
return writerFile;
}
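The heart of mergeJournals() is a k-way merge: each journal is already ordered by event ID, one pending record per reader is kept in a TreeMap, and the map's first key is always the next record to write. The snippet below re-states that pattern on plain lists of Long IDs; it is an illustrative sketch (the class name and the use of iterators instead of RecordReaders are simplifications), and like the repository it assumes the keys are unique across sources.

import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.TreeMap;

class KWayMergeSketch {

    // Merges several individually sorted sources into one sorted list,
    // the same way mergeJournals() interleaves records from multiple journal readers.
    static List<Long> merge(final List<List<Long>> sortedSources) {
        final List<Long> merged = new ArrayList<>();
        final TreeMap<Long, Iterator<Long>> pending = new TreeMap<>();

        // Seed the map with the first element from each source (mirrors the initial reader.nextRecord() calls).
        for (final List<Long> source : sortedSources) {
            final Iterator<Long> itr = source.iterator();
            if (itr.hasNext()) {
                pending.put(itr.next(), itr);
            }
        }

        // Always take the smallest pending key, then refill the map from the source it came from.
        while (!pending.isEmpty()) {
            final Long smallest = pending.firstKey();
            final Iterator<Long> source = pending.remove(smallest);
            merged.add(smallest);
            if (source.hasNext()) {
                pending.put(source.next(), source);
            }
        }
        return merged;
    }
}

Merging, for example, the sources [1, 4] and [2, 3] yields [1, 2, 3, 4]; mergeJournals() does the same with StandardProvenanceEventRecord instances ordered by getEventId().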
Use of org.apache.nifi.provenance.serialization.RecordReader in project nifi by apache.
Class DeleteIndexAction, method execute().
@Override
public File execute(final File expiredFile) throws IOException {
// Determine the max event ID present in the file that we are deleting.
final long numDeleted = 0;
long maxEventId = -1L;
try (final RecordReader reader = RecordReaders.newRecordReader(expiredFile, repository.getAllLogFiles(), Integer.MAX_VALUE)) {
maxEventId = reader.getMaxEventId();
} catch (final IOException ioe) {
logger.warn("Failed to obtain max ID present in journal file {}", expiredFile.getAbsolutePath());
}
// remove the records from the index
final List<File> indexDirs = indexConfiguration.getIndexDirectories(expiredFile);
for (final File indexingDirectory : indexDirs) {
final Term term = new Term(FieldNames.STORAGE_FILENAME, LuceneUtil.substringBefore(expiredFile.getName(), "."));
boolean deleteDir = false;
final EventIndexWriter writer = indexManager.borrowIndexWriter(indexingDirectory);
try {
final IndexWriter indexWriter = writer.getIndexWriter();
indexWriter.deleteDocuments(term);
indexWriter.commit();
final int docsLeft = indexWriter.numDocs();
deleteDir = docsLeft <= 0;
logger.debug("After expiring {}, there are {} docs left for index {}", expiredFile, docsLeft, indexingDirectory);
} finally {
indexManager.returnIndexWriter(writer);
}
// If no documents remain in this index, delete the index directory.
if (deleteDir) {
indexManager.removeIndex(indexingDirectory);
indexConfiguration.removeIndexDirectory(indexingDirectory);
deleteDirectory(indexingDirectory);
logger.info("Removed empty index directory {}", indexingDirectory);
}
}
// Update the minimum ID indexed to 1 more than the max Event ID in this file.
if (maxEventId > -1L) {
indexConfiguration.setMinIdIndexed(maxEventId + 1L);
}
logger.info("Deleted Indices for Expired Provenance File {} from {} index files; {} documents removed", expiredFile, indexDirs.size(), numDeleted);
return expiredFile;
}
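The index manipulation in execute() boils down to deleting every Lucene document whose storage-filename field matches the expired file's basename, committing, and checking whether the index is now empty. The sketch below shows just that step against a raw IndexWriter; the class name is made up, and the literal field name "storageFilename" is an assumption standing in for the FieldNames.STORAGE_FILENAME constant used above.

import java.io.IOException;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;

class ExpireIndexSketch {

    // Deletes every document indexed for the expired event file and reports
    // whether the index is now empty and the directory can itself be removed.
    static boolean deleteEventsForFile(final IndexWriter indexWriter, final String storageFileBasename) throws IOException {
        final Term term = new Term("storageFilename", storageFileBasename);
        indexWriter.deleteDocuments(term);
        indexWriter.commit();
        return indexWriter.numDocs() <= 0;
    }
}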
Use of org.apache.nifi.provenance.serialization.RecordReader in project nifi by apache.
Class DocsReader, method read().
public Set<ProvenanceEventRecord> read(final List<Document> docs, final EventAuthorizer authorizer, final Collection<Path> allProvenanceLogFiles, final AtomicInteger retrievalCount, final int maxResults, final int maxAttributeChars) throws IOException {
if (retrievalCount.get() >= maxResults) {
return Collections.emptySet();
}
final long start = System.nanoTime();
final Set<ProvenanceEventRecord> matchingRecords = new LinkedHashSet<>();
final Map<String, List<Document>> byStorageNameDocGroups = LuceneUtil.groupDocsByStorageFileName(docs);
int eventsReadThisFile = 0;
int logFileCount = 0;
for (String storageFileName : byStorageNameDocGroups.keySet()) {
final File provenanceEventFile = LuceneUtil.getProvenanceLogFile(storageFileName, allProvenanceLogFiles);
if (provenanceEventFile == null) {
logger.warn("Could not find Provenance Log File with " + "basename {} in the Provenance Repository; assuming " + "file has expired and continuing without it", storageFileName);
continue;
}
try (final RecordReader reader = RecordReaders.newRecordReader(provenanceEventFile, allProvenanceLogFiles, maxAttributeChars)) {
final Iterator<Document> docIter = byStorageNameDocGroups.get(storageFileName).iterator();
while (docIter.hasNext() && retrievalCount.getAndIncrement() < maxResults) {
final ProvenanceEventRecord event = getRecord(docIter.next(), reader);
if (event != null && authorizer.isAuthorized(event)) {
matchingRecords.add(event);
eventsReadThisFile++;
}
}
} catch (final Exception e) {
logger.warn("Failed to read Provenance Events. The event file '" + provenanceEventFile.getAbsolutePath() + "' may be missing or corrupt.", e);
}
}
logger.debug("Read {} records from previous file", eventsReadThisFile);
final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
logger.debug("Took {} ms to read {} events from {} prov log files", millis, matchingRecords.size(), logFileCount);
return matchingRecords;
}
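Before any file is opened, read() groups the matching Lucene Documents by the event file they were indexed from, so each Provenance Event Log File is opened at most once no matter how many hits it contains. The sketch below shows that grouping in isolation; the class name is made up, and the literal field name "storageFilename" is an assumption in place of the FieldNames.STORAGE_FILENAME constant that LuceneUtil.groupDocsByStorageFileName works from.

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.document.Document;

class GroupDocsSketch {

    // Buckets Lucene hits by the name of the event file they point at,
    // mirroring what LuceneUtil.groupDocsByStorageFileName provides to read().
    static Map<String, List<Document>> groupByStorageFile(final List<Document> docs) {
        final Map<String, List<Document>> byStorageFile = new LinkedHashMap<>();
        for (final Document doc : docs) {
            final String storageFilename = doc.get("storageFilename");
            byStorageFile.computeIfAbsent(storageFilename, key -> new ArrayList<>()).add(doc);
        }
        return byStorageFile;
    }
}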