Use of org.apache.nifi.provenance.serialization.RecordReader in project nifi by apache.
The class WriteAheadStorePartition, method reindexLatestEvents.
void reindexLatestEvents(final EventIndex eventIndex) {
    final List<File> eventFiles = getEventFilesFromDisk().sorted(DirectoryUtils.SMALLEST_ID_FIRST).collect(Collectors.toList());
    if (eventFiles.isEmpty()) {
        return;
    }

    final long minEventIdToReindex = eventIndex.getMinimumEventIdToReindex(partitionName);
    final long maxEventId = getMaxEventId();
    final long eventsToReindex = maxEventId - minEventIdToReindex;

    logger.info("The last Provenance Event indexed for partition {} is {}, but the last event written to the partition has ID {}. "
        + "Re-indexing up to the last {} events in {} to ensure that the Event Index is accurate and up-to-date",
        partitionName, minEventIdToReindex, maxEventId, eventsToReindex, partitionDirectory);

    // Find the first event file that we care about.
    int firstEventFileIndex = 0;
    for (int i = eventFiles.size() - 1; i >= 0; i--) {
        final File eventFile = eventFiles.get(i);
        final long minIdInFile = DirectoryUtils.getMinId(eventFile);
        if (minIdInFile <= minEventIdToReindex) {
            firstEventFileIndex = i;
            break;
        }
    }

    // Create a subList that contains the files of interest.
    final List<File> eventFilesToReindex = eventFiles.subList(firstEventFileIndex, eventFiles.size());

    final ExecutorService executor = Executors.newFixedThreadPool(Math.min(4, eventFilesToReindex.size()), new NamedThreadFactory("Re-Index Provenance Events", true));
    final List<Future<?>> futures = new ArrayList<>(eventFilesToReindex.size());
    final AtomicLong reindexedCount = new AtomicLong(0L);

    // Re-index the last batch of events. We don't use an Event Iterator here because one of the
    // event files could be corrupt (for example, if NiFi dies while writing to the file, a record
    // may be incomplete). We don't want that to prevent us from moving on and indexing the rest of
    // the un-indexed events, so we just use a List of files and create a reader for each one.
    final long start = System.nanoTime();
    int fileCount = 0;
    for (final File eventFile : eventFilesToReindex) {
        final boolean skipToEvent = fileCount++ == 0;

        final Runnable reindexTask = new Runnable() {
            @Override
            public void run() {
                final Map<ProvenanceEventRecord, StorageSummary> storageMap = new HashMap<>(1000);

                try (final RecordReader recordReader = recordReaderFactory.newRecordReader(eventFile, Collections.emptyList(), Integer.MAX_VALUE)) {
                    if (skipToEvent) {
                        final Optional<ProvenanceEventRecord> eventOption = recordReader.skipToEvent(minEventIdToReindex);
                        if (!eventOption.isPresent()) {
                            return;
                        }
                    }

                    StandardProvenanceEventRecord event = null;
                    while (true) {
                        final long startBytesConsumed = recordReader.getBytesConsumed();

                        event = recordReader.nextRecord();
                        if (event == null) {
                            eventIndex.reindexEvents(storageMap);
                            reindexedCount.addAndGet(storageMap.size());
                            storageMap.clear();
                            // Stop reading from this file.
                            break;
                        } else {
                            final long eventSize = recordReader.getBytesConsumed() - startBytesConsumed;
                            storageMap.put(event, new StorageSummary(event.getEventId(), eventFile.getName(), partitionName, recordReader.getBlockIndex(), eventSize, 0L));

                            // Re-index in batches of 1,000 events to bound memory usage.
                            if (storageMap.size() == 1000) {
                                eventIndex.reindexEvents(storageMap);
                                reindexedCount.addAndGet(storageMap.size());
                                storageMap.clear();
                            }
                        }
                    }
                } catch (final EOFException eof) {
                    // Ran out of data. Continue on.
                    logger.warn("Failed to find event with ID {} in Event File {} due to {}", minEventIdToReindex, eventFile, eof.toString());
                } catch (final Exception e) {
                    logger.error("Failed to index Provenance Events found in {}", eventFile, e);
                }
            }
        };

        futures.add(executor.submit(reindexTask));
    }

    for (final Future<?> future : futures) {
        try {
            future.get();
        } catch (final ExecutionException ee) {
            logger.error("Failed to re-index some Provenance events. These events may not be query-able via the Provenance interface", ee.getCause());
        } catch (final InterruptedException e) {
            Thread.currentThread().interrupt();
            logger.error("Interrupted while waiting for Provenance events to be re-indexed", e);
            break;
        }
    }

    try {
        eventIndex.commitChanges(partitionName);
    } catch (final IOException e) {
        logger.error("Failed to re-index Provenance Events for partition " + partitionName, e);
    }

    executor.shutdown();

    final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
    final long seconds = millis / 1000L;
    final long millisRemainder = millis % 1000L;
    logger.info("Finished re-indexing {} events across {} files for {} in {}.{} seconds", reindexedCount.get(), eventFilesToReindex.size(), partitionDirectory, seconds, String.format("%03d", millisRemainder));
}
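
The file-selection loop above works because each event file's name begins with the smallest event ID stored in that file, so files can be ordered and filtered from their names alone, without opening any of them. A minimal sketch of that convention, assuming names like 1000.prov or 1000.prov.gz (the helper below is illustrative, not NiFi's actual DirectoryUtils):

    import java.io.File;
    import java.util.Comparator;

    public class EventFileOrdering {

        // Parse the leading event ID from names such as "1000.prov" or "1000.prov.gz".
        // Returns -1 if the name does not start with a number (a "no ID" sentinel).
        static long getMinId(final File file) {
            final String name = file.getName();
            final int dotIndex = name.indexOf('.');
            final String base = dotIndex < 0 ? name : name.substring(0, dotIndex);
            try {
                return Long.parseLong(base);
            } catch (final NumberFormatException nfe) {
                return -1L;
            }
        }

        // Smallest-ID-first ordering, analogous in spirit to DirectoryUtils.SMALLEST_ID_FIRST.
        static final Comparator<File> SMALLEST_ID_FIRST = Comparator.comparingLong(EventFileOrdering::getMinId);

        public static void main(final String[] args) {
            final File a = new File("2000.prov");
            final File b = new File("1000.prov.gz");
            System.out.println(SMALLEST_ID_FIRST.compare(b, a) < 0); // true: 1000 sorts before 2000
        }
    }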
Use of org.apache.nifi.provenance.serialization.RecordReader in project nifi by apache.
The class WriteAheadStorePartition, method initialize.
@Override
public synchronized void initialize() throws IOException {
    if (!partitionDirectory.exists()) {
        Files.createDirectories(partitionDirectory.toPath());
    }

    final File[] files = partitionDirectory.listFiles(DirectoryUtils.EVENT_FILE_FILTER);
    if (files == null) {
        throw new IOException("Could not access files in the " + partitionDirectory + " directory");
    }

    // We need to determine the largest Event ID in this partition. To do this, we iterate over all
    // files starting with the file that has the greatest ID, and try to find the largest Event ID in
    // that file. Once we successfully determine the greatest Event ID in any one of the files, we are
    // done, since we are iterating over the files in order of largest Event ID to smallest.
    long maxEventId = -1L;
    final List<File> fileList = Arrays.asList(files);
    Collections.sort(fileList, DirectoryUtils.LARGEST_ID_FIRST);
    for (final File file : fileList) {
        try (final RecordReader reader = recordReaderFactory.newRecordReader(file, Collections.emptyList(), Integer.MAX_VALUE)) {
            final long eventId = reader.getMaxEventId();
            if (eventId > maxEventId) {
                maxEventId = eventId;
                break;
            }
        } catch (final Exception e) {
            logger.warn("Could not read file {}; if this file contains Provenance Events, new events may be created with the same event identifiers", file, e);
        }
    }

    synchronized (minEventIdToPathMap) {
        for (final File file : fileList) {
            final long minEventId = DirectoryUtils.getMinId(file);
            minEventIdToPathMap.put(minEventId, file);
        }
    }

    this.maxEventId.set(maxEventId);

    // If configured to compress, clean up after any compression that was interrupted by a restart.
    if (config.isCompressOnRollover()) {
        final File[] uncompressedFiles = partitionDirectory.listFiles(f -> f.getName().endsWith(".prov"));
        if (uncompressedFiles != null) {
            for (final File file : uncompressedFiles) {
                // If we have both a compressed file and an uncompressed file for the same .prov file,
                // then we must have been in the process of compressing it when NiFi was restarted.
                // Delete the partial .gz file; compression will start over.
                final File compressed = new File(file.getParentFile(), file.getName() + ".gz");
                if (compressed.exists()) {
                    compressed.delete();
                }
            }
        }
    }

    // Update the ID Generator to the max of its current value and maxEventId + 1.
    final long nextPartitionId = maxEventId + 1;
    final long updatedId = idGenerator.updateAndGet(curVal -> Math.max(curVal, nextPartitionId));
    logger.info("After recovering {}, next Event ID to be generated will be {}", partitionDirectory, updatedId);
}
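
The last step advances a shared ID generator with AtomicLong.updateAndGet, so concurrent recovery of several partitions can only ever move the counter forward. A self-contained sketch of the same pattern (the class and method names here are made up for illustration):

    import java.util.concurrent.atomic.AtomicLong;

    public class MonotonicIdGenerator {

        private final AtomicLong idGenerator = new AtomicLong(0L);

        // Advance the generator to at least nextId, never moving it backwards,
        // even when called concurrently from multiple recovering partitions.
        long advanceTo(final long nextId) {
            return idGenerator.updateAndGet(curVal -> Math.max(curVal, nextId));
        }

        public static void main(final String[] args) {
            final MonotonicIdGenerator gen = new MonotonicIdGenerator();
            System.out.println(gen.advanceTo(500L));  // 500
            System.out.println(gen.advanceTo(100L));  // still 500: never moves backwards
            System.out.println(gen.advanceTo(1200L)); // 1200
        }
    }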
Use of org.apache.nifi.provenance.serialization.RecordReader in project nifi-minifi by apache.
The class MiNiFiPersistentProvenanceRepository, method purgeExpiredIndexes.
private void purgeExpiredIndexes() throws IOException {
    // Now that we have potentially removed expired Provenance Event Log Files, we can look at
    // whether or not we can delete any of the indexes. An index can be deleted if all of the
    // data associated with that index has already been deleted. To test this, we get the
    // timestamp of the earliest event and compare it to the latest timestamp that would be
    // indexed by the earliest index. If the event occurred after the timestamp of the latest
    // index, then we can delete the entire index altogether.

    // Find all of the index directories.
    final List<File> indexDirs = getAllIndexDirectories();
    if (indexDirs.size() < 2) {
        this.firstEventTimestamp = determineFirstEventTimestamp();
        return;
    }

    // Indexes are named "index-XXX" where the XXX is the timestamp of the earliest event that
    // could be in the index. Once we have finished with one index, we move on to another index,
    // but we don't move on until we are finished with the previous index. Therefore, an efficient
    // way to determine the latest timestamp of one index is to look at the timestamp of the next
    // index (these could potentially overlap for one millisecond). This is efficient because we
    // can determine the earliest timestamp of an index simply by looking at the name of the
    // index's directory.
    final long latestTimestampOfFirstIndex = getIndexTimestamp(indexDirs.get(1));

    // Get the timestamp of the first event in the first Provenance Event Log File and the ID of
    // the last event in the event file.
    final List<File> logFiles = getSortedLogFiles();
    if (logFiles.isEmpty()) {
        this.firstEventTimestamp = System.currentTimeMillis();
        return;
    }

    final File firstLogFile = logFiles.get(0);
    long earliestEventTime = System.currentTimeMillis();
    long maxEventId = -1L;
    try (final RecordReader reader = RecordReaders.newRecordReader(firstLogFile, null, Integer.MAX_VALUE)) {
        final StandardProvenanceEventRecord event = reader.nextRecord();
        earliestEventTime = event.getEventTime();
        maxEventId = reader.getMaxEventId();
    } catch (final IOException ioe) {
        logger.warn("Unable to determine the maximum ID for Provenance Event Log File {}; values reported for the number of "
            + "events in the Provenance Repository may be inaccurate.", firstLogFile);
    }

    // Check whether we can delete the index safely.
    if (latestTimestampOfFirstIndex <= earliestEventTime) {
        // We can safely delete the first index because the latest event in the index has
        // already been expired from the repository.
        final File indexingDirectory = indexDirs.get(0);
        getIndexManager().removeIndex(indexingDirectory);
        indexConfig.removeIndexDirectory(indexingDirectory);
        deleteDirectory(indexingDirectory);

        if (maxEventId > -1L) {
            indexConfig.setMinIdIndexed(maxEventId + 1L);
        }
    }

    this.firstEventTimestamp = earliestEventTime;
}
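
getIndexTimestamp can presumably recover the earliest possible event timestamp of an index purely from the "index-XXX" directory name described in the comments above. A hypothetical sketch of that parsing (the helper and its error handling are assumptions, not the repository's actual code):

    import java.io.File;

    public class IndexDirectories {

        // Extract the epoch-millisecond timestamp encoded in an "index-XXX" directory name.
        // Throws NumberFormatException if the name does not follow the convention.
        static long getIndexTimestamp(final File indexDir) {
            final String name = indexDir.getName();
            final String prefix = "index-";
            if (!name.startsWith(prefix)) {
                throw new NumberFormatException("Not an index directory: " + name);
            }
            return Long.parseLong(name.substring(prefix.length()));
        }

        public static void main(final String[] args) {
            // The earliest event in this index occurred at or after this timestamp.
            System.out.println(getIndexTimestamp(new File("index-1498516200000"))); // 1498516200000
        }
    }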
Use of org.apache.nifi.provenance.serialization.RecordReader in project nifi-minifi by apache.
The class MiNiFiPersistentProvenanceRepository, method getEvents.
@Override
public List<ProvenanceEventRecord> getEvents(final long firstRecordId, final int maxRecords, final NiFiUser user) throws IOException {
    final List<ProvenanceEventRecord> records = new ArrayList<>(maxRecords);

    final List<Path> paths = getPathsForId(firstRecordId);
    if (paths == null || paths.isEmpty()) {
        return records;
    }

    for (final Path path : paths) {
        try (RecordReader reader = RecordReaders.newRecordReader(path.toFile(), getAllLogFiles(), maxAttributeChars)) {
            // For the first file, use the Table of Contents to skip directly to the block that
            // contains the first record we want, rather than reading data we don't care about.
            if (records.isEmpty()) {
                final TocReader tocReader = reader.getTocReader();
                if (tocReader != null) {
                    final Integer blockIndex = tocReader.getBlockIndexForEventId(firstRecordId);
                    if (blockIndex != null) {
                        reader.skipToBlock(blockIndex);
                    }
                }
            }

            StandardProvenanceEventRecord record;
            while (records.size() < maxRecords && (record = reader.nextRecord()) != null) {
                if (record.getEventId() >= firstRecordId && isAuthorized(record, user)) {
                    records.add(record);
                }
            }
        } catch (final EOFException | FileNotFoundException fnfe) {
            // Assume the file aged off (or, in the case of an EOFException, that there is no data
            // in the file because it was cached by the operating system when the entire O/S
            // crashed and always.sync was not turned on).
        } catch (final IOException ioe) {
            logger.error("Failed to read Provenance Event File {} due to {}", path.toFile(), ioe.toString());
            logger.error("", ioe);
            eventReporter.reportEvent(Severity.ERROR, EVENT_CATEGORY, "Failed to read Provenance Event File " + path.toFile() + " due to " + ioe.toString());
        }

        if (records.size() >= maxRecords) {
            break;
        }
    }

    if (logger.isDebugEnabled()) {
        logger.debug("Retrieving up to {} records starting at Event ID {}; returning {} events", maxRecords, firstRecordId, records.size());
    }

    return records;
}
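
A caller can use the monotonically increasing event IDs to page through the repository with this method. A hedged usage sketch, assuming a repository instance and an authorized NiFiUser are supplied by the surrounding application (this caller is illustrative, not part of the project):

    import java.io.IOException;
    import java.util.List;
    import org.apache.nifi.authorization.user.NiFiUser;
    import org.apache.nifi.provenance.ProvenanceEventRecord;

    public class ProvenancePager {

        // Pages through all events in the repository, 1,000 at a time. The repository class is
        // assumed to be importable from the surrounding MiNiFi application.
        static void printAllEvents(final MiNiFiPersistentProvenanceRepository repository, final NiFiUser user) throws IOException {
            long nextId = 0L;
            List<ProvenanceEventRecord> page = repository.getEvents(nextId, 1000, user);
            while (!page.isEmpty()) {
                for (final ProvenanceEventRecord event : page) {
                    System.out.printf("Event %d: %s%n", event.getEventId(), event.getEventType());
                }
                // Event IDs are monotonically increasing, so resume after the last one seen.
                nextId = page.get(page.size() - 1).getEventId() + 1;
                page = repository.getEvents(nextId, 1000, user);
            }
        }
    }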
Use of org.apache.nifi.provenance.serialization.RecordReader in project nifi-minifi by apache.
The class MiNiFiPersistentProvenanceRepository, method recover.
private void recover() throws IOException {
    long maxId = -1L;
    long maxIndexedId = -1L;
    long minIndexedId = Long.MAX_VALUE;

    final List<File> filesToRecover = new ArrayList<>();
    for (final File file : configuration.getStorageDirectories().values()) {
        final File[] matchingFiles = file.listFiles(new FileFilter() {
            @Override
            public boolean accept(final File pathname) {
                final String filename = pathname.getName();
                if (!filename.contains(FILE_EXTENSION) || filename.endsWith(TEMP_FILE_SUFFIX)) {
                    return false;
                }
                final String baseFilename = filename.substring(0, filename.indexOf("."));
                return NUMBER_PATTERN.matcher(baseFilename).matches();
            }
        });
        for (final File matchingFile : matchingFiles) {
            filesToRecover.add(matchingFile);
        }
    }

    final SortedMap<Long, Path> sortedPathMap = new TreeMap<>(new Comparator<Long>() {
        @Override
        public int compare(final Long o1, final Long o2) {
            return Long.compare(o1, o2);
        }
    });

    File maxIdFile = null;
    for (final File file : filesToRecover) {
        final String filename = file.getName();
        final String baseName = filename.substring(0, filename.indexOf("."));
        final long firstId = Long.parseLong(baseName);
        sortedPathMap.put(firstId, file.toPath());

        if (firstId > maxId) {
            maxId = firstId;
            maxIdFile = file;
        }
        if (firstId > maxIndexedId) {
            maxIndexedId = firstId - 1;
        }
        if (firstId < minIndexedId) {
            minIndexedId = firstId;
        }
    }

    if (maxIdFile != null) {
        // Determine the max ID in the last file.
        try (final RecordReader reader = RecordReaders.newRecordReader(maxIdFile, getAllLogFiles(), maxAttributeChars)) {
            final long eventId = reader.getMaxEventId();
            if (eventId > maxId) {
                maxId = eventId;
            }
            // Update the max indexed ID.
            if (eventId > maxIndexedId) {
                maxIndexedId = eventId;
            }
        } catch (final IOException ioe) {
            logger.error("Failed to read Provenance Event File {} due to {}", maxIdFile, ioe);
            logger.error("", ioe);
        }
    }

    if (maxIndexedId > -1L) {
        // If we have indexed anything, then set the max ID indexed.
        indexConfig.setMaxIdIndexed(maxIndexedId);
    }
    if (minIndexedId < Long.MAX_VALUE) {
        indexConfig.setMinIdIndexed(minIndexedId);
    }

    idGenerator.set(maxId + 1);

    try {
        final Set<File> recoveredJournals = recoverJournalFiles();
        filesToRecover.addAll(recoveredJournals);

        // Find the file that has the greatest ID.
        File greatestMinIdFile = null;
        long greatestMinId = 0L;
        for (final File recoveredJournal : recoveredJournals) {
            // If the file was removed because the journals were empty, don't count it.
            if (!recoveredJournal.exists()) {
                continue;
            }

            final String basename = LuceneUtil.substringBefore(recoveredJournal.getName(), ".");
            try {
                final long minId = Long.parseLong(basename);
                sortedPathMap.put(minId, recoveredJournal.toPath());
                if (greatestMinIdFile == null || minId > greatestMinId) {
                    greatestMinId = minId;
                    greatestMinIdFile = recoveredJournal;
                }
            } catch (final NumberFormatException nfe) {
                // Not a file we care about...
            }
        }

        // Read the records in the last file to find its max ID.
        if (greatestMinIdFile != null) {
            try (final RecordReader recordReader = RecordReaders.newRecordReader(greatestMinIdFile, Collections.<Path>emptyList(), maxAttributeChars)) {
                maxId = recordReader.getMaxEventId();
            }
        }

        // Set the ID Generator 1 greater than the max ID.
        idGenerator.set(maxId + 1);
    } catch (final IOException ioe) {
        logger.error("Failed to recover Journal Files due to {}", ioe.toString());
        logger.error("", ioe);
    }

    idToPathMap.set(Collections.unmodifiableSortedMap(sortedPathMap));
    logger.trace("In recovery, path map: {}", sortedPathMap);

    final long recordsRecovered;
    if (minIndexedId < Long.MAX_VALUE) {
        recordsRecovered = idGenerator.get() - minIndexedId;
    } else {
        recordsRecovered = idGenerator.get();
    }
    logger.info("Recovered {} records", recordsRecovered);
    recoveryFinished.set(true);
}
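
The recovery scan depends entirely on the event-log naming scheme: a numeric base name (the ID of the first event in the file) followed by the file extension. A standalone sketch of that filter, with assumed values for FILE_EXTENSION, TEMP_FILE_SUFFIX, and NUMBER_PATTERN (the real repository defines its own constants):

    import java.io.File;
    import java.io.FileFilter;
    import java.util.regex.Pattern;

    public class EventLogFileFilter implements FileFilter {

        // Assumed values for illustration; the real repository defines its own constants.
        private static final String FILE_EXTENSION = ".prov";
        private static final String TEMP_FILE_SUFFIX = ".prov.part";
        private static final Pattern NUMBER_PATTERN = Pattern.compile("\\d+");

        @Override
        public boolean accept(final File pathname) {
            final String filename = pathname.getName();
            // Skip files that are not event logs, or that are still being written.
            if (!filename.contains(FILE_EXTENSION) || filename.endsWith(TEMP_FILE_SUFFIX)) {
                return false;
            }
            // The base name must be the numeric ID of the first event in the file.
            final String baseFilename = filename.substring(0, filename.indexOf("."));
            return NUMBER_PATTERN.matcher(baseFilename).matches();
        }

        public static void main(final String[] args) {
            final EventLogFileFilter filter = new EventLogFileFilter();
            System.out.println(filter.accept(new File("1234567.prov")));      // true
            System.out.println(filter.accept(new File("1234567.prov.part"))); // false: still being written
            System.out.println(filter.accept(new File("snapshot.prov")));     // false: non-numeric base name
        }
    }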