Example 16 with RecordReader

use of org.apache.nifi.provenance.serialization.RecordReader in project nifi by apache.

the class WriteAheadStorePartition method reindexLatestEvents.

/**
 * Re-indexes the most recent events stored in this partition's event files.
 * <p>
 * The starting point is the value returned by
 * {@code eventIndex.getMinimumEventIdToReindex(partitionName)}. The event files are sorted
 * smallest-ID-first, the first file of interest is located, and one {@link Runnable} is built
 * per remaining file to read its records.
 * <p>
 * NOTE(review): this listing appears to be extraction-damaged. Closing braces, early
 * {@code return}/{@code break} statements, the task submission and the logger calls that consumed
 * the orphaned message strings are not visible. The comments below describe only what can still
 * be seen.
 *
 * @param eventIndex the index that is queried for the minimum event ID to re-index
 */
void reindexLatestEvents(final EventIndex eventIndex) {
    // All event files currently on disk, ordered so that the file with the smallest ID comes first.
    final List<File> eventFiles = getEventFilesFromDisk().sorted(DirectoryUtils.SMALLEST_ID_FIRST).collect(Collectors.toList());
    // NOTE(review): body of this guard is missing from the listing; presumably an early return when there is nothing to index.
    if (eventFiles.isEmpty()) {
    final long minEventIdToReindex = eventIndex.getMinimumEventIdToReindex(partitionName);
    final long maxEventId = getMaxEventId();
    // NOTE(review): the message string below has lost its logger call prefix. It also has four {} placeholders
    // but five trailing arguments (partitionDirectory is the extra one) - confirm against the real source.
    final long eventsToReindex = maxEventId - minEventIdToReindex;"The last Provenance Event indexed for partition {} is {}, but the last event written to partition has ID {}. " + "Re-indexing up to the last {} events to ensure that the Event Index is accurate and up-to-date", partitionName, minEventIdToReindex, maxEventId, eventsToReindex, partitionDirectory);
    // Find the first event file that we care about.
    // The scan runs from the newest file backwards; a file whose minimum ID is <= the re-index
    // start ID is recorded as the first file of interest.
    int firstEventFileIndex = 0;
    for (int i = eventFiles.size() - 1; i >= 0; i--) {
        final File eventFile = eventFiles.get(i);
        final long minIdInFile = DirectoryUtils.getMinId(eventFile);
        if (minIdInFile <= minEventIdToReindex) {
            // NOTE(review): presumably followed by a break (not visible); without one the loop would keep overwriting this index.
            firstEventFileIndex = i;
    // Create a subList that contains the files of interest
    final List<File> eventFilesToReindex = eventFiles.subList(firstEventFileIndex, eventFiles.size());
    // At most 4 threads, fewer when there are fewer files to process.
    final ExecutorService executor = Executors.newFixedThreadPool(Math.min(4, eventFilesToReindex.size()), new NamedThreadFactory("Re-Index Provenance Events", true));
    final List<Future<?>> futures = new ArrayList<>(eventFilesToReindex.size());
    final AtomicLong reindexedCount = new AtomicLong(0L);
    // Re-Index the last bunch of events.
    // We don't use an Event Iterator here because it's possible that one of the event files could be corrupt (for example, if NiFi dies while
    // writing to the file, a record may be incomplete). We don't want to prevent us from moving on and continuing to index the rest of the
    // un-indexed events. So we just use a List of files and create a reader for each one.
    final long start = System.nanoTime();
    int fileCount = 0;
    for (final File eventFile : eventFilesToReindex) {
        // Only the first file of interest needs its reader advanced to the start ID.
        final boolean skipToEvent;
        if (fileCount++ == 0) {
            skipToEvent = true;
        } else {
            skipToEvent = false;
        final Runnable reindexTask = new Runnable() {

            public void run() {
                // Accumulates events together with where they are stored; initial capacity 1000.
                final Map<ProvenanceEventRecord, StorageSummary> storageMap = new HashMap<>(1000);
                try (final RecordReader recordReader = recordReaderFactory.newRecordReader(eventFile, Collections.emptyList(), Integer.MAX_VALUE)) {
                    if (skipToEvent) {
                        final Optional<ProvenanceEventRecord> eventOption = recordReader.skipToEvent(minEventIdToReindex);
                        // NOTE(review): handling of the "event not found" case is not visible in this listing.
                        if (!eventOption.isPresent()) {
                    StandardProvenanceEventRecord event = null;
                    while (true) {
                        // Bytes consumed before and after the read give the serialized size of the record.
                        final long startBytesConsumed = recordReader.getBytesConsumed();
                        event = recordReader.nextRecord();
                        if (event == null) {
                            // stop reading from this file
                        } else {
                            final long eventSize = recordReader.getBytesConsumed() - startBytesConsumed;
                            storageMap.put(event, new StorageSummary(event.getEventId(), eventFile.getName(), partitionName, recordReader.getBlockIndex(), eventSize, 0L));
                            // NOTE(review): what happens once 1000 events are buffered is missing here; presumably the batch is handed to the index and the map cleared.
                            if (storageMap.size() == 1000) {
                } catch (final EOFException eof) {
                    // Ran out of data. Continue on.
                    logger.warn("Failed to find event with ID {} in Event File {} due to {}", minEventIdToReindex, eventFile, eof.toString());
                } catch (final Exception e) {
                    // Any other failure is logged with its stack trace; it is not rethrown in the visible code.
                    logger.error("Failed to index Provenance Events found in {}", eventFile, e);
    // Wait for the per-file tasks. NOTE(review): the future.get() call is not visible in this listing.
    for (final Future<?> future : futures) {
        try {
        } catch (final ExecutionException ee) {
            logger.error("Failed to re-index some Provenance events. These events may not be query-able via the Provenance interface", ee.getCause());
        } catch (final InterruptedException e) {
            // NOTE(review): no Thread.currentThread().interrupt() is visible after this log call - confirm the interrupt flag is restored.
            logger.error("Interrupted while waiting for Provenance events to be re-indexed", e);
    // NOTE(review): the statement guarded by this try block is missing from the listing.
    try {
    } catch (final IOException e) {
        logger.error("Failed to re-index Provenance Events for partition " + partitionName, e);
    // Elapsed wall-clock time, split into whole seconds and the millisecond remainder.
    final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
    final long seconds = millis / 1000L;
    // NOTE(review): millisRemainder is substituted into "{}.{} seconds" without zero padding, so 1005 ms would read "1.5 seconds".
    final long millisRemainder = millis % 1000L;"Finished re-indexing {} events across {} files for {} in {}.{} seconds", reindexedCount.get(), eventFilesToReindex.size(), partitionDirectory, seconds, millisRemainder);
Also used : HashMap(java.util.HashMap) RecordReader(org.apache.nifi.provenance.serialization.RecordReader) ArrayList(java.util.ArrayList) StandardProvenanceEventRecord(org.apache.nifi.provenance.StandardProvenanceEventRecord) StorageSummary(org.apache.nifi.provenance.serialization.StorageSummary) ProvenanceEventRecord(org.apache.nifi.provenance.ProvenanceEventRecord) StandardProvenanceEventRecord(org.apache.nifi.provenance.StandardProvenanceEventRecord) EOFException(java.io.EOFException) ExecutionException(java.util.concurrent.ExecutionException) NamedThreadFactory(org.apache.nifi.provenance.util.NamedThreadFactory) IOException(java.io.IOException) IOException(java.io.IOException) EOFException(java.io.EOFException) ExecutionException(java.util.concurrent.ExecutionException) AtomicLong(java.util.concurrent.atomic.AtomicLong) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) File(java.io.File)

Example 17 with RecordReader

use of org.apache.nifi.provenance.serialization.RecordReader in project nifi by apache.

the class WriteAheadStorePartition method initialize.

/**
 * Prepares this partition for use by scanning the event files in {@code partitionDirectory}.
 * <p>
 * The visible steps are: list the event files, look for the largest event ID by reading files in
 * largest-ID-first order, register every file in {@code minEventIdToPathMap} under its minimum
 * event ID, look for leftover {@code .gz} files next to uncompressed {@code .prov} files when
 * compress-on-rollover is configured, and raise the ID generator to at least {@code maxEventId + 1}.
 * <p>
 * NOTE(review): this listing appears to be extraction-damaged; closing braces and several
 * statements (directory creation, loop exits, file deletion, the final log call) are not visible.
 *
 * @throws IOException if the files in the partition directory cannot be listed
 */
public synchronized void initialize() throws IOException {
    // NOTE(review): body missing from the listing; presumably creates the directory when it does not exist.
    if (!partitionDirectory.exists()) {
    final File[] files = partitionDirectory.listFiles(DirectoryUtils.EVENT_FILE_FILTER);
    // listFiles returns null when the directory cannot be read.
    if (files == null) {
        throw new IOException("Could not access files in the " + partitionDirectory + " directory");
    // We need to determine what the largest Event ID is in this partition. To do this, we
    // iterate over all files starting with the file that has the greatest ID, and try to find
    // the largest Event ID in that file. Once we successfully determine the greatest Event ID
    // in any one of the files, we are done, since we are iterating over the files in order of
    // the Largest Event ID to the smallest.
    long maxEventId = -1L;
    // Arrays.asList is a view over the array, so the sort below reorders 'files' itself.
    final List<File> fileList = Arrays.asList(files);
    Collections.sort(fileList, DirectoryUtils.LARGEST_ID_FIRST);
    for (final File file : fileList) {
        try {
            // NOTE(review): the reader is not opened in a try-with-resources statement and no close() call is
            // visible in this listing - confirm that it is released.
            final RecordReader reader = recordReaderFactory.newRecordReader(file, Collections.emptyList(), Integer.MAX_VALUE);
            final long eventId = reader.getMaxEventId();
            if (eventId > maxEventId) {
                // NOTE(review): per the comment above the loop, a break presumably follows; it is not visible here.
                maxEventId = eventId;
        } catch (final Exception e) {
            // An unreadable file is logged and skipped; the next (older) file is tried.
            logger.warn("Could not read file {}; if this file contains Provenance Events, new events may be created with the same event identifiers", file, e);
    // Register every event file under the minimum event ID it contains.
    synchronized (minEventIdToPathMap) {
        for (final File file : fileList) {
            final long minEventId = DirectoryUtils.getMinId(file);
            minEventIdToPathMap.put(minEventId, file);
    // If configured to compress, compress any files that are not yet compressed.
    if (config.isCompressOnRollover()) {
        final File[] uncompressedFiles = partitionDirectory.listFiles(f -> f.getName().endsWith(".prov"));
        if (uncompressedFiles != null) {
            for (final File file : uncompressedFiles) {
                // If we have both a compressed file and an uncompressed file for the same .prov file, then
                // we must have been in the process of compressing it when NiFi was restarted. Delete the partial
                // .gz file and we will start compressing it again.
                final File compressed = new File(file.getParentFile(), file.getName() + ".gz");
                // NOTE(review): the delete / re-compress statements described above are missing from the listing.
                if (compressed.exists()) {
    // Update the ID Generator to the max of the ID Generator or maxEventId
    final long nextPartitionId = maxEventId + 1;
    // NOTE(review): the trailing message string has lost its logger call prefix in this listing.
    final long updatedId = idGenerator.updateAndGet(curVal -> Math.max(curVal, nextPartitionId));"After recovering {}, next Event ID to be generated will be {}", partitionDirectory, updatedId);
Also used : RecordReader(org.apache.nifi.provenance.serialization.RecordReader) IOException(java.io.IOException) File(java.io.File) IOException(java.io.IOException) EOFException(java.io.EOFException) ExecutionException(java.util.concurrent.ExecutionException)

Example 18 with RecordReader

use of org.apache.nifi.provenance.serialization.RecordReader in project nifi-minifi by apache.

the class MiNiFiPersistentProvenanceRepository method purgeExpiredIndexes.

/**
 * Checks whether the oldest index directory can be removed and updates {@code firstEventTimestamp}.
 * <p>
 * The timestamp of the second index directory is compared with the time of the first event in the
 * first (sorted) log file; when the index timestamp is not later than that event time, the first
 * index directory is selected for removal and {@code indexConfig} is told the new minimum indexed ID.
 * <p>
 * NOTE(review): this listing appears to be extraction-damaged; closing braces, early returns and
 * the statement that actually deletes the index directory are not visible.
 *
 * @throws IOException declared by the method; NOTE(review): the statement that can propagate it is not visible in this listing
 */
private void purgeExpiredIndexes() throws IOException {
    // Now that we have potentially removed expired Provenance Event Log Files, we can look at
    // whether or not we can delete any of the indexes. An index can be deleted if all of the
    // data that is associated with that index has already been deleted. In order to test this,
    // we will get the timestamp of the earliest event and then compare that to the latest timestamp
    // that would be indexed by the earliest index. If the event occurred after the timestamp of
    // the latest index, then we can just delete the entire index all together.
    // find all of the index directories
    final List<File> indexDirs = getAllIndexDirectories();
    // With fewer than two index directories there is no "next" index to compare against.
    // NOTE(review): presumably followed by a return (not visible); indexDirs.get(1) below requires at least two entries.
    if (indexDirs.size() < 2) {
        this.firstEventTimestamp = determineFirstEventTimestamp();
    // Indexes are named "index-XXX" where the XXX is the timestamp of the earliest event that
    // could be in the index. Once we have finished with one index, we move on to another index,
    // but we don't move on until we are finished with the previous index.
    // Therefore, an efficient way to determine the latest timestamp of one index is to look at the
    // timestamp of the next index (these could potentially overlap for one millisecond). This is
    // efficient because we can determine the earliest timestamp of an index simply by looking at
    // the name of the Index's directory.
    final long latestTimestampOfFirstIndex = getIndexTimestamp(indexDirs.get(1));
    // Get the timestamp of the first event in the first Provenance Event Log File and the ID of the last event
    // in the event file.
    final List<File> logFiles = getSortedLogFiles();
    // NOTE(review): presumably followed by a return (not visible); logFiles.get(0) below requires a non-empty list.
    if (logFiles.isEmpty()) {
        this.firstEventTimestamp = System.currentTimeMillis();
    final File firstLogFile = logFiles.get(0);
    // Defaults used when the first log file cannot be read.
    long earliestEventTime = System.currentTimeMillis();
    long maxEventId = -1L;
    try (final RecordReader reader = RecordReaders.newRecordReader(firstLogFile, null, Integer.MAX_VALUE)) {
        // NOTE(review): the record is dereferenced without a null check; if nextRecord() can return null
        // for a file with no records, the next line would throw - TODO confirm.
        final StandardProvenanceEventRecord event = reader.nextRecord();
        earliestEventTime = event.getEventTime();
        maxEventId = reader.getMaxEventId();
    } catch (final IOException ioe) {
        // NOTE(review): the caught exception is not passed to the logger, so its message and stack trace are not recorded.
        logger.warn("Unable to determine the maximum ID for Provenance Event Log File {}; values reported for the number of " + "events in the Provenance Repository may be inaccurate.", firstLogFile);
    // check if we can delete the index safely.
    if (latestTimestampOfFirstIndex <= earliestEventTime) {
        // we can safely delete the first index because the latest event in the index is an event
        // that has already been expired from the repository.
        // NOTE(review): the deletion call that uses indexingDirectory is missing from the listing.
        final File indexingDirectory = indexDirs.get(0);
        // Only advance the minimum indexed ID when the max ID of the first log file was actually read.
        if (maxEventId > -1L) {
            indexConfig.setMinIdIndexed(maxEventId + 1L);
    this.firstEventTimestamp = earliestEventTime;
Also used : RecordReader(org.apache.nifi.provenance.serialization.RecordReader) IOException(java.io.IOException) File(java.io.File)

Example 19 with RecordReader

use of org.apache.nifi.provenance.serialization.RecordReader in project nifi-minifi by apache.

the class MiNiFiPersistentProvenanceRepository method getEvents.

/**
 * Reads up to {@code maxRecords} events whose ID is at least {@code firstRecordId} and that the
 * given user is authorized to see.
 * <p>
 * The files returned by {@code getPathsForId(firstRecordId)} are read in order, each through its
 * own {@link RecordReader}.
 * <p>
 * NOTE(review): this listing appears to be extraction-damaged; closing braces, the block-seek
 * call, the statement that adds a record to the result and the loop exit are not visible.
 *
 * @param firstRecordId the lowest event ID that may be returned
 * @param maxRecords the maximum number of events to return
 * @param user the user passed to {@code isAuthorized} for each candidate event
 * @return the collected events; an empty list when no paths are found for {@code firstRecordId}
 * @throws IOException declared by the method; NOTE(review): IOExceptions raised while reading a file are caught and logged in the visible code
 */
public List<ProvenanceEventRecord> getEvents(final long firstRecordId, final int maxRecords, final NiFiUser user) throws IOException {
    final List<ProvenanceEventRecord> records = new ArrayList<>(maxRecords);
    final List<Path> paths = getPathsForId(firstRecordId);
    // No candidate files: return the empty result list.
    if (paths == null || paths.isEmpty()) {
        return records;
    for (final Path path : paths) {
        try (RecordReader reader = RecordReaders.newRecordReader(path.toFile(), getAllLogFiles(), maxAttributeChars)) {
            // NOTE(review): the comment on the next line looks truncated; the table of contents is consulted
            // for the block that holds firstRecordId, presumably so that earlier blocks can be skipped.
            // just to get to the first record that we want.
            // Only relevant while nothing has been collected yet, i.e. for the file containing the first wanted record.
            if (records.isEmpty()) {
                final TocReader tocReader = reader.getTocReader();
                if (tocReader != null) {
                    final Integer blockIndex = tocReader.getBlockIndexForEventId(firstRecordId);
                    // NOTE(review): the call that uses blockIndex is missing from the listing.
                    if (blockIndex != null) {
            StandardProvenanceEventRecord record;
            // Stop when enough records were collected or the reader returns null.
            while (records.size() < maxRecords && (record = reader.nextRecord()) != null) {
                // NOTE(review): the body of this condition (presumably adding the record to the result) is missing from the listing.
                if (record.getEventId() >= firstRecordId && isAuthorized(record, user)) {
        } catch (final EOFException | FileNotFoundException fnfe) {
        // assume file aged off (or there's no data in file, in case of EOFException, which indicates that data was cached
        // in operating system and entire O/S crashed and always.sync was not turned on.)
        } catch (final IOException ioe) {
            // Other I/O failures are logged and reported; the loop continues with the next path.
            logger.error("Failed to read Provenance Event File {} due to {}", path.toFile(), ioe.toString());
            logger.error("", ioe);
            eventReporter.reportEvent(Severity.ERROR, EVENT_CATEGORY, "Failed to read Provenance Event File " + path.toFile() + " due to " + ioe.toString());
        // NOTE(review): body missing from the listing; presumably a break once enough records were collected.
        if (records.size() >= maxRecords) {
    if (logger.isDebugEnabled()) {
        logger.debug("Retrieving up to {} records starting at Event ID {}; returning {} events", maxRecords, firstRecordId, records.size());
    return records;
Also used : Path(java.nio.file.Path) TocReader(org.apache.nifi.provenance.toc.TocReader) RecordReader(org.apache.nifi.provenance.serialization.RecordReader) ArrayList(java.util.ArrayList) FileNotFoundException(java.io.FileNotFoundException) IOException(java.io.IOException) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) EOFException(java.io.EOFException)

Example 20 with RecordReader

use of org.apache.nifi.provenance.serialization.RecordReader in project nifi-minifi by apache.

the class MiNiFiPersistentProvenanceRepository method recover.

/**
 * Rebuilds in-memory state from the event files and journal files found on disk.
 * <p>
 * The visible steps are: scan every storage directory for event files whose base name matches
 * {@code NUMBER_PATTERN}, derive the maximum event ID and the min/max indexed IDs from the file
 * names and from the file with the largest first ID, seed the ID generator, recover journal
 * files, re-seed the ID generator from the recovered journal with the greatest minimum ID, and
 * compute the number of recovered records.
 * <p>
 * NOTE(review): this listing appears to be extraction-damaged; closing braces, several statement
 * bodies and the logger call for the final message are not visible.
 *
 * @throws IOException declared by the method; NOTE(review): IOExceptions from both reader blocks are caught and logged in the visible code
 */
private void recover() throws IOException {
    long maxId = -1L;
    long maxIndexedId = -1L;
    // Long.MAX_VALUE acts as "nothing found yet" and is tested for further down.
    long minIndexedId = Long.MAX_VALUE;
    final List<File> filesToRecover = new ArrayList<>();
    for (final File file : configuration.getStorageDirectories().values()) {
        final File[] matchingFiles = file.listFiles(new FileFilter() {

            public boolean accept(final File pathname) {
                final String filename = pathname.getName();
                // Reject names without the event file extension and names that end with the temp-file suffix.
                if (!filename.contains(FILE_EXTENSION) || filename.endsWith(TEMP_FILE_SUFFIX)) {
                    return false;
                // The part of the name before the first '.' must match NUMBER_PATTERN.
                final String baseFilename = filename.substring(0, filename.indexOf("."));
                return NUMBER_PATTERN.matcher(baseFilename).matches();
        // NOTE(review): matchingFiles is iterated without a null check (listFiles can return null), and the
        // loop body, presumably adding to filesToRecover, is missing from the listing.
        for (final File matchingFile : matchingFiles) {
    final SortedMap<Long, Path> sortedPathMap = new TreeMap<>(new Comparator<Long>() {

        public int compare(final Long o1, final Long o2) {
            // NOTE(review): the next line is garbled in this listing; the call that compares o1 and o2 has lost its receiver and first argument.
            return, o2);
    File maxIdFile = null;
    for (final File file : filesToRecover) {
        // The file name up to the first '.' is parsed as the first event ID stored in the file.
        final String filename = file.getName();
        final String baseName = filename.substring(0, filename.indexOf("."));
        final long firstId = Long.parseLong(baseName);
        sortedPathMap.put(firstId, file.toPath());
        if (firstId > maxId) {
            maxId = firstId;
            maxIdFile = file;
        // NOTE(review): the comparison uses firstId but the assignment stores firstId - 1 - confirm this asymmetry is intended.
        if (firstId > maxIndexedId) {
            maxIndexedId = firstId - 1;
        if (firstId < minIndexedId) {
            minIndexedId = firstId;
    if (maxIdFile != null) {
        // Determine the max ID in the last file.
        try (final RecordReader reader = RecordReaders.newRecordReader(maxIdFile, getAllLogFiles(), maxAttributeChars)) {
            final long eventId = reader.getMaxEventId();
            if (eventId > maxId) {
                maxId = eventId;
            // update the max indexed id
            if (eventId > maxIndexedId) {
                maxIndexedId = eventId;
        } catch (final IOException ioe) {
            logger.error("Failed to read Provenance Event File {} due to {}", maxIdFile, ioe);
            logger.error("", ioe);
    // NOTE(review): the bodies of the next two conditions are missing from the listing.
    if (maxIndexedId > -1L) {
        // If we have indexed anything then set the min/max ID's indexed.
    if (minIndexedId < Long.MAX_VALUE) {
    // First seeding of the ID generator, based on the event files only.
    idGenerator.set(maxId + 1);
    try {
        final Set<File> recoveredJournals = recoverJournalFiles();
        // Find the file that has the greatest ID
        File greatestMinIdFile = null;
        long greatestMinId = 0L;
        for (final File recoveredJournal : recoveredJournals) {
            // if the file was removed because the journals were empty, don't count it
            // NOTE(review): presumably followed by a continue (not visible).
            if (!recoveredJournal.exists()) {
            final String basename = LuceneUtil.substringBefore(recoveredJournal.getName(), ".");
            try {
                final long minId = Long.parseLong(basename);
                sortedPathMap.put(minId, recoveredJournal.toPath());
                if (greatestMinIdFile == null || minId > greatestMinId) {
                    greatestMinId = minId;
                    greatestMinIdFile = recoveredJournal;
            } catch (final NumberFormatException nfe) {
            // not a file we care about...
        // Read the records in the last file to find its max id
        if (greatestMinIdFile != null) {
            try (final RecordReader recordReader = RecordReaders.newRecordReader(greatestMinIdFile, Collections.<Path>emptyList(), maxAttributeChars)) {
                // NOTE(review): unlike the event-file pass above, this assignment is unconditional and can
                // replace a larger maxId with a smaller value - confirm that cannot happen.
                maxId = recordReader.getMaxEventId();
        // set the ID Generator 1 greater than the max id
        idGenerator.set(maxId + 1);
    } catch (final IOException ioe) {
        // Journal recovery failures are logged; they are not rethrown in the visible code.
        logger.error("Failed to recover Journal Files due to {}", ioe.toString());
        logger.error("", ioe);
    logger.trace("In recovery, path map: {}", sortedPathMap);
    // Number of recovered records: span from the smallest first ID to the next ID to be generated,
    // or simply the next ID when no event file was found.
    final long recordsRecovered;
    if (minIndexedId < Long.MAX_VALUE) {
        recordsRecovered = idGenerator.get() - minIndexedId;
    } else {
        recordsRecovered = idGenerator.get();
    // NOTE(review): the message string on the next line has lost its logger call prefix in this listing.
    }"Recovered {} records", recordsRecovered);
Also used : Path(java.nio.file.Path) RecordReader(org.apache.nifi.provenance.serialization.RecordReader) ArrayList(java.util.ArrayList) IOException(java.io.IOException) TreeMap(java.util.TreeMap) TimestampedLong(org.apache.nifi.util.timebuffer.TimestampedLong) AtomicLong(java.util.concurrent.atomic.AtomicLong) FileFilter(java.io.FileFilter) File(java.io.File)


