Search in sources :

Example 11 with RecordWriter

use of org.apache.nifi.provenance.serialization.RecordWriter in project nifi by apache.

the class TestSelectiveRecordReaderEventIterator method testPerformanceOfRandomAccessReads.

/**
 * Manual benchmark for random-access reads through {@link SelectiveRecordReaderEventIterator}.
 *
 * <p>
 * Writes 100,000 events to a single journal file (with an accompanying TOC file) and then,
 * forever, runs batches of 1,000 iterators that each fetch a fixed set of event IDs, printing
 * the elapsed wall-clock time of every batch. The test never terminates on its own, which is
 * why it is {@code @Ignore}d.
 * </p>
 *
 * @throws Exception if the journal cannot be written or read
 */
@Test
@Ignore("For local testing only. Runs indefinitely")
public void testPerformanceOfRandomAccessReads() throws Exception {
    final File dir = new File("target/storage/" + UUID.randomUUID().toString());
    final File journalFile = new File(dir, "/4.prov.gz");
    final File tocFile = TocUtil.getTocFile(journalFile);
    final int blockSize = 1024 * 32;

    // Populate the journal that all subsequent reads are served from.
    try (final RecordWriter writer = createWriter(journalFile, new StandardTocWriter(tocFile, true, false), true, blockSize)) {
        writer.writeHeader(0L);
        for (int i = 0; i < 100_000; i++) {
            writer.writeRecord(TestUtil.createEvent());
        }
    }

    // The IDs to look up on every pass: a mix of isolated IDs and one contiguous run (1024-1030).
    final Long[] eventIds = new Long[] { 4L, 80L, 1024L, 1025L, 1026L, 1027L, 1028L, 1029L, 1030L, 40_000L, 80_000L, 99_000L };
    final RecordReaderFactory readerFactory = (file, logs, maxChars) -> RecordReaders.newRecordReader(file, logs, maxChars);

    // Kept as a variable (rather than a literal 'true') exactly as the loop was originally written.
    boolean loopForever = true;
    while (loopForever) {
        final long start = System.nanoTime();
        for (int i = 0; i < 1000; i++) {
            // NOTE(review): the iterator is never closed here; if it holds an open reader, consider closing it - confirm.
            final SelectiveRecordReaderEventIterator iterator = new SelectiveRecordReaderEventIterator(Collections.singletonList(journalFile), readerFactory, Arrays.asList(eventIds), 32 * 1024);
            for (final long id : eventIds) {
                time(() -> {
                    return iterator.nextEvent().orElse(null);
                }, id);
            }
        }
        final long ms = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start);
        System.out.println(ms + " ms total");
    }
}
Also used : Arrays(java.util.Arrays) StandardTocWriter(org.apache.nifi.provenance.toc.StandardTocWriter) IdentifierLookup(org.apache.nifi.provenance.IdentifierLookup) Callable(java.util.concurrent.Callable) RecordWriter(org.apache.nifi.provenance.serialization.RecordWriter) RecordReaders(org.apache.nifi.provenance.serialization.RecordReaders) ArrayList(java.util.ArrayList) TocWriter(org.apache.nifi.provenance.toc.TocWriter) ProvenanceEventRecord(org.apache.nifi.provenance.ProvenanceEventRecord) RecordReaderFactory(org.apache.nifi.provenance.store.RecordReaderFactory) TocUtil(org.apache.nifi.provenance.toc.TocUtil) IOException(java.io.IOException) Test(org.junit.Test) UUID(java.util.UUID) File(java.io.File) TimeUnit(java.util.concurrent.TimeUnit) AtomicLong(java.util.concurrent.atomic.AtomicLong) List(java.util.List) Ignore(org.junit.Ignore) Assert.assertFalse(org.junit.Assert.assertFalse) Optional(java.util.Optional) TestUtil(org.apache.nifi.provenance.TestUtil) Assert(org.junit.Assert) Collections(java.util.Collections) EventIdFirstSchemaRecordWriter(org.apache.nifi.provenance.EventIdFirstSchemaRecordWriter) Assert.assertEquals(org.junit.Assert.assertEquals) ArrayList(java.util.ArrayList) RecordReaderFactory(org.apache.nifi.provenance.store.RecordReaderFactory) StandardTocWriter(org.apache.nifi.provenance.toc.StandardTocWriter) RecordWriter(org.apache.nifi.provenance.serialization.RecordWriter) EventIdFirstSchemaRecordWriter(org.apache.nifi.provenance.EventIdFirstSchemaRecordWriter) AtomicLong(java.util.concurrent.atomic.AtomicLong) File(java.io.File) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 12 with RecordWriter

use of org.apache.nifi.provenance.serialization.RecordWriter in project nifi by apache.

the class PersistentProvenanceRepository method rollover.

/**
 * <p>
 * MUST be called with the write lock held.
 * </p>
 *
 * Rolls over the data in the journal files, merging them into a single
 * Provenance Event Log File, and compressing and indexing as needed.
 *
 * <p>
 * The method closes every open journal writer, schedules a background task on
 * <code>rolloverExecutor</code> that merges the closed journals (retrying on failure),
 * then blocks while the number of journal files or the repository size is above its
 * threshold, and finally creates a fresh set of writers. It does nothing when the
 * configuration disallows rollover, or when <code>force</code> is false and nothing has
 * been written and no writer is dirty.
 * </p>
 *
 * @param force if true, will force a rollover regardless of whether or not
 * data has been written
 * @throws IOException if unable to complete rollover
 */
private void rollover(final boolean force) throws IOException {
    if (!configuration.isAllowRollover()) {
        return;
    }
    // Roll over only if forced, if something has been written since the last rollover,
    // or if at least one writer has been marked dirty.
    if (force || recordsWrittenSinceRollover.get() > 0L || dirtyWriterCount.get() > 0) {
        // Close every writer that is still open and remember its file so that it can be merged.
        final List<File> journalsToMerge = new ArrayList<>();
        for (final RecordWriter writer : writers) {
            if (!writer.isClosed()) {
                final File writerFile = writer.getFile();
                journalsToMerge.add(writerFile);
                try {
                    writer.close();
                } catch (final IOException ioe) {
                    // A failure to close is logged but does not stop the rollover; the file is still merged.
                    logger.warn("Failed to close {} due to {}", writer, ioe.toString());
                    if (logger.isDebugEnabled()) {
                        // the stack trace is only emitted when debug logging is enabled
                        logger.warn("", ioe);
                    }
                }
            }
        }
        if (logger.isDebugEnabled()) {
            if (journalsToMerge.isEmpty()) {
                logger.debug("No journals to merge; all RecordWriters were already closed");
            } else {
                logger.debug("Going to merge {} files for journals starting with ID {}", journalsToMerge.size(), LuceneUtil.substringBefore(journalsToMerge.get(0).getName(), "."));
            }
        }
        // Choose a storage directory to store the merged file in.
        // An ever-increasing index taken modulo the number of directories rotates through them.
        final long storageDirIdx = storageDirectoryIndex.getAndIncrement();
        final List<File> storageDirs = new ArrayList<>(configuration.getStorageDirectories().values());
        final File storageDir = storageDirs.get((int) (storageDirIdx % storageDirs.size()));
        Future<?> future = null;
        if (!journalsToMerge.isEmpty()) {
            // Run the rollover logic in a background thread.
            // futureReference lets the Runnable reach its own Future so that it can cancel itself.
            final AtomicReference<Future<?>> futureReference = new AtomicReference<>();
            final AtomicInteger retryAttempts = new AtomicInteger(MAX_JOURNAL_ROLLOVER_RETRIES);
            final int recordsWritten = recordsWrittenSinceRollover.getAndSet(0);
            final Runnable rolloverRunnable = new Runnable() {

                @Override
                public void run() {
                    File fileRolledOver = null;
                    try {
                        try {
                            fileRolledOver = mergeJournals(journalsToMerge, getMergeFile(journalsToMerge, storageDir), eventReporter);
                        } catch (final IOException ioe) {
                            // fileRolledOver stays null, which is treated below as a failed attempt.
                            logger.error("Failed to merge Journal Files {} into a Provenance Log File due to {}", journalsToMerge, ioe.toString());
                            logger.error("", ioe);
                        }
                        if (fileRolledOver != null) {
                            final File file = fileRolledOver;
                            // update our map of id to Path
                            // We need to make sure that another thread doesn't also update the map at the same time. We cannot
                            // use the write lock when purging old events, and we want to use the same approach here.
                            // The map is therefore copied, extended and swapped in with compareAndSet, retrying until the swap succeeds.
                            boolean updated = false;
                            // The key is the part of the merged file's name before the first '.', parsed as a Long.
                            final Long fileFirstEventId = Long.valueOf(LuceneUtil.substringBefore(fileRolledOver.getName(), "."));
                            while (!updated) {
                                final SortedMap<Long, Path> existingPathMap = idToPathMap.get();
                                final SortedMap<Long, Path> newIdToPathMap = new TreeMap<>(new PathMapComparator());
                                newIdToPathMap.putAll(existingPathMap);
                                newIdToPathMap.put(fileFirstEventId, file.toPath());
                                updated = idToPathMap.compareAndSet(existingPathMap, newIdToPathMap);
                            }
                            final TimedCountSize countSize = updateCounts.getAggregateValue(System.currentTimeMillis() - TimeUnit.MILLISECONDS.convert(5, TimeUnit.MINUTES));
                            logger.info("Successfully Rolled over Provenance Event file containing {} records. In the past 5 minutes, " + "{} events have been written to the Provenance Repository, totaling {}", recordsWritten, countSize.getCount(), FormatUtils.formatDataSize(countSize.getSize()));
                        }
                        // if files were rolled over or if out of retries stop the future
                        // (retryAttempts is only decremented when this attempt produced no merged file)
                        if (fileRolledOver != null || retryAttempts.decrementAndGet() == 0) {
                            if (fileRolledOver == null && retryAttempts.get() == 0) {
                                logger.error("Failed to merge Journal Files {} after {} attempts.", journalsToMerge, MAX_JOURNAL_ROLLOVER_RETRIES);
                            }
                            rolloverCompletions.getAndIncrement();
                            // Cancel the future so that we don't run anymore
                            // The first run may start before the scheduling thread has published the Future, so poll until it is set.
                            Future<?> future;
                            while ((future = futureReference.get()) == null) {
                                try {
                                    Thread.sleep(10L);
                                } catch (final InterruptedException ie) {
                                    // interruption is ignored; keep polling for the Future
                                }
                            }
                            future.cancel(false);
                        } else {
                            logger.warn("Couldn't merge journals. Will try again. journalsToMerge: {}, storageDir: {}", journalsToMerge, storageDir);
                        }
                    } catch (final Exception e) {
                        // NOTE(review): this path does not decrement retryAttempts, so failures that land here
                        // are retried on every scheduled run without a limit - confirm that this is intended.
                        logger.error("Failed to merge journals. Will try again. journalsToMerge: {}, storageDir: {}, cause: {}", journalsToMerge, storageDir, e.toString());
                        logger.error("", e);
                    }
                }
            };
            // We are going to schedule the future to run immediately and then repeat with a fixed delay of
            // getRolloverRetryMillis() milliseconds. This allows us to keep retrying if we
            // fail for some reason. When we succeed or if retries are exceeded, the Runnable will cancel itself.
            future = rolloverExecutor.scheduleWithFixedDelay(rolloverRunnable, 0, getRolloverRetryMillis(), TimeUnit.MILLISECONDS);
            futureReference.set(future);
        }
        streamStartTime.set(System.currentTimeMillis());
        bytesWrittenSinceRollover.set(0);
        // We don't want to create new 'writers' until the number of unmerged journals falls below our threshold. So we wait
        // here before we repopulate the 'writers' member variable and release the lock.
        int journalFileCount = getJournalCount();
        long repoSize = getSize(getLogFiles(), 0L);
        final int journalCountThreshold = configuration.getJournalCount() * 5;
        final long sizeThreshold = (long) (configuration.getMaxStorageCapacity() * ROLLOVER_HIGH_WATER);
        // Check if we need to apply backpressure. If there are more than 5 times the configured number of journal
        // files, or the repository is larger than its high-water mark, we block here until
        // that is no longer the case.
        if (journalFileCount > journalCountThreshold || repoSize > sizeThreshold) {
            final long stopTheWorldStart = System.nanoTime();
            logger.warn("The rate of the dataflow is exceeding the provenance recording rate. " + "Slowing down flow to accommodate. Currently, there are {} journal files ({} bytes) and " + "threshold for blocking is {} ({} bytes)", journalFileCount, repoSize, journalCountThreshold, sizeThreshold);
            eventReporter.reportEvent(Severity.WARNING, "Provenance Repository", "The rate of the dataflow is " + "exceeding the provenance recording rate. Slowing down flow to accommodate");
            while (journalFileCount > journalCountThreshold || repoSize > sizeThreshold) {
                // if a shutdown happens while we are in this loop, kill the rollover thread and break
                if (this.closed.get()) {
                    if (future != null) {
                        future.cancel(true);
                    }
                    break;
                }
                if (repoSize > sizeThreshold) {
                    // Too much data on disk: purge, re-measure, and re-evaluate the loop condition immediately.
                    logger.debug("Provenance Repository has exceeded its size threshold; will trigger purging of oldest events");
                    purgeOldEvents();
                    journalFileCount = getJournalCount();
                    repoSize = getSize(getLogFiles(), 0L);
                    continue;
                } else {
                    // We are constrained by the number of journal files rather than by the size of the repository,
                    // so just sleep briefly; the journals are presumably already being merged
                    // due to the runnable that we scheduled above
                    try {
                        Thread.sleep(100L);
                    } catch (final InterruptedException ie) {
                        // interruption is ignored; the loop condition is simply re-evaluated
                    }
                }
                logger.debug("Provenance Repository is still behind. Keeping flow slowed down " + "to accommodate. Currently, there are {} journal files ({} bytes) and " + "threshold for blocking is {} ({} bytes)", journalFileCount, repoSize, journalCountThreshold, sizeThreshold);
                journalFileCount = getJournalCount();
                repoSize = getSize(getLogFiles(), 0L);
            }
            final long stopTheWorldNanos = System.nanoTime() - stopTheWorldStart;
            // NOTE: despite the name 'backpressurePauseMillis', the value recorded here is in nanoseconds;
            // the aggregate read back below is formatted with formatNanos accordingly.
            backpressurePauseMillis.add(new TimestampedLong(stopTheWorldNanos));
            final TimestampedLong pauseNanosLastFiveMinutes = backpressurePauseMillis.getAggregateValue(System.currentTimeMillis() - TimeUnit.MILLISECONDS.convert(5, TimeUnit.MINUTES));
            logger.info("Provenance Repository has now caught up with rolling over journal files. Current number of " + "journal files to be rolled over is {}. Provenance Repository Back Pressure paused Session commits for {} ({} total in the last 5 minutes).", journalFileCount, FormatUtils.formatNanos(stopTheWorldNanos, true), FormatUtils.formatNanos(pauseNanosLastFiveMinutes.getValue(), true));
        }
        // we've finished rolling over successfully. Create new writers and reset state.
        writers = createWriters(configuration, idGenerator.get());
        dirtyWriterCount.set(0);
        streamStartTime.set(System.currentTimeMillis());
        recordsWrittenSinceRollover.getAndSet(0);
    }
}
Also used : Path(java.nio.file.Path) ArrayList(java.util.ArrayList) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) TreeMap(java.util.TreeMap) TimedCountSize(org.apache.nifi.util.timebuffer.TimedCountSize) IndexNotFoundException(org.apache.lucene.index.IndexNotFoundException) ResourceNotFoundException(org.apache.nifi.web.ResourceNotFoundException) AccessDeniedException(org.apache.nifi.authorization.AccessDeniedException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) EOFException(java.io.EOFException) FileNotFoundException(java.io.FileNotFoundException) TimestampedLong(org.apache.nifi.util.timebuffer.TimestampedLong) RecordWriter(org.apache.nifi.provenance.serialization.RecordWriter) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) AtomicLong(java.util.concurrent.atomic.AtomicLong) TimestampedLong(org.apache.nifi.util.timebuffer.TimestampedLong) Future(java.util.concurrent.Future) File(java.io.File)

Example 13 with RecordWriter

use of org.apache.nifi.provenance.serialization.RecordWriter in project nifi by apache.

the class PersistentProvenanceRepository method persistRecord.

/**
 * Writes the given events to one of the journal writers and triggers a rollover when necessary.
 *
 * <p>
 * The events are written as one batch to a single writer while holding the repository read lock
 * and that writer's own lock. If anything goes wrong while writing, the writer is marked dirty so
 * that no other thread appends to a partially written journal. An {@link IOException} additionally
 * causes a forced rollover; any other failure is propagated to the caller. After a successful write,
 * a size-based rollover is performed if the bytes written since the last rollover have reached
 * the configured maximum event file capacity.
 * </p>
 *
 * @param records the events to persist; written in iteration order to a single journal
 * @throws IllegalStateException if every writer is currently marked dirty
 */
private void persistRecord(final Iterable<ProvenanceEventRecord> records) {
    final long totalJournalSize;
    readLock.lock();
    try {
        long bytesWritten = 0L;
        // obtain a lock on one of the RecordWriter's so that no other thread is able to write to this writer until we're finished.
        // Although the writer itself is thread-safe, we need to generate an event id and then write the event
        // atomically, so we need to do this with a lock.
        boolean locked = false;
        RecordWriter writer;
        do {
            final RecordWriter[] recordWriters = this.writers;
            final int numDirty = dirtyWriterCount.get();
            if (numDirty >= recordWriters.length) {
                throw new IllegalStateException("Cannot update repository because all partitions are unusable at this time. Writing to the repository would cause corruption. " + "This most often happens as a result of the repository running out of disk space or the JVM running out of memory.");
            }
            // Rotate through the writers until one of them can be locked.
            final long idx = writerIndex.getAndIncrement();
            writer = recordWriters[(int) (idx % recordWriters.length)];
            locked = writer.tryLock();
        } while (!locked);
        try {
            try {
                int recordsWritten = 0;
                for (final ProvenanceEventRecord nextRecord : records) {
                    final StorageSummary persistedEvent = writer.writeRecord(nextRecord);
                    bytesWritten += persistedEvent.getSerializedLength();
                    recordsWritten++;
                    logger.trace("Wrote record with ID {} to {}", persistedEvent.getEventId(), writer);
                }
                writer.flush();
                if (alwaysSync) {
                    writer.sync();
                }
                totalJournalSize = bytesWrittenSinceRollover.addAndGet(bytesWritten);
                // Count every record in the batch, not just the batch itself: this counter is what
                // rollover reports as the number of records contained in the rolled-over file.
                recordsWrittenSinceRollover.addAndGet(recordsWritten);
                this.updateCounts.add(new TimedCountSize(recordsWritten, bytesWritten));
            } catch (final Throwable t) {
                // We need to set the repoDirty flag before we release the lock for this journal.
                // Otherwise, another thread may write to this journal -- this is a problem because
                // the journal contains part of our record but not all of it. Writing to the end of this
                // journal will result in corruption!
                writer.markDirty();
                dirtyWriterCount.incrementAndGet();
                // force rollover to happen soon.
                streamStartTime.set(0L);
                throw t;
            } finally {
                writer.unlock();
            }
        } catch (final IOException ioe) {
            // warn about the failure
            logger.error("Failed to persist Provenance Event due to {}.", ioe.toString());
            logger.error("", ioe);
            eventReporter.reportEvent(Severity.ERROR, EVENT_CATEGORY, "Failed to persist Provenance Event due to " + ioe.toString());
            // Attempt to perform a rollover. An IOException in this part of the code generally is the result of
            // running out of disk space. If we have multiple partitions, we may well be able to rollover. This helps
            // in two ways: it compresses the journal files which frees up space, and if it ends up merging to a different
            // partition/storage directory, we can delete the journals from this directory that ran out of space.
            // In order to do this, though, we must switch from a read lock to a write lock.
            // This part of the code gets a little bit messy, and we could potentially refactor it a bit in order to
            // make the code cleaner.
            readLock.unlock();
            try {
                writeLock.lock();
                try {
                    logger.debug("Obtained write lock to rollover due to IOException on write");
                    rollover(true);
                } finally {
                    writeLock.unlock();
                }
            } catch (final Exception e) {
                logger.error("Failed to Rollover Provenance Event Repository file due to {}", e.toString());
                logger.error("", e);
                eventReporter.reportEvent(Severity.ERROR, EVENT_CATEGORY, "Failed to Rollover Provenance Event Log due to " + e.toString());
            } finally {
                // we must re-lock the readLock, as the finally block below is going to unlock it.
                readLock.lock();
            }
            return;
        }
    } finally {
        readLock.unlock();
    }
    // If the total number of bytes written to the Journals is >= configured max, we need to roll over
    if (totalJournalSize >= configuration.getMaxEventFileCapacity()) {
        writeLock.lock();
        try {
            logger.debug("Obtained write lock to perform rollover based on file size");
            // Now that we hold the write lock, check the size again before rolling over, because
            // another thread may have just done it.
            if (bytesWrittenSinceRollover.get() >= configuration.getMaxEventFileCapacity()) {
                try {
                    rollover(false);
                } catch (final IOException e) {
                    logger.error("Failed to Rollover Provenance Event Repository file due to {}", e.toString());
                    logger.error("", e);
                    eventReporter.reportEvent(Severity.ERROR, EVENT_CATEGORY, "Failed to Rollover Provenance Event Log due to " + e.toString());
                }
            }
        } finally {
            writeLock.unlock();
        }
    }
}
Also used : RecordWriter(org.apache.nifi.provenance.serialization.RecordWriter) StorageSummary(org.apache.nifi.provenance.serialization.StorageSummary) IOException(java.io.IOException) TimedCountSize(org.apache.nifi.util.timebuffer.TimedCountSize) IndexNotFoundException(org.apache.lucene.index.IndexNotFoundException) ResourceNotFoundException(org.apache.nifi.web.ResourceNotFoundException) AccessDeniedException(org.apache.nifi.authorization.AccessDeniedException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) EOFException(java.io.EOFException) FileNotFoundException(java.io.FileNotFoundException)

Example 14 with RecordWriter

use of org.apache.nifi.provenance.serialization.RecordWriter in project nifi by apache.

the class PersistentProvenanceRepository method createWriters.

/**
 * Creates one journal writer per configured journal and writes the header to each of them.
 *
 * <p>
 * Journal <code>i</code> is placed in the <code>journals</code> sub-directory of storage directory
 * <code>i % numberOfStorageDirectories</code> and is named
 * <code>&lt;initialRecordId&gt;.journal.&lt;i&gt;</code>. If creating any writer or writing its header
 * fails, the writers that were already opened are closed before the failure is propagated, so that
 * no writer is left open. Failures while closing are attached to the propagated exception as
 * suppressed exceptions.
 * </p>
 *
 * <p>
 * This method is protected in order to override for unit tests.
 * </p>
 *
 * @param config the repository configuration supplying the storage directories and journal count
 * @param initialRecordId the event ID used in the journal file names and written as each journal's header
 * @return the newly created writers, one per journal
 * @throws IOException if a writer cannot be created or its header cannot be written
 */
protected RecordWriter[] createWriters(final RepositoryConfiguration config, final long initialRecordId) throws IOException {
    final List<File> storageDirectories = new ArrayList<>(config.getStorageDirectories().values());
    final int journalCount = config.getJournalCount();
    final RecordWriter[] writers = new RecordWriter[journalCount];
    try {
        for (int i = 0; i < journalCount; i++) {
            final File storageDirectory = storageDirectories.get(i % storageDirectories.size());
            final File journalDirectory = new File(storageDirectory, "journals");
            final File journalFile = new File(journalDirectory, String.valueOf(initialRecordId) + ".journal." + i);
            writers[i] = RecordWriters.newSchemaRecordWriter(journalFile, idGenerator, false, false);
            writers[i].writeHeader(initialRecordId);
        }
    } catch (final IOException | RuntimeException e) {
        // Do not leave the writers created so far open; array slots that were never filled are still null.
        for (final RecordWriter writer : writers) {
            if (writer == null) {
                continue;
            }
            try {
                writer.close();
            } catch (final IOException | RuntimeException closeFailure) {
                e.addSuppressed(closeFailure);
            }
        }
        throw e;
    }
    logger.info("Created new Provenance Event Writers for events starting with ID {}", initialRecordId);
    return writers;
}
Also used : RecordWriter(org.apache.nifi.provenance.serialization.RecordWriter) ArrayList(java.util.ArrayList) File(java.io.File)

Example 15 with RecordWriter

use of org.apache.nifi.provenance.serialization.RecordWriter in project nifi by apache.

the class PersistentProvenanceRepository method mergeJournals.

/**
 * <p>
 * Merges all of the given Journal Files into a single, merged Provenance
 * Event Log File. As these records are merged, they will be compressed, if
 * the repository is configured to compress records, and will be indexed.
 * </p>
 *
 * <p>
 * If the repository is configured to compress the data, the file written to
 * may not be the same as the <code>suggestedMergeFile</code>, as a filename
 * extension of '.gz' may be appended. If the journals are successfully
 * merged, the file that they were merged into will be returned. If unable
 * to merge the records (for instance, because the repository has been
 * closed or because the list of journal files was empty), this method will
 * return <code>null</code>.
 * </p>
 *
 * @param journalFiles the journal files to merge
 * @param suggestedMergeFile the file to write the merged records to
 * @param eventReporter the event reporter to report any warnings or errors
 * to; may be null.
 *
 * @return the file that the given journals were merged into, or
 * <code>null</code> if no records were merged.
 *
 * @throws IOException if a problem occurs writing to the mergedFile,
 * reading from a journal, or updating the Lucene Index.
 */
File mergeJournals(final List<File> journalFiles, final File suggestedMergeFile, final EventReporter eventReporter) throws IOException {
    logger.debug("Merging {} to {}", journalFiles, suggestedMergeFile);
    if (this.closed.get()) {
        logger.info("Provenance Repository has been closed; will not merge journal files to {}", suggestedMergeFile);
        return null;
    }
    if (journalFiles.isEmpty()) {
        logger.debug("Couldn't merge journals: Journal Files is empty; won't merge journals");
        return null;
    }
    Collections.sort(journalFiles, new Comparator<File>() {

        @Override
        public int compare(final File o1, final File o2) {
            final String suffix1 = LuceneUtil.substringAfterLast(o1.getName(), ".");
            final String suffix2 = LuceneUtil.substringAfterLast(o2.getName(), ".");
            try {
                final int journalIndex1 = Integer.parseInt(suffix1);
                final int journalIndex2 = Integer.parseInt(suffix2);
                return Integer.compare(journalIndex1, journalIndex2);
            } catch (final NumberFormatException nfe) {
                return o1.getName().compareTo(o2.getName());
            }
        }
    });
    // Search for any missing files. At this point they should have been written to disk otherwise cannot continue.
    // Missing files is most likely due to incomplete cleanup of files post merge
    final List<File> availableFiles = filterUnavailableFiles(journalFiles);
    final int numAvailableFiles = availableFiles.size();
    // check if we have all of the "partial" files for the journal.
    if (numAvailableFiles > 0) {
        if (suggestedMergeFile.exists()) {
            // we have all "partial" files and there is already a merged file. Delete the data from the index
            // because the merge file may not be fully merged. We will re-merge.
            logger.warn("Merged Journal File {} already exists; however, all partial journal files also exist " + "so assuming that the merge did not finish. Repeating procedure in order to ensure consistency.");
            final DeleteIndexAction deleteAction = new DeleteIndexAction(this, indexConfig, getIndexManager());
            try {
                deleteAction.execute(suggestedMergeFile);
            } catch (final Exception e) {
                logger.warn("Failed to delete records from Journal File {} from the index; this could potentially result in duplicates. Failure was due to {}", suggestedMergeFile, e.toString());
                if (logger.isDebugEnabled()) {
                    logger.warn("", e);
                }
            }
            // file and the TOC file. Otherwise, we could get the wrong copy and have issues retrieving events.
            if (!suggestedMergeFile.delete()) {
                logger.error("Failed to delete partially written Provenance Journal File {}. This may result in events from this journal " + "file not being able to be displayed. This file should be deleted manually.", suggestedMergeFile);
            }
            final File tocFile = TocUtil.getTocFile(suggestedMergeFile);
            if (tocFile.exists() && !tocFile.delete()) {
                logger.error("Failed to delete .toc file {}; this may result in not being able to read the Provenance Events from the {} Journal File. " + "This can be corrected by manually deleting the {} file", tocFile, suggestedMergeFile, tocFile);
            }
        }
    } else {
        logger.warn("Cannot merge journal files {} because they do not exist on disk", journalFiles);
        return null;
    }
    final long startNanos = System.nanoTime();
    // Map each journal to a RecordReader
    final List<RecordReader> readers = new ArrayList<>();
    int records = 0;
    final boolean isCompress = configuration.isCompressOnRollover();
    final File writerFile = isCompress ? new File(suggestedMergeFile.getParentFile(), suggestedMergeFile.getName() + ".gz") : suggestedMergeFile;
    try {
        for (final File journalFile : availableFiles) {
            try {
                // Use MAX_VALUE for number of chars because we don't want to truncate the value as we write it
                // out. This allows us to later decide that we want more characters and still be able to retrieve
                // the entire event.
                readers.add(RecordReaders.newRecordReader(journalFile, null, Integer.MAX_VALUE));
            } catch (final EOFException eof) {
            // there's nothing here. Skip over it.
            } catch (final IOException ioe) {
                logger.warn("Unable to merge {} with other Journal Files due to {}", journalFile, ioe.toString());
                if (logger.isDebugEnabled()) {
                    logger.warn("", ioe);
                }
                if (eventReporter != null) {
                    eventReporter.reportEvent(Severity.ERROR, EVENT_CATEGORY, "Failed to merge Journal Files due to " + ioe.toString());
                }
            }
        }
        // Create a Map so that the key is the next record available from a reader and the value is the Reader from which
        // the record came. This sorted map is then used so that we are able to always get the first entry, which is the next
        // lowest record id
        final SortedMap<StandardProvenanceEventRecord, RecordReader> recordToReaderMap = new TreeMap<>(new Comparator<StandardProvenanceEventRecord>() {

            @Override
            public int compare(final StandardProvenanceEventRecord o1, final StandardProvenanceEventRecord o2) {
                return Long.compare(o1.getEventId(), o2.getEventId());
            }
        });
        long minEventId = 0L;
        long earliestTimestamp = System.currentTimeMillis();
        for (final RecordReader reader : readers) {
            StandardProvenanceEventRecord record = null;
            try {
                record = reader.nextRecord();
            } catch (final EOFException eof) {
            // record will be null and reader can no longer be used
            } catch (final Exception e) {
                logger.warn("Failed to generate Provenance Event Record from Journal due to " + e + "; it's " + "possible that the record wasn't completely written to the file. This journal will be " + "skipped.");
                if (logger.isDebugEnabled()) {
                    logger.warn("", e);
                }
                if (eventReporter != null) {
                    eventReporter.reportEvent(Severity.WARNING, EVENT_CATEGORY, "Failed to read Provenance Event " + "Record from Journal due to " + e + "; it's possible that the record wasn't " + "completely written to the file. This journal will be skipped.");
                }
            }
            if (record == null) {
                continue;
            }
            if (record.getEventTime() < earliestTimestamp) {
                earliestTimestamp = record.getEventTime();
            }
            if (record.getEventId() < minEventId) {
                minEventId = record.getEventId();
            }
            recordToReaderMap.put(record, reader);
        }
        // We want to keep track of the last 1000 events in the files so that we can add them to 'ringBuffer'.
        // However, we don't want to add them directly to ringBuffer, because once they are added to ringBuffer, they are
        // available in query results. As a result, we can have the issue where we've not finished indexing the file
        // but we try to create the lineage for events in that file. In order to avoid this, we will add the records
        // to a temporary RingBuffer and after we finish merging the records will then copy the data to the
        // ringBuffer provided as a method argument.
        final RingBuffer<ProvenanceEventRecord> latestRecords = new RingBuffer<>(1000);
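        // Loop over each entry in the map, writing the records to the merged file in event-id order and refilling the map
        // with the next entry from the journal file from which the previous record was written.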
        try (final RecordWriter writer = RecordWriters.newSchemaRecordWriter(writerFile, idGenerator, configuration.isCompressOnRollover(), true)) {
            writer.writeHeader(minEventId);
            final IndexingAction indexingAction = createIndexingAction();
            final File indexingDirectory = indexConfig.getWritableIndexDirectory(writerFile, earliestTimestamp);
            long maxId = 0L;
            final BlockingQueue<Tuple<StandardProvenanceEventRecord, Integer>> eventQueue = new LinkedBlockingQueue<>(100);
            final AtomicBoolean finishedAdding = new AtomicBoolean(false);
            final List<Future<?>> futures = new ArrayList<>();
            final EventIndexWriter indexWriter = getIndexManager().borrowIndexWriter(indexingDirectory);
            try {
                final ExecutorService exec = Executors.newFixedThreadPool(configuration.getIndexThreadPoolSize(), new ThreadFactory() {

                    @Override
                    public Thread newThread(final Runnable r) {
                        final Thread t = Executors.defaultThreadFactory().newThread(r);
                        t.setName("Index Provenance Events");
                        return t;
                    }
                });
                final AtomicInteger indexingFailureCount = new AtomicInteger(0);
                try {
                    for (int i = 0; i < configuration.getIndexThreadPoolSize(); i++) {
                        final Callable<Object> callable = new Callable<Object>() {

                            @Override
                            public Object call() throws IOException {
                                // Indexing worker: keeps pulling (event, block index) pairs off eventQueue and indexing them
                                // until the queue is empty AND the producer has flagged that nothing more will be added.
                                // Always returns null; the Future is only used to wait for completion.
                                while (!(eventQueue.isEmpty() && finishedAdding.get())) {
                                    try {
                                        final Tuple<StandardProvenanceEventRecord, Integer> next;
                                        try {
                                            // Short timeout so the loop condition is re-evaluated regularly.
                                            next = eventQueue.poll(10, TimeUnit.MILLISECONDS);
                                        } catch (final InterruptedException interrupted) {
                                            // Restore the interrupt flag and re-check the loop condition.
                                            // NOTE(review): with the flag still set, the next poll will probably throw again
                                            // straight away, so an interrupted worker may spin here - confirm this is acceptable.
                                            Thread.currentThread().interrupt();
                                            continue;
                                        }
                                        if (next != null) {
                                            final StandardProvenanceEventRecord event = next.getKey();
                                            indexingAction.index(event, indexWriter.getIndexWriter(), next.getValue());
                                        }
                                    } catch (final Throwable failure) {
                                        logger.error("Failed to index Provenance Event for " + writerFile + " to " + indexingDirectory, failure);
                                        // The failure counter is shared by all workers; stop this worker once the
                                        // combined count reaches the threshold.
                                        final int failuresSoFar = indexingFailureCount.incrementAndGet();
                                        if (failuresSoFar >= MAX_INDEXING_FAILURE_COUNT) {
                                            return null;
                                        }
                                    }
                                }
                                return null;
                            }
                        };
                        final Future<?> future = exec.submit(callable);
                        futures.add(future);
                    }
                    boolean indexEvents = true;
                    while (!recordToReaderMap.isEmpty()) {
                        final StandardProvenanceEventRecord record = recordToReaderMap.firstKey();
                        final RecordReader reader = recordToReaderMap.get(record);
                        writer.writeRecord(record);
                        final int blockIndex = writer.getTocWriter().getCurrentBlockIndex();
                        boolean accepted = false;
                        while (!accepted && indexEvents) {
                            try {
                                accepted = eventQueue.offer(new Tuple<>(record, blockIndex), 10, TimeUnit.MILLISECONDS);
                            } catch (final InterruptedException ie) {
                                Thread.currentThread().interrupt();
                            }
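                            // If the offer timed out, check whether indexing has reached its maximum failure count. An indexing
                            // thread stops as soon as a failure brings the count to that threshold, so the queue may fill up
                            // and stop accepting events; in that case we give up on indexing rather than retrying forever.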
                            if (!accepted && indexingFailureCount.get() >= MAX_INDEXING_FAILURE_COUNT) {
                                // don't add anything else to the queue.
                                indexEvents = false;
                                eventQueue.clear();
                                final String warning = String.format("Indexing Provenance Events for %s has failed %s times. This exceeds the maximum threshold of %s failures, " + "so no more Provenance Events will be indexed for this Provenance file.", writerFile, indexingFailureCount.get(), MAX_INDEXING_FAILURE_COUNT);
                                logger.warn(warning);
                                if (eventReporter != null) {
                                    eventReporter.reportEvent(Severity.WARNING, EVENT_CATEGORY, warning);
                                }
                            }
                        }
                        maxId = record.getEventId();
                        latestRecords.add(truncateAttributes(record));
                        records++;
                        // Remove this entry from the map
                        recordToReaderMap.remove(record);
                        // Get the next entry from this reader and add it to the map
                        StandardProvenanceEventRecord nextRecord = null;
                        try {
                            nextRecord = reader.nextRecord();
                        } catch (final EOFException eof) {
                        // record will be null and reader can no longer be used
                        } catch (final Exception e) {
                            logger.warn("Failed to generate Provenance Event Record from Journal due to " + e + "; it's possible that the record wasn't completely written to the file. " + "The remainder of this journal will be skipped.");
                            if (logger.isDebugEnabled()) {
                                logger.warn("", e);
                            }
                            if (eventReporter != null) {
                                eventReporter.reportEvent(Severity.WARNING, EVENT_CATEGORY, "Failed to read " + "Provenance Event Record from Journal due to " + e + "; it's possible " + "that the record wasn't completely written to the file. The remainder " + "of this journal will be skipped.");
                            }
                        }
                        if (nextRecord != null) {
                            recordToReaderMap.put(nextRecord, reader);
                        }
                    }
                } finally {
                    finishedAdding.set(true);
                    exec.shutdown();
                }
                for (final Future<?> future : futures) {
                    try {
                        future.get();
                    } catch (final ExecutionException ee) {
                        final Throwable t = ee.getCause();
                        if (t instanceof RuntimeException) {
                            throw (RuntimeException) t;
                        }
                        throw new RuntimeException(t);
                    } catch (final InterruptedException e) {
                        Thread.currentThread().interrupt();
                        throw new RuntimeException("Thread interrupted");
                    }
                }
            } finally {
                getIndexManager().returnIndexWriter(indexWriter);
            }
            indexConfig.setMaxIdIndexed(maxId);
        }
        // record should now be available in the repository. We can copy the values from latestRecords to ringBuffer.
        final RingBuffer<ProvenanceEventRecord> latestRecordBuffer = this.latestRecords;
        latestRecords.forEach(new ForEachEvaluator<ProvenanceEventRecord>() {

            @Override
            public boolean evaluate(final ProvenanceEventRecord event) {
                // Copy each event from the temporary buffer into the buffer held in this.latestRecords.
                // NOTE(review): always answers true, presumably so that forEach keeps visiting every
                // buffered event - confirm against ForEachEvaluator's contract.
                final boolean result = true;
                latestRecordBuffer.add(event);
                return result;
            }
        });
    } finally {
        for (final RecordReader reader : readers) {
            try {
                reader.close();
            } catch (final IOException ioe) {
            }
        }
    }
    // Success. Remove all of the journal files, as they're no longer needed, now that they've been merged.
    for (final File journalFile : availableFiles) {
        if (!journalFile.delete() && journalFile.exists()) {
            logger.warn("Failed to remove temporary journal file {}; this file should be cleaned up manually", journalFile.getAbsolutePath());
            if (eventReporter != null) {
                eventReporter.reportEvent(Severity.WARNING, EVENT_CATEGORY, "Failed to remove temporary journal file " + journalFile.getAbsolutePath() + "; this file should be cleaned up manually");
            }
        }
        final File tocFile = TocUtil.getTocFile(journalFile);
        if (!tocFile.delete() && tocFile.exists()) {
            logger.warn("Failed to remove temporary journal TOC file {}; this file should be cleaned up manually", tocFile.getAbsolutePath());
            if (eventReporter != null) {
                eventReporter.reportEvent(Severity.WARNING, EVENT_CATEGORY, "Failed to remove temporary journal TOC file " + tocFile.getAbsolutePath() + "; this file should be cleaned up manually");
            }
        }
    }
    if (records == 0) {
        writerFile.delete();
        logger.debug("Couldn't merge journals: No Records to merge");
        return null;
    } else {
        final long nanos = System.nanoTime() - startNanos;
        final long millis = TimeUnit.MILLISECONDS.convert(nanos, TimeUnit.NANOSECONDS);
        logger.info("Successfully merged {} journal files ({} records) into single Provenance Log File {} in {} milliseconds", numAvailableFiles, records, suggestedMergeFile, millis);
    }
    return writerFile;
}
Also used : NamedThreadFactory(org.apache.nifi.provenance.util.NamedThreadFactory) ThreadFactory(java.util.concurrent.ThreadFactory) RecordReader(org.apache.nifi.provenance.serialization.RecordReader) ArrayList(java.util.ArrayList) RingBuffer(org.apache.nifi.util.RingBuffer) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Callable(java.util.concurrent.Callable) RecordWriter(org.apache.nifi.provenance.serialization.RecordWriter) DeleteIndexAction(org.apache.nifi.provenance.lucene.DeleteIndexAction) EOFException(java.io.EOFException) ExecutionException(java.util.concurrent.ExecutionException) IndexingAction(org.apache.nifi.provenance.lucene.IndexingAction) IOException(java.io.IOException) TreeMap(java.util.TreeMap) IndexNotFoundException(org.apache.lucene.index.IndexNotFoundException) ResourceNotFoundException(org.apache.nifi.web.ResourceNotFoundException) AccessDeniedException(org.apache.nifi.authorization.AccessDeniedException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) EOFException(java.io.EOFException) FileNotFoundException(java.io.FileNotFoundException) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) EventIndexWriter(org.apache.nifi.provenance.index.EventIndexWriter) File(java.io.File) Tuple(org.apache.nifi.util.Tuple)

Aggregations

RecordWriter (org.apache.nifi.provenance.serialization.RecordWriter)35 File (java.io.File)27 Test (org.junit.Test)21 StandardTocWriter (org.apache.nifi.provenance.toc.StandardTocWriter)15 RecordReader (org.apache.nifi.provenance.serialization.RecordReader)14 StandardTocReader (org.apache.nifi.provenance.toc.StandardTocReader)14 TocWriter (org.apache.nifi.provenance.toc.TocWriter)14 HashMap (java.util.HashMap)13 FileInputStream (java.io.FileInputStream)12 ArrayList (java.util.ArrayList)12 TocReader (org.apache.nifi.provenance.toc.TocReader)12 IOException (java.io.IOException)9 Future (java.util.concurrent.Future)8 EOFException (java.io.EOFException)7 ExecutionException (java.util.concurrent.ExecutionException)7 FileNotFoundException (java.io.FileNotFoundException)6 ExecutorService (java.util.concurrent.ExecutorService)6 FlowFile (org.apache.nifi.flowfile.FlowFile)6 TestUtil.createFlowFile (org.apache.nifi.provenance.TestUtil.createFlowFile)6 ResourceNotFoundException (org.apache.nifi.web.ResourceNotFoundException)6