Search in sources :

Example 6 with NullOutputStream

use of org.apache.nifi.stream.io.NullOutputStream in project nifi by apache.

the class TestSimpleSwapSerializerDeserializer method testWritePerformance.

/**
 * Manual micro-benchmark that repeatedly serializes a fixed batch of mock FlowFiles
 * to a {@link NullOutputStream} and prints the total elapsed wall-clock time.
 * The output is discarded, so no swap file is written to disk.
 *
 * @throws IOException if the serializer fails while writing
 * @throws InterruptedException declared by the test signature
 */
@Test
@Ignore("For manual testing only. Not intended to be run as part of the automated unit tests but can " + "be convenient for determining a baseline for performance if making modifications.")
public void testWritePerformance() throws IOException, InterruptedException {
    final int flowFileCount = 10000;
    final int iterations = 1000;
    final String swapLocation = "target/testRoundTrip.swap";

    // Build the batch of FlowFiles to serialize; the same attribute map instance is
    // handed to every MockFlowFile after its single "i" entry has been updated.
    final ResourceClaimManager claimManager = new StandardResourceClaimManager();
    final Map<String, String> attributes = new HashMap<>();
    final List<FlowFileRecord> records = new ArrayList<>(flowFileCount);
    int id = 0;
    while (id < flowFileCount) {
        attributes.put("i", String.valueOf(id));
        records.add(new MockFlowFile(attributes, id, claimManager));
        id++;
    }

    final FlowFileQueue queue = Mockito.mock(FlowFileQueue.class);
    Mockito.when(queue.getIdentifier()).thenReturn("87bb99fe-412c-49f6-a441-d1b0af4e20b4");

    // The clock starts before the serializer is constructed, so construction is part of the measurement.
    final long start = System.nanoTime();
    final SwapSerializer serializer = new SimpleSwapSerializer();
    int round = 0;
    while (round < iterations) {
        try (final OutputStream sink = new NullOutputStream()) {
            serializer.serializeFlowFiles(records, queue, swapLocation, sink);
        }
        round++;
    }
    final long elapsedNanos = System.nanoTime() - start;
    final long millis = TimeUnit.NANOSECONDS.toMillis(elapsedNanos);
    System.out.println("Wrote " + iterations + " Swap Files in " + millis + " millis");
}
Also used : HashMap(java.util.HashMap) OutputStream(java.io.OutputStream) NullOutputStream(org.apache.nifi.stream.io.NullOutputStream) FileOutputStream(java.io.FileOutputStream) ArrayList(java.util.ArrayList) StandardResourceClaimManager(org.apache.nifi.controller.repository.claim.StandardResourceClaimManager) ResourceClaimManager(org.apache.nifi.controller.repository.claim.ResourceClaimManager) FlowFileQueue(org.apache.nifi.controller.queue.FlowFileQueue) StandardResourceClaimManager(org.apache.nifi.controller.repository.claim.StandardResourceClaimManager) FlowFileRecord(org.apache.nifi.controller.repository.FlowFileRecord) NullOutputStream(org.apache.nifi.stream.io.NullOutputStream) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 7 with NullOutputStream

use of org.apache.nifi.stream.io.NullOutputStream in project nifi by apache.

the class HashContent method onTrigger.

/**
 * Hashes the content of the next FlowFile with the algorithm configured in
 * {@code HASH_ALGORITHM} and stores the lowercase hex digest in the attribute named by
 * {@code ATTRIBUTE_NAME}. The FlowFile is routed to {@code REL_SUCCESS} on success and to
 * {@code REL_FAILURE} if the algorithm is unknown or a {@link ProcessException} is raised
 * while reading the content or updating the FlowFile.
 *
 * @param context provides the configured property values
 * @param session the session used to obtain, read, update and transfer the FlowFile
 * @throws ProcessException declared by the overridden method
 */
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }

    final ComponentLog logger = getLogger();
    final MessageDigest digest;
    try {
        digest = MessageDigest.getInstance(context.getProperty(HASH_ALGORITHM).getValue());
    } catch (NoSuchAlgorithmException e) {
        logger.error("Failed to process {} due to {}; routing to failure", new Object[] { flowFile, e });
        session.transfer(flowFile, REL_FAILURE);
        return;
    }

    // The callback is an anonymous class, so the result is handed back through a holder.
    final AtomicReference<String> hashValueHolder = new AtomicReference<>();
    try {
        session.read(flowFile, new InputStreamCallback() {

            @Override
            public void process(final InputStream in) throws IOException {
                // Content is only needed to feed the digest; the bytes themselves are discarded.
                try (final DigestOutputStream digestOut = new DigestOutputStream(new NullOutputStream(), digest)) {
                    StreamUtils.copy(in, digestOut);
                    final byte[] hashBytes = digest.digest();
                    final StringBuilder hex = new StringBuilder(hashBytes.length * 2);
                    for (final byte b : hashBytes) {
                        // Two lowercase hex digits per byte, high nibble first.
                        final int unsigned = b & 0xFF;
                        hex.append(Character.forDigit(unsigned >>> 4, 16));
                        hex.append(Character.forDigit(unsigned & 0x0F, 16));
                    }
                    hashValueHolder.set(hex.toString());
                }
            }
        });

        final String attributeName = context.getProperty(ATTRIBUTE_NAME).getValue();
        final String hashValue = hashValueHolder.get();
        flowFile = session.putAttribute(flowFile, attributeName, hashValue);
        logger.info("Successfully added attribute '{}' to {} with a value of {}; routing to success", new Object[] { attributeName, flowFile, hashValue });
        session.getProvenanceReporter().modifyAttributes(flowFile);
        session.transfer(flowFile, REL_SUCCESS);
    } catch (final ProcessException e) {
        logger.error("Failed to process {} due to {}; routing to failure", new Object[] { flowFile, e });
        session.transfer(flowFile, REL_FAILURE);
    }
}
Also used : FlowFile(org.apache.nifi.flowfile.FlowFile) InputStream(java.io.InputStream) AtomicReference(java.util.concurrent.atomic.AtomicReference) NoSuchAlgorithmException(java.security.NoSuchAlgorithmException) IOException(java.io.IOException) ComponentLog(org.apache.nifi.logging.ComponentLog) ProcessException(org.apache.nifi.processor.exception.ProcessException) DigestOutputStream(java.security.DigestOutputStream) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) MessageDigest(java.security.MessageDigest) NullOutputStream(org.apache.nifi.stream.io.NullOutputStream)

Example 8 with NullOutputStream

use of org.apache.nifi.stream.io.NullOutputStream in project nifi by apache.

the class TestStandardRecordReaderWriter method testWritePerformance.

/**
 * Simple micro-benchmark for record serialization: writes the same provenance event
 * many times to a {@link NullOutputStream} and prints the elapsed time. It gives a
 * baseline to compare against before making significant changes, so that those changes
 * can be checked for adverse effects on repository performance.
 *
 * @throws IOException if the writer fails
 */
@Test
@Ignore("For local testing only")
public void testWritePerformance() throws IOException {
    final int numEvents = 10_000_000;
    final ProvenanceEventRecord record = createEvent();
    final TocWriter toc = new NopTocWriter();

    final long begin = System.nanoTime();
    try (final OutputStream sink = new NullOutputStream();
        final RecordWriter recordWriter = new StandardRecordWriter(sink, "devnull", idGenerator, toc, false, 100000)) {
        recordWriter.writeHeader(0L);
        // Count down instead of up; the same event is written numEvents times either way.
        for (int remaining = numEvents; remaining > 0; remaining--) {
            recordWriter.writeRecord(record);
        }
    }
    // The writer is closed before the clock is read, so close time is included.
    final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - begin);
    System.out.println("Took " + millis + " millis to write " + numEvents + " events");
}
Also used : RecordWriter(org.apache.nifi.provenance.serialization.RecordWriter) NopTocWriter(org.apache.nifi.provenance.toc.NopTocWriter) NopTocWriter(org.apache.nifi.provenance.toc.NopTocWriter) TocWriter(org.apache.nifi.provenance.toc.TocWriter) OutputStream(java.io.OutputStream) NullOutputStream(org.apache.nifi.stream.io.NullOutputStream) ByteArrayOutputStream(java.io.ByteArrayOutputStream) DataOutputStream(java.io.DataOutputStream) NullOutputStream(org.apache.nifi.stream.io.NullOutputStream) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 9 with NullOutputStream

use of org.apache.nifi.stream.io.NullOutputStream in project nifi by apache.

the class TailFile method processTailFile.

/**
 * Processes one tailed file: reacts to a changed tail target or recovers rolled-over
 * files, (re)creates the reader when needed, detects rotation of the tailed file,
 * writes newly available data into a new FlowFile routed to {@code REL_SUCCESS}, and
 * finally stores and persists the updated {@link TailFileState}.
 *
 * <p>If positioning in the current file fails with an {@link IOException}, the channel
 * opened for that attempt is closed before the method yields and returns, so that no
 * file handle is leaked.</p>
 *
 * @param context the ProcessContext, used for property lookup, yielding and state persistence
 * @param session the ProcessSession used to create, write, transfer and commit the FlowFile
 * @param tailFile the path of the file to tail; also the key into {@code states}
 */
private void processTailFile(final ProcessContext context, final ProcessSession session, final String tailFile) {
    // If user changes the file that is being tailed, we need to consume the already-rolled-over data according
    // to the Initial Start Position property
    boolean rolloverOccurred;
    TailFileObject tfo = states.get(tailFile);
    if (tfo.isTailFileChanged()) {
        rolloverOccurred = false;
        final String recoverPosition = context.getProperty(START_POSITION).getValue();
        if (START_BEGINNING_OF_TIME.getValue().equals(recoverPosition)) {
            recoverRolledFiles(context, session, tailFile, tfo.getExpectedRecoveryChecksum(), tfo.getState().getTimestamp(), tfo.getState().getPosition());
        } else if (START_CURRENT_FILE.getValue().equals(recoverPosition)) {
            cleanup();
            tfo.setState(new TailFileState(tailFile, null, null, 0L, 0L, 0L, null, tfo.getState().getBuffer()));
        } else {
            final String filename = tailFile;
            final File file = new File(filename);
            // Declared outside the try so the catch block can close the channel if the
            // checksum pre-read or the positioning below fails after it has been opened.
            FileChannel fileChannel = null;
            try {
                fileChannel = FileChannel.open(file.toPath(), StandardOpenOption.READ);
                getLogger().debug("Created FileChannel {} for {}", new Object[] { fileChannel, file });
                final Checksum checksum = new CRC32();
                final long position = file.length();
                final long timestamp = file.lastModified();
                // Read the existing content once so the checksum covers everything before 'position'.
                try (final InputStream fis = new FileInputStream(file);
                    final CheckedInputStream in = new CheckedInputStream(fis, checksum)) {
                    StreamUtils.copy(in, new NullOutputStream(), position);
                }
                fileChannel.position(position);
                cleanup();
                tfo.setState(new TailFileState(filename, file, fileChannel, position, timestamp, file.length(), checksum, tfo.getState().getBuffer()));
            } catch (final IOException ioe) {
                // The channel never made it into the state object, so nobody else will close it.
                if (fileChannel != null) {
                    try {
                        fileChannel.close();
                    } catch (final IOException closeFailure) {
                        ioe.addSuppressed(closeFailure);
                    }
                }
                getLogger().error("Attempted to position Reader at current position in file {} but failed to do so due to {}", new Object[] { file, ioe.toString() }, ioe);
                context.yield();
                return;
            }
        }
        tfo.setTailFileChanged(false);
    } else {
        // Recover any data that may have rolled over since the last time that this processor ran.
        // If expectedRecoveryChecksum != null, that indicates that this is the first iteration since processor was started, so use whatever checksum value
        // was present when the state was last persisted. In this case, we must then null out the value so that the next iteration won't keep using the "recovered"
        // value. If the value is null, then we know that either the processor has already recovered that data, or there was no state persisted. In either case,
        // use whatever checksum value is currently in the state.
        Long expectedChecksumValue = tfo.getExpectedRecoveryChecksum();
        if (expectedChecksumValue == null) {
            expectedChecksumValue = tfo.getState().getChecksum() == null ? null : tfo.getState().getChecksum().getValue();
        }
        rolloverOccurred = recoverRolledFiles(context, session, tailFile, expectedChecksumValue, tfo.getState().getTimestamp(), tfo.getState().getPosition());
        tfo.setExpectedRecoveryChecksum(null);
    }
    // initialize local variables from state object; this is done so that we can easily change the values throughout
    // the onTrigger method and then create a new state object after we finish processing the files.
    TailFileState state = tfo.getState();
    File file = state.getFile();
    FileChannel reader = state.getReader();
    Checksum checksum = state.getChecksum();
    if (checksum == null) {
        checksum = new CRC32();
    }
    long position = state.getPosition();
    long timestamp = state.getTimestamp();
    long length = state.getLength();
    // Create a reader if necessary.
    if (file == null || reader == null) {
        file = new File(tailFile);
        reader = createReader(file, position);
        if (reader == null) {
            context.yield();
            return;
        }
    }
    final long startNanos = System.nanoTime();
    // Check if file has rotated
    // We determine that the file has rotated if any of the following conditions are met:
    // 1. 'rolloverOccurred' == true, which indicates that we have found a new file matching the rollover pattern.
    // 2. The file was modified after the timestamp in our state, AND the file is smaller than we expected. This satisfies
    // the case where we are tailing File A, and that file is then renamed (say to B) and a new file named A is created
    // and is written to. In such a case, File A may have a file size smaller than we have in our state, so we know that
    // it rolled over.
    // 3. The File Channel that we have indicates that the size of the file is different than file.length() indicates, AND
    // the File Channel also indicates that we have read all data in the file. This case may also occur in the same scenario
    // as #2, above. In this case, the File Channel is pointing to File A, but the 'file' object is pointing to File B. They
    // both have the same name but are different files. As a result, once we have consumed all data from the File Channel,
    // we want to roll over and consume data from the new file.
    boolean rotated = rolloverOccurred;
    if (!rotated) {
        final long fileLength = file.length();
        if (length > fileLength) {
            rotated = true;
        } else {
            try {
                final long readerSize = reader.size();
                final long readerPosition = reader.position();
                if (readerSize == readerPosition && readerSize != fileLength) {
                    rotated = true;
                }
            } catch (final IOException e) {
                getLogger().warn("Failed to determined the size or position of the File Channel when " + "determining if the file has rolled over. Will assume that the file being tailed has not rolled over", e);
            }
        }
    }
    if (rotated) {
        // Since file has rotated, we close the reader, create a new one, and then reset our state.
        try {
            reader.close();
            getLogger().debug("Closed FileChannel {}", new Object[] { reader });
        } catch (final IOException ioe) {
            getLogger().warn("Failed to close reader for {} due to {}", new Object[] { file, ioe });
        }
        reader = createReader(file, 0L);
        position = 0L;
        checksum.reset();
    }
    if (file.length() == position || !file.exists()) {
        // no data to consume so rather than continually running, yield to allow other processors to use the thread.
        getLogger().debug("No data to consume; created no FlowFiles");
        tfo.setState(new TailFileState(tailFile, file, reader, position, timestamp, length, checksum, state.getBuffer()));
        persistState(tfo, context);
        context.yield();
        return;
    }
    // If there is data to consume, read as much as we can.
    final TailFileState currentState = state;
    final Checksum chksum = checksum;
    // data has been written to file. Stream it to a new FlowFile.
    FlowFile flowFile = session.create();
    final FileChannel fileReader = reader;
    // Holder lets the anonymous callback report the position reached by readLines.
    final AtomicLong positionHolder = new AtomicLong(position);
    flowFile = session.write(flowFile, new OutputStreamCallback() {

        @Override
        public void process(final OutputStream rawOut) throws IOException {
            try (final OutputStream out = new BufferedOutputStream(rawOut)) {
                positionHolder.set(readLines(fileReader, currentState.getBuffer(), out, chksum));
            }
        }
    });
    // If there ended up being no data, just remove the FlowFile
    if (flowFile.getSize() == 0) {
        session.remove(flowFile);
        getLogger().debug("No data to consume; removed created FlowFile");
    } else {
        // determine filename for FlowFile by using <base filename of log file>.<initial offset>-<final offset>.<extension>
        final String tailFilename = file.getName();
        final String baseName = StringUtils.substringBeforeLast(tailFilename, ".");
        final String flowFileName;
        if (baseName.length() < tailFilename.length()) {
            flowFileName = baseName + "." + position + "-" + positionHolder.get() + "." + StringUtils.substringAfterLast(tailFilename, ".");
        } else {
            flowFileName = baseName + "." + position + "-" + positionHolder.get();
        }
        final Map<String, String> attributes = new HashMap<>(3);
        attributes.put(CoreAttributes.FILENAME.key(), flowFileName);
        attributes.put(CoreAttributes.MIME_TYPE.key(), "text/plain");
        attributes.put("tailfile.original.path", tailFile);
        flowFile = session.putAllAttributes(flowFile, attributes);
        session.getProvenanceReporter().receive(flowFile, file.toURI().toString(), "FlowFile contains bytes " + position + " through " + positionHolder.get() + " of source file", TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startNanos));
        session.transfer(flowFile, REL_SUCCESS);
        position = positionHolder.get();
        // Set timestamp to the latest of when the file was modified and the current timestamp stored in the state.
        // We do this because when we read a file that has been rolled over, we set the state to 1 millisecond later than the last mod date
        // in order to avoid ingesting that file again. If we then read from this file during the same second (or millisecond, depending on the
        // operating system file last mod precision), then we could set the timestamp to a smaller value, which could result in reading in the
        // rotated file a second time.
        timestamp = Math.max(state.getTimestamp(), file.lastModified());
        length = file.length();
        getLogger().debug("Created {} and routed to success", new Object[] { flowFile });
    }
    // Create a new state object to represent our current position, timestamp, etc.
    tfo.setState(new TailFileState(tailFile, file, reader, position, timestamp, length, checksum, state.getBuffer()));
    // We must commit session before persisting state in order to avoid data loss on restart
    session.commit();
    persistState(tfo, context);
}
Also used : FlowFile(org.apache.nifi.flowfile.FlowFile) CRC32(java.util.zip.CRC32) HashMap(java.util.HashMap) FileChannel(java.nio.channels.FileChannel) CheckedInputStream(java.util.zip.CheckedInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) NullOutputStream(org.apache.nifi.stream.io.NullOutputStream) ByteArrayOutputStream(java.io.ByteArrayOutputStream) BufferedOutputStream(java.io.BufferedOutputStream) OutputStream(java.io.OutputStream) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream) CheckedInputStream(java.util.zip.CheckedInputStream) AtomicLong(java.util.concurrent.atomic.AtomicLong) Checksum(java.util.zip.Checksum) AtomicLong(java.util.concurrent.atomic.AtomicLong) OutputStreamCallback(org.apache.nifi.processor.io.OutputStreamCallback) FlowFile(org.apache.nifi.flowfile.FlowFile) File(java.io.File) BufferedOutputStream(java.io.BufferedOutputStream) NullOutputStream(org.apache.nifi.stream.io.NullOutputStream)

Example 10 with NullOutputStream

use of org.apache.nifi.stream.io.NullOutputStream in project nifi by apache.

the class TailFile method recoverState.

/**
 * Updates member variables to reflect the "expected recovery checksum" and
 * seek to the appropriate location in the tailed file, updating our
 * checksum, so that we are ready to proceed with the
 * {@link #onTrigger(ProcessContext, ProcessSession)} call.
 *
 * <p>If any of the FILENAME, POSITION, TIMESTAMP or LENGTH entries is missing
 * from the recovered values, or if no checksum was stored, or the stored
 * filename differs from {@code filePath}, the state for the file is reset
 * instead.</p>
 *
 * @param context the ProcessContext
 * @param stateValues the values that were recovered from state that was
 * previously stored. This Map should be populated with the keys defined in
 * {@link TailFileState.StateKeys}.
 * @param filePath the path of the file for which state must be recovered
 * @throws IOException if unable to seek to the appropriate location in the
 * tailed file.
 */
private void recoverState(final ProcessContext context, final Map<String, String> stateValues, final String filePath) throws IOException {
    final String prefix = MAP_PREFIX + states.get(filePath).getFilenameIndex() + '.';
    // All four entries are required; without any one of them the stored state is unusable.
    if (!stateValues.containsKey(prefix + TailFileState.StateKeys.FILENAME)
        || !stateValues.containsKey(prefix + TailFileState.StateKeys.POSITION)
        || !stateValues.containsKey(prefix + TailFileState.StateKeys.TIMESTAMP)
        || !stateValues.containsKey(prefix + TailFileState.StateKeys.LENGTH)) {
        resetState(filePath);
        return;
    }
    final String checksumValue = stateValues.get(prefix + TailFileState.StateKeys.CHECKSUM);
    final boolean checksumPresent = (checksumValue != null);
    final String storedStateFilename = stateValues.get(prefix + TailFileState.StateKeys.FILENAME);
    final long position = Long.parseLong(stateValues.get(prefix + TailFileState.StateKeys.POSITION));
    final long timestamp = Long.parseLong(stateValues.get(prefix + TailFileState.StateKeys.TIMESTAMP));
    final long length = Long.parseLong(stateValues.get(prefix + TailFileState.StateKeys.LENGTH));
    FileChannel reader = null;
    File tailFile = null;
    if (checksumPresent && filePath.equals(storedStateFilename)) {
        states.get(filePath).setExpectedRecoveryChecksum(Long.parseLong(checksumValue));
        // We have an expected checksum and the currently configured filename is the same as the state file.
        // We need to check if the existing file is the same as the one referred to in the state file based on
        // the checksum.
        final Checksum checksum = new CRC32();
        final File existingTailFile = new File(storedStateFilename);
        if (existingTailFile.length() >= position) {
            try (final InputStream tailFileIs = new FileInputStream(existingTailFile);
                final CheckedInputStream in = new CheckedInputStream(tailFileIs, checksum)) {
                try {
                    // Checksum exactly the bytes before the recovered position: the stored checksum was
                    // persisted together with that position, and the same value is used for the length
                    // check above and for positioning the reader below.
                    StreamUtils.copy(in, new NullOutputStream(), position);
                } catch (final EOFException eof) {
                    // If we hit EOFException, then the file is smaller than we expected. Assume rollover.
                    getLogger().debug("When recovering state, file being tailed has less data than was stored in the state. " + "Assuming rollover. Will begin tailing current file from beginning.");
                }
                final long checksumResult = in.getChecksum().getValue();
                if (checksumResult == states.get(filePath).getExpectedRecoveryChecksum()) {
                    // Checksums match. This means that we want to resume reading from where we left off.
                    // So we will populate the reader object so that it will be used in onTrigger. If the
                    // checksums do not match, then we will leave the reader object null, so that the next
                    // call to onTrigger will result in a new Reader being created and starting at the
                    // beginning of the file.
                    getLogger().debug("When recovering state, checksum of tailed file matches the stored checksum. Will resume where left off.");
                    tailFile = existingTailFile;
                    reader = FileChannel.open(tailFile.toPath(), StandardOpenOption.READ);
                    getLogger().debug("Created FileChannel {} for {} in recoverState", new Object[] { reader, tailFile });
                    reader.position(position);
                } else {
                    // we don't seek the reader to the position, so our reader will start at beginning of file.
                    getLogger().debug("When recovering state, checksum of tailed file does not match the stored checksum. Will begin tailing current file from beginning.");
                }
            }
        } else {
            // fewer bytes than our position, so we know we weren't already reading from this file. Keep reader at a position of 0.
            getLogger().debug("When recovering state, existing file to tail is only {} bytes but position flag is {}; " + "this indicates that the file has rotated. Will begin tailing current file from beginning.", new Object[] { existingTailFile.length(), position });
        }
        states.get(filePath).setState(new TailFileState(filePath, tailFile, reader, position, timestamp, length, checksum, ByteBuffer.allocate(65536)));
    } else {
        resetState(filePath);
    }
    getLogger().debug("Recovered state {}", new Object[] { states.get(filePath).getState() });
}
Also used : CRC32(java.util.zip.CRC32) FileChannel(java.nio.channels.FileChannel) CheckedInputStream(java.util.zip.CheckedInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) FileInputStream(java.io.FileInputStream) CheckedInputStream(java.util.zip.CheckedInputStream) Checksum(java.util.zip.Checksum) EOFException(java.io.EOFException) FlowFile(org.apache.nifi.flowfile.FlowFile) File(java.io.File) NullOutputStream(org.apache.nifi.stream.io.NullOutputStream)

Aggregations

NullOutputStream (org.apache.nifi.stream.io.NullOutputStream)12 Test (org.junit.Test)7 IOException (java.io.IOException)5 OutputStream (java.io.OutputStream)5 InputStream (java.io.InputStream)4 HashMap (java.util.HashMap)4 FlowFile (org.apache.nifi.flowfile.FlowFile)4 Ignore (org.junit.Ignore)4 ByteArrayOutputStream (java.io.ByteArrayOutputStream)3 File (java.io.File)3 FileInputStream (java.io.FileInputStream)3 URL (java.net.URL)3 CRC32 (java.util.zip.CRC32)3 CheckedInputStream (java.util.zip.CheckedInputStream)3 TestRunner (org.apache.nifi.util.TestRunner)3 BufferedOutputStream (java.io.BufferedOutputStream)2 DataOutputStream (java.io.DataOutputStream)2 FileOutputStream (java.io.FileOutputStream)2 HttpURLConnection (java.net.HttpURLConnection)2 FileChannel (java.nio.channels.FileChannel)2