
Example 1 with HDFSRecordReader

Use of org.apache.nifi.processors.hadoop.record.HDFSRecordReader in project nifi by apache.
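
Both examples implement this interface, either inline in a test or through AbstractFetchHDFSRecord's factory hook. For orientation, here is a minimal sketch of its contract, inferred from the overrides below; the actual interface in the NiFi source may differ in details such as annotations or documentation:

import java.io.Closeable;
import java.io.IOException;
import org.apache.nifi.serialization.record.Record;

// Inferred contract: a record-oriented cursor over a file stored in HDFS.
public interface HDFSRecordReader extends Closeable {

    // Returns the next record from the file, or null once the file is exhausted.
    Record nextRecord() throws IOException;
}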

From the class FetchParquetTest, the method testIOExceptionWhileReadingShouldRouteToRetry:

@Test
public void testIOExceptionWhileReadingShouldRouteToRetry() throws IOException, InitializationException {
    final FetchParquet proc = new FetchParquet() {

        @Override
        public HDFSRecordReader createHDFSRecordReader(ProcessContext context, FlowFile flowFile, Configuration conf, Path path) throws IOException {
            // stub reader whose first read fails, forcing the retry path in AbstractFetchHDFSRecord
            return new HDFSRecordReader() {

                @Override
                public Record nextRecord() throws IOException {
                    throw new IOException("IOException");
                }

                @Override
                public void close() throws IOException {
                }
            };
        }
    };
    configure(proc);
    final File parquetDir = new File(DIRECTORY);
    final File parquetFile = new File(parquetDir, "testFetchParquetToCSV.parquet");
    final int numUsers = 10;
    writeParquetUsers(parquetFile, numUsers);
    final Map<String, String> attributes = new HashMap<>();
    attributes.put(CoreAttributes.PATH.key(), parquetDir.getAbsolutePath());
    attributes.put(CoreAttributes.FILENAME.key(), parquetFile.getName());
    testRunner.enqueue("TRIGGER", attributes);
    testRunner.run();
    testRunner.assertAllFlowFilesTransferred(FetchParquet.REL_RETRY, 1);
    final MockFlowFile flowFile = testRunner.getFlowFilesForRelationship(FetchParquet.REL_RETRY).get(0);
    flowFile.assertContentEquals("TRIGGER");
}
Also used : Path (org.apache.hadoop.fs.Path), FlowFile (org.apache.nifi.flowfile.FlowFile), MockFlowFile (org.apache.nifi.util.MockFlowFile), Configuration (org.apache.hadoop.conf.Configuration), HashMap (java.util.HashMap), IOException (java.io.IOException), ProcessContext (org.apache.nifi.processor.ProcessContext), File (java.io.File), HDFSRecordReader (org.apache.nifi.processors.hadoop.record.HDFSRecordReader), Test (org.junit.Test)
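
The test above swaps in a failing stub; for contrast, here is a plausible sketch of what the production FetchParquet.createHDFSRecordReader returns: a reader that wraps parquet-avro's ParquetReader and converts each Avro GenericRecord into a NiFi Record. The class name and the AvroTypeUtil/MapRecord wiring are illustrative assumptions, not the exact NiFi source:

import java.io.IOException;
import java.util.Map;
import org.apache.avro.generic.GenericRecord;
import org.apache.nifi.avro.AvroTypeUtil;
import org.apache.nifi.serialization.record.MapRecord;
import org.apache.nifi.serialization.record.Record;
import org.apache.nifi.serialization.record.RecordSchema;
import org.apache.parquet.hadoop.ParquetReader;

// Hypothetical production counterpart of the failing stub above: adapts a
// ParquetReader of Avro GenericRecords to the HDFSRecordReader contract.
public class ParquetHDFSRecordReader implements HDFSRecordReader {

    private final ParquetReader<GenericRecord> parquetReader;

    // derived from the first record's Avro schema and reused for later records
    private RecordSchema recordSchema;

    public ParquetHDFSRecordReader(final ParquetReader<GenericRecord> parquetReader) {
        this.parquetReader = parquetReader;
    }

    @Override
    public Record nextRecord() throws IOException {
        final GenericRecord avroRecord = parquetReader.read();
        if (avroRecord == null) {
            // end of file
            return null;
        }
        if (recordSchema == null) {
            recordSchema = AvroTypeUtil.createSchema(avroRecord.getSchema());
        }
        final Map<String, Object> values = AvroTypeUtil.convertAvroRecordToMap(avroRecord, recordSchema);
        return new MapRecord(recordSchema, values);
    }

    @Override
    public void close() throws IOException {
        parquetReader.close();
    }
}

createHDFSRecordReader would then plausibly build the reader with AvroParquetReader.<GenericRecord>builder(path).withConf(conf).build() and hand it to this wrapper.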

Example 2 with HDFSRecordReader

Use of org.apache.nifi.processors.hadoop.record.HDFSRecordReader in project nifi by apache.

From the class AbstractFetchHDFSRecord, the method onTrigger.
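
onTrigger runs the whole fetch under ugi.doAs(...) and delegates format-specific reading to createHDFSRecordReader, the abstract hook whose override appears in Example 1. A sketch of that hook, with the signature taken from the override above and the surrounding class details assumed for illustration:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.nifi.flowfile.FlowFile;
import org.apache.nifi.processor.ProcessContext;

public abstract class AbstractFetchHDFSRecord extends AbstractHadoopProcessor {

    // Subclasses such as FetchParquet return a format-specific reader for the file
    // at the given path; onTrigger consumes it inside a try-with-resources block.
    public abstract HDFSRecordReader createHDFSRecordReader(ProcessContext context, FlowFile flowFile, Configuration conf, Path path) throws IOException;
}

The onTrigger implementation: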

@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) throws ProcessException {
    // do this before getting a flow file so that we always get a chance to attempt Kerberos relogin
    final FileSystem fileSystem = getFileSystem();
    final Configuration configuration = getConfiguration();
    final UserGroupInformation ugi = getUserGroupInformation();
    if (configuration == null || fileSystem == null || ugi == null) {
        getLogger().error("Processor not configured properly because Configuration, FileSystem, or UserGroupInformation was null");
        context.yield();
        return;
    }
    final FlowFile originalFlowFile = session.get();
    if (originalFlowFile == null) {
        context.yield();
        return;
    }
    // run the entire fetch as the authenticated user so all HDFS access uses the proper credentials
    ugi.doAs((PrivilegedAction<Object>) () -> {
        FlowFile child = null;
        final String filenameValue = context.getProperty(FILENAME).evaluateAttributeExpressions(originalFlowFile).getValue();
        try {
            final Path path = new Path(filenameValue);
            final AtomicReference<Throwable> exceptionHolder = new AtomicReference<>(null);
            final AtomicReference<WriteResult> writeResult = new AtomicReference<>();
            final RecordSetWriterFactory recordSetWriterFactory = context.getProperty(RECORD_WRITER).asControllerService(RecordSetWriterFactory.class);
            final StopWatch stopWatch = new StopWatch(true);
            // use a child FlowFile so that if any error occurs we can route the original untouched FlowFile to retry/failure
            child = session.create(originalFlowFile);
            final AtomicReference<String> mimeTypeRef = new AtomicReference<>();
            child = session.write(child, (final OutputStream rawOut) -> {
                try (final BufferedOutputStream out = new BufferedOutputStream(rawOut);
                    final HDFSRecordReader recordReader = createHDFSRecordReader(context, originalFlowFile, configuration, path)) {
                    Record record = recordReader.nextRecord();
                    final RecordSchema schema = recordSetWriterFactory.getSchema(originalFlowFile.getAttributes(), record == null ? null : record.getSchema());
                    try (final RecordSetWriter recordSetWriter = recordSetWriterFactory.createWriter(getLogger(), schema, out)) {
                        recordSetWriter.beginRecordSet();
                        if (record != null) {
                            recordSetWriter.write(record);
                        }
                        while ((record = recordReader.nextRecord()) != null) {
                            recordSetWriter.write(record);
                        }
                        writeResult.set(recordSetWriter.finishRecordSet());
                        mimeTypeRef.set(recordSetWriter.getMimeType());
                    }
                } catch (Exception e) {
                    exceptionHolder.set(e);
                }
            });
            stopWatch.stop();
            // if any errors happened within the session.write then throw the exception so we jump
            // into one of the appropriate catch blocks below
            if (exceptionHolder.get() != null) {
                throw exceptionHolder.get();
            }
            FlowFile successFlowFile = postProcess(context, session, child, path);
            final Map<String, String> attributes = new HashMap<>(writeResult.get().getAttributes());
            attributes.put(RECORD_COUNT_ATTR, String.valueOf(writeResult.get().getRecordCount()));
            attributes.put(CoreAttributes.MIME_TYPE.key(), mimeTypeRef.get());
            successFlowFile = session.putAllAttributes(successFlowFile, attributes);
            final Path qualifiedPath = path.makeQualified(fileSystem.getUri(), fileSystem.getWorkingDirectory());
            getLogger().info("Successfully received content from {} for {} in {} milliseconds", new Object[] { qualifiedPath, successFlowFile, stopWatch.getDuration() });
            session.getProvenanceReporter().fetch(successFlowFile, qualifiedPath.toString(), stopWatch.getDuration(TimeUnit.MILLISECONDS));
            session.transfer(successFlowFile, REL_SUCCESS);
            session.remove(originalFlowFile);
            return null;
        } catch (final FileNotFoundException | AccessControlException e) {
            getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure", new Object[] { filenameValue, originalFlowFile, e });
            final FlowFile failureFlowFile = session.putAttribute(originalFlowFile, FETCH_FAILURE_REASON_ATTR, e.getMessage() == null ? e.toString() : e.getMessage());
            session.transfer(failureFlowFile, REL_FAILURE);
        } catch (final IOException | FlowFileAccessException e) {
            getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to retry", new Object[] { filenameValue, originalFlowFile, e });
            session.transfer(session.penalize(originalFlowFile), REL_RETRY);
            context.yield();
        } catch (final Throwable t) {
            getLogger().error("Failed to retrieve content from {} for {} due to {}; routing to failure", new Object[] { filenameValue, originalFlowFile, t });
            final FlowFile failureFlowFile = session.putAttribute(originalFlowFile, FETCH_FAILURE_REASON_ATTR, t.getMessage() == null ? t.toString() : t.getMessage());
            session.transfer(failureFlowFile, REL_FAILURE);
        }
        // if we got this far then we weren't successful so we need to clean up the child flow file if it got initialized
        if (child != null) {
            session.remove(child);
        }
        return null;
    });
}
Also used : Path (org.apache.hadoop.fs.Path), FlowFile (org.apache.nifi.flowfile.FlowFile), Configuration (org.apache.hadoop.conf.Configuration), BufferedOutputStream (java.io.BufferedOutputStream), OutputStream (java.io.OutputStream), AtomicReference (java.util.concurrent.atomic.AtomicReference), RecordSetWriter (org.apache.nifi.serialization.RecordSetWriter), ProcessException (org.apache.nifi.processor.exception.ProcessException), FlowFileAccessException (org.apache.nifi.processor.exception.FlowFileAccessException), IOException (java.io.IOException), FileNotFoundException (java.io.FileNotFoundException), AccessControlException (org.apache.hadoop.security.AccessControlException), StopWatch (org.apache.nifi.util.StopWatch), RecordSetWriterFactory (org.apache.nifi.serialization.RecordSetWriterFactory), FileSystem (org.apache.hadoop.fs.FileSystem), Record (org.apache.nifi.serialization.record.Record), RecordSchema (org.apache.nifi.serialization.record.RecordSchema), HashMap (java.util.HashMap), Map (java.util.Map), UserGroupInformation (org.apache.hadoop.security.UserGroupInformation), HDFSRecordReader (org.apache.nifi.processors.hadoop.record.HDFSRecordReader)
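
The catch blocks above define the routing contract that Example 1 exercises for IOException. A hypothetical companion test, assuming the same configure() helper and testRunner field as Example 1, would pin down the neighboring branch: FileNotFoundException is caught ahead of IOException, so it should route to REL_FAILURE rather than REL_RETRY:

// assumes the same imports as Example 1, plus java.io.FileNotFoundException
@Test
public void testFileNotFoundShouldRouteToFailure() throws IOException, InitializationException {
    final FetchParquet proc = new FetchParquet() {

        @Override
        public HDFSRecordReader createHDFSRecordReader(ProcessContext context, FlowFile flowFile, Configuration conf, Path path) throws IOException {
            // stub reader that fails as if the file vanished before reading began;
            // because the reader never touches HDFS, no real Parquet file is needed
            return new HDFSRecordReader() {

                @Override
                public Record nextRecord() throws IOException {
                    throw new FileNotFoundException("file not found");
                }

                @Override
                public void close() throws IOException {
                }
            };
        }
    };
    configure(proc);
    final Map<String, String> attributes = new HashMap<>();
    attributes.put(CoreAttributes.PATH.key(), "/tmp/does-not-matter");
    attributes.put(CoreAttributes.FILENAME.key(), "missing.parquet");
    testRunner.enqueue("TRIGGER", attributes);
    testRunner.run();
    // FileNotFoundException matches the first catch block, so the original flow
    // file goes to failure with the reason recorded as an attribute
    testRunner.assertAllFlowFilesTransferred(FetchParquet.REL_FAILURE, 1);
    testRunner.getFlowFilesForRelationship(FetchParquet.REL_FAILURE).get(0).assertContentEquals("TRIGGER");
}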

Aggregations

IOException (java.io.IOException): 2
HashMap (java.util.HashMap): 2
Configuration (org.apache.hadoop.conf.Configuration): 2
Path (org.apache.hadoop.fs.Path): 2
FlowFile (org.apache.nifi.flowfile.FlowFile): 2
HDFSRecordReader (org.apache.nifi.processors.hadoop.record.HDFSRecordReader): 2
BufferedOutputStream (java.io.BufferedOutputStream): 1
File (java.io.File): 1
FileNotFoundException (java.io.FileNotFoundException): 1
OutputStream (java.io.OutputStream): 1
Map (java.util.Map): 1
AtomicReference (java.util.concurrent.atomic.AtomicReference): 1
FileSystem (org.apache.hadoop.fs.FileSystem): 1
AccessControlException (org.apache.hadoop.security.AccessControlException): 1
UserGroupInformation (org.apache.hadoop.security.UserGroupInformation): 1
ProcessContext (org.apache.nifi.processor.ProcessContext): 1
FlowFileAccessException (org.apache.nifi.processor.exception.FlowFileAccessException): 1
ProcessException (org.apache.nifi.processor.exception.ProcessException): 1
RecordSetWriter (org.apache.nifi.serialization.RecordSetWriter): 1
RecordSetWriterFactory (org.apache.nifi.serialization.RecordSetWriterFactory): 1