Search in sources :

Example 1 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project Xponents by OpenSextant.

the class EmbeddedContentConverter method conversionImplementation.

/**
 * Convert embedded documents in the supported types to a folder of the embedded items.
 * Trivial embedded icons and other components will not be extracted.
 *
 * <p>The superclass conversion always runs first. If the file extension is not supported
 * for embedded extraction, that result is returned unchanged. Otherwise the file is
 * re-opened from disk and its embedded objects are handed to an
 * {@code EmbeddedObjectExtractor} bound to the parent document. When raw children were
 * collected, their rendered text is appended to the parent's text under an
 * {@code ==Embedded Objects==} heading.
 *
 * @param in  stream passed through to the superclass conversion
 * @param doc file being converted; re-opened by path for the embedded-object extraction
 * @return the parent document, marked converted once extraction succeeded
 * @throws IOException if the superclass conversion fails, or wrapping any exception raised
 *                     while opening, extracting from, or closing the Tika stream
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream in, File doc) throws IOException {
    ConvertedDocument compoundDoc = super.conversionImplementation(in, doc);
    String ext = FilenameUtils.getExtension(doc.getName());
    if (!isSupported(ext)) {
        // Not really compound by our standards here.
        return compoundDoc;
    }
    ParserContainerExtractor extractor = new ParserContainerExtractor();
    EmbeddedObjectExtractor objExtractor = new EmbeddedObjectExtractor(compoundDoc, true);
    // try-with-resources: the stream is closed only if it was actually opened, so a failure in
    // TikaInputStream.get() can no longer turn into a NullPointerException during cleanup.
    try (TikaInputStream tikaStream = TikaInputStream.get(doc.toPath())) {
        extractor.extract(tikaStream, extractor, objExtractor);
        compoundDoc.is_converted = true;
        if (compoundDoc.hasRawChildren()) {
            // Create text buffer for this compound document here.
            // If raw children should be post-processed by some other means, that is up to caller.
            // This parent document at least contains a complete text representation of the
            // content in the original doc.
            StringBuilder completeText = new StringBuilder();
            completeText.append(compoundDoc.getText());
            completeText.append("\n==Embedded Objects==\n");
            completeText.append(renderText(compoundDoc.getRawChildren()));
            compoundDoc.setText(completeText.toString());
        }
        // Without raw children the plain parent conversion is the result.
        return compoundDoc;
    } catch (Exception e) {
        throw new IOException("Stream parsing problem", e);
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException) ConvertedDocument(org.opensextant.xtext.ConvertedDocument) ParserContainerExtractor(org.apache.tika.extractor.ParserContainerExtractor) IOException(java.io.IOException) MimeTypeException(org.apache.tika.mime.MimeTypeException)

Example 2 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project nifi by apache.

the class ExtractMediaMetadata method tika_parse.

/**
 * Parses the given stream with the configured auto-detect parser and returns the extracted
 * metadata as a flat attribute map.
 *
 * <p>Keys that do not match the current metadata key filter (when one is set) are skipped.
 * Multi-valued entries are joined with {@code ", "}; joining stops and {@code "..."} is
 * appended as soon as adding the next value would reach {@code maxAttribLen}. Single-valued
 * entries are copied as-is. Every value is trimmed before it is stored.
 *
 * @param sourceStream content to parse; it is wrapped in a {@link TikaInputStream} that is
 *                     closed before this method returns
 * @param prefix       text prepended to every result key, or {@code null} for no prefix
 * @param maxAttribs   stop once the result holds this many entries, or {@code null} for no cap
 * @param maxAttribLen length bound applied while joining multi-valued entries; it is unboxed
 *                     for the comparison, so it must not be {@code null} when a multi-valued
 *                     key is present
 * @return map of (optionally prefixed) metadata key to trimmed value
 * @throws IOException   if reading or closing the stream fails
 * @throws TikaException if the parser reports a parse failure
 * @throws SAXException  if the content handler reports a failure
 */
private Map<String, String> tika_parse(InputStream sourceStream, String prefix, Integer maxAttribs, Integer maxAttribLen) throws IOException, TikaException, SAXException {
    final Metadata metadata = new Metadata();
    try (TikaInputStream tikaInputStream = TikaInputStream.get(sourceStream)) {
        autoDetectParser.parse(tikaInputStream, new DefaultHandler(), metadata);
    }
    final Map<String, String> results = new HashMap<>();
    final Pattern metadataKeyFilter = metadataKeyFilterRef.get();
    final StringBuilder dataBuilder = new StringBuilder();
    for (final String key : metadata.names()) {
        if (metadataKeyFilter != null && !metadataKeyFilter.matcher(key).matches()) {
            continue;
        }
        dataBuilder.setLength(0);
        if (metadata.isMultiValued(key)) {
            for (String val : metadata.getValues(key)) {
                // Separate from whatever has been written so far. The check must be "> 0":
                // with "> 1" a one-character first value was never followed by a separator.
                if (dataBuilder.length() > 0) {
                    dataBuilder.append(", ");
                }
                if (dataBuilder.length() + val.length() < maxAttribLen) {
                    dataBuilder.append(val);
                } else {
                    dataBuilder.append("...");
                    break;
                }
            }
        } else {
            dataBuilder.append(metadata.get(key));
        }
        final String value = dataBuilder.toString().trim();
        results.put(prefix == null ? key : prefix + key, value);
        // cutoff at max if provided
        if (maxAttribs != null && results.size() >= maxAttribs) {
            break;
        }
    }
    return results;
}
Also used : Pattern(java.util.regex.Pattern) HashMap(java.util.HashMap) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 3 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project nifi by apache.

the class IdentifyMimeType method onTrigger.

/**
 * Detects the MIME type of the next FlowFile's content and records it, together with the
 * matching file extension, as FlowFile attributes before routing the FlowFile to success.
 *
 * <p>Returns immediately when the session has no FlowFile. The filename attribute is passed
 * to the detector as a hint only when the {@code USE_FILENAME_IN_DETECTION} property is true.
 * When no MIME type is available, {@code application/octet-stream} and an empty extension
 * are written instead.
 *
 * @param context provides the processor's property values
 * @param session provides the FlowFile and is used to read, update and transfer it
 */
@Override
public void onTrigger(final ProcessContext context, final ProcessSession session) {
    FlowFile flowFile = session.get();
    if (flowFile == null) {
        return;
    }
    final ComponentLog logger = getLogger();
    final AtomicReference<String> mimeTypeRef = new AtomicReference<>(null);
    final String filename = flowFile.getAttribute(CoreAttributes.FILENAME.key());
    session.read(flowFile, new InputStreamCallback() {

        @Override
        public void process(final InputStream stream) throws IOException {
            // The Tika stream is managed as a resource as well, so that anything it holds is
            // released when detection finishes; it is closed before the buffered stream.
            try (final InputStream in = new BufferedInputStream(stream);
                    final TikaInputStream tikaStream = TikaInputStream.get(in)) {
                Metadata metadata = new Metadata();
                if (filename != null && context.getProperty(USE_FILENAME_IN_DETECTION).asBoolean()) {
                    metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                }
                // Get mime type
                MediaType mediatype = detector.detect(tikaStream, metadata);
                mimeTypeRef.set(mediatype.toString());
            }
        }
    });
    String mimeType = mimeTypeRef.get();
    String extension = "";
    try {
        MimeType mimetype;
        mimetype = config.getMimeRepository().forName(mimeType);
        extension = mimetype.getExtension();
    } catch (MimeTypeException ex) {
        // A failed lookup is not fatal: the extension simply stays empty.
        logger.warn("MIME type extension lookup failed: {}", new Object[] { ex });
    }
    // Workaround for bug in Tika - https://issues.apache.org/jira/browse/TIKA-1563
    if (mimeType != null && mimeType.equals("application/gzip") && extension.equals(".tgz")) {
        extension = ".gz";
    }
    if (mimeType == null) {
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), "application/octet-stream");
        flowFile = session.putAttribute(flowFile, "mime.extension", "");
        logger.info("Unable to identify MIME Type for {}; setting to application/octet-stream", new Object[] { flowFile });
    } else {
        flowFile = session.putAttribute(flowFile, CoreAttributes.MIME_TYPE.key(), mimeType);
        flowFile = session.putAttribute(flowFile, "mime.extension", extension);
        logger.info("Identified {} as having MIME Type {}", new Object[] { flowFile, mimeType });
    }
    session.getProvenanceReporter().modifyAttributes(flowFile);
    session.transfer(flowFile, REL_SUCCESS);
}
Also used : FlowFile(org.apache.nifi.flowfile.FlowFile) BufferedInputStream(java.io.BufferedInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) ComponentLog(org.apache.nifi.logging.ComponentLog) MimeType(org.apache.tika.mime.MimeType) BufferedInputStream(java.io.BufferedInputStream) MimeTypeException(org.apache.tika.mime.MimeTypeException) InputStreamCallback(org.apache.nifi.processor.io.InputStreamCallback) MediaType(org.apache.tika.mime.MediaType)

Example 4 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project nifi by apache.

the class ContentViewerController method doGet.

/**
 * Gets the content and defers to registered viewers to generate the markup.
 *
 * <p>The response is written as a header JSP, a body and a footer JSP. The body is a hex view
 * when the {@code mode} request parameter selects {@code DisplayMode.Hex}; otherwise it is
 * produced by the viewer registered (as a servlet-context init parameter) for the normalized
 * MIME type, or by a "no viewer" page when none is registered. When the request cannot be
 * interpreted, the content cannot be retrieved, the display mode is invalid, or the viewer
 * fails, the request is forwarded to the {@code /message} page of the {@code /nifi} context
 * with {@code title} and {@code messages} request attributes set.
 *
 * @param request servlet request
 * @param response servlet response
 * @throws ServletException if a servlet-specific error occurs
 * @throws IOException if an I/O error occurs
 */
@Override
protected void doGet(final HttpServletRequest request, final HttpServletResponse response) throws ServletException, IOException {
    // specify the charset in a response header
    response.addHeader("Content-Type", "text/html; charset=UTF-8");
    // look up the content access service published in the servlet context
    final ServletContext servletContext = request.getServletContext();
    final ContentAccess contentAccess = (ContentAccess) servletContext.getAttribute("nifi-content-access");
    final ContentRequestContext contentRequest;
    try {
        contentRequest = getContentRequest(request);
    } catch (final Exception e) {
        request.setAttribute("title", "Error");
        request.setAttribute("messages", "Unable to interpret content request.");
        // forward to the error page
        final ServletContext viewerContext = servletContext.getContext("/nifi");
        viewerContext.getRequestDispatcher("/message").forward(request, response);
        return;
    }
    // a request without a data reference cannot be served
    if (contentRequest.getDataUri() == null) {
        request.setAttribute("title", "Error");
        request.setAttribute("messages", "The data reference must be specified.");
        // forward to the error page
        final ServletContext viewerContext = servletContext.getContext("/nifi");
        viewerContext.getRequestDispatcher("/message").forward(request, response);
        return;
    }
    // get the content
    final DownloadableContent downloadableContent;
    try {
        downloadableContent = contentAccess.getContent(contentRequest);
    } catch (final ResourceNotFoundException rnfe) {
        request.setAttribute("title", "Error");
        request.setAttribute("messages", "Unable to find the specified content");
        // forward to the error page
        final ServletContext viewerContext = servletContext.getContext("/nifi");
        viewerContext.getRequestDispatcher("/message").forward(request, response);
        return;
    } catch (final AccessDeniedException ade) {
        request.setAttribute("title", "Access Denied");
        request.setAttribute("messages", "Unable to approve access to the specified content: " + ade.getMessage());
        // forward to the error page
        final ServletContext viewerContext = servletContext.getContext("/nifi");
        viewerContext.getRequestDispatcher("/message").forward(request, response);
        return;
    } catch (final Exception e) {
        request.setAttribute("title", "Error");
        request.setAttribute("messages", "An unexpected error has occurred: " + e.getMessage());
        // forward to the error page
        final ServletContext viewerContext = servletContext.getContext("/nifi");
        viewerContext.getRequestDispatcher("/message").forward(request, response);
        return;
    }
    // determine how we want to view the data
    String mode = request.getParameter("mode");
    // if the mode isn't set, use original
    if (mode == null) {
        mode = DisplayMode.Original.name();
    }
    // determine the display mode; valueOf rejects any name that is not a DisplayMode constant
    final DisplayMode displayMode;
    try {
        displayMode = DisplayMode.valueOf(mode);
    } catch (final IllegalArgumentException iae) {
        request.setAttribute("title", "Error");
        request.setAttribute("messages", "Invalid display mode: " + mode);
        // forward to the error page
        final ServletContext viewerContext = servletContext.getContext("/nifi");
        viewerContext.getRequestDispatcher("/message").forward(request, response);
        return;
    }
    // buffer the content to support resetting in case we need to detect the content type or char encoding
    try (final BufferedInputStream bis = new BufferedInputStream(downloadableContent.getContent())) {
        final String mimeType;
        final String normalizedMimeType;
        // when clustered and we don't know the type set to octet stream since the content was retrieved from the node's rest endpoint
        if (downloadableContent.getType() == null || StringUtils.startsWithIgnoreCase(downloadableContent.getType(), MediaType.OCTET_STREAM.toString())) {
            // attempt to detect the content type from the stream since we don't know what it is
            final DefaultDetector detector = new DefaultDetector();
            // create the stream for tika to process, buffered to support resetting
            // NOTE(review): tikaStream is never closed in this method; presumably intentional,
            // since closing it would likely close bis, which is still read below - confirm
            final TikaInputStream tikaStream = TikaInputStream.get(bis);
            // provide a hint based on the filename
            final Metadata metadata = new Metadata();
            metadata.set(Metadata.RESOURCE_NAME_KEY, downloadableContent.getFilename());
            // Get mime type
            final MediaType mediatype = detector.detect(tikaStream, metadata);
            mimeType = mediatype.toString();
        } else {
            mimeType = downloadableContent.getType();
        }
        // Extract only mime type and subtype from content type (anything after the first ; are parameters)
        // Lowercase so subsequent code does not need to implement case insensitivity
        normalizedMimeType = mimeType.split(";", 2)[0].toLowerCase();
        // add attributes needed for the header
        request.setAttribute("filename", downloadableContent.getFilename());
        request.setAttribute("contentType", mimeType);
        // generate the header
        request.getRequestDispatcher("/WEB-INF/jsp/header.jsp").include(request, response);
        // remove the attributes needed for the header
        request.removeAttribute("filename");
        request.removeAttribute("contentType");
        // generate the markup for the content based on the display mode
        if (DisplayMode.Hex.equals(displayMode)) {
            // only the first BUFFER_LENGTH bytes (at most) are shown in the hex view;
            // `read` is used below as the number of valid bytes in `buffer`
            final byte[] buffer = new byte[BUFFER_LENGTH];
            final int read = StreamUtils.fillBuffer(bis, buffer, false);
            // trim the byte array if necessary
            byte[] bytes = buffer;
            if (read != buffer.length) {
                bytes = new byte[read];
                System.arraycopy(buffer, 0, bytes, 0, read);
            }
            // convert bytes into the base 64 bytes
            final String base64 = Base64.encodeBase64String(bytes);
            // defer to the jsp
            request.setAttribute("content", base64);
            request.getRequestDispatcher("/WEB-INF/jsp/hexview.jsp").include(request, response);
        } else {
            // lookup a viewer for the content; viewers are registered as init parameters keyed by mime type
            final String contentViewerUri = servletContext.getInitParameter(normalizedMimeType);
            // handle no viewer for content type
            if (contentViewerUri == null) {
                request.getRequestDispatcher("/WEB-INF/jsp/no-viewer.jsp").include(request, response);
            } else {
                // create a request attribute for accessing the content
                request.setAttribute(ViewableContent.CONTENT_REQUEST_ATTRIBUTE, new ViewableContent() {

                    // hands out the shared buffered stream itself, not a copy
                    @Override
                    public InputStream getContentStream() {
                        return bis;
                    }

                    // returns the whole content decoded with the detected character set
                    @Override
                    public String getContent() throws IOException {
                        // detect the charset
                        final CharsetDetector detector = new CharsetDetector();
                        detector.setText(bis);
                        detector.enableInputFilter(true);
                        final CharsetMatch match = detector.detect();
                        // ensure we were able to detect the charset
                        if (match == null) {
                            throw new IOException("Unable to detect character encoding.");
                        }
                        // convert the stream using the detected charset
                        return IOUtils.toString(bis, match.getName());
                    }

                    @Override
                    public ViewableContent.DisplayMode getDisplayMode() {
                        return displayMode;
                    }

                    @Override
                    public String getFileName() {
                        return downloadableContent.getFilename();
                    }

                    // mime type and subtype only, lowercased
                    @Override
                    public String getContentType() {
                        return normalizedMimeType;
                    }

                    // content type as reported or detected, including any parameters
                    @Override
                    public String getRawContentType() {
                        return mimeType;
                    }
                });
                try {
                    // generate the content
                    final ServletContext viewerContext = servletContext.getContext(contentViewerUri);
                    viewerContext.getRequestDispatcher("/view-content").include(request, response);
                } catch (final Exception e) {
                    String message = e.getMessage() != null ? e.getMessage() : e.toString();
                    message = "Unable to generate view of data: " + message;
                    // log the error; the stack trace is only logged when debug logging is enabled
                    logger.error(message);
                    if (logger.isDebugEnabled()) {
                        logger.error(StringUtils.EMPTY, e);
                    }
                    // populate the request attributes
                    request.setAttribute("title", "Error");
                    request.setAttribute("messages", message);
                    // forward to the error page
                    final ServletContext viewerContext = servletContext.getContext("/nifi");
                    viewerContext.getRequestDispatcher("/message").forward(request, response);
                    // returning here skips both the footer and the removal of the content request attribute
                    return;
                }
                // remove the request attribute
                request.removeAttribute(ViewableContent.CONTENT_REQUEST_ATTRIBUTE);
            }
        }
        // generate footer
        request.getRequestDispatcher("/WEB-INF/jsp/footer.jsp").include(request, response);
    }
}
Also used : DefaultDetector(org.apache.tika.detect.DefaultDetector) AccessDeniedException(org.apache.nifi.authorization.AccessDeniedException) BufferedInputStream(java.io.BufferedInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) CharsetDetector(com.ibm.icu.text.CharsetDetector) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) IOException(java.io.IOException) ServletException(javax.servlet.ServletException) AccessDeniedException(org.apache.nifi.authorization.AccessDeniedException) IOException(java.io.IOException) DisplayMode(org.apache.nifi.web.ViewableContent.DisplayMode) CharsetMatch(com.ibm.icu.text.CharsetMatch) BufferedInputStream(java.io.BufferedInputStream) ServletContext(javax.servlet.ServletContext) MediaType(org.apache.tika.mime.MediaType)

Example 5 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class ExternalParser method parse.

/**
 * Runs the configured external command against the given stream and feeds its results into
 * the XHTML handler and metadata.
 *
 * <p>If a command token contains {@code INPUT_FILE_TOKEN}, it is replaced with the path of the
 * stream's backing file and the process's stdin is closed; otherwise the stream is sent to the
 * process on stdin. If a token contains {@code OUTPUT_FILE_TOKEN}, it is replaced with the
 * path of a new temporary file, which is read as the output after the process has finished;
 * otherwise stdout is the output. When metadata patterns are configured, stderr (and stdout,
 * if it is not the output) are scanned for metadata; otherwise those streams are drained and
 * ignored.
 *
 * @param stream   document to process
 * @param xhtml    handler receiving the extracted output
 * @param metadata metadata object populated from the process's streams
 * @param tmp      source of the temporary output file, when one is needed
 * @throws IOException   if the process cannot be started or one of its streams cannot be read
 * @throws SAXException  if the handler rejects the output
 * @throws TikaException if an invoked helper reports a processing failure
 */
private void parse(TikaInputStream stream, XHTMLContentHandler xhtml, Metadata metadata, TemporaryResources tmp) throws IOException, SAXException, TikaException {
    boolean inputToStdIn = true;
    boolean outputFromStdOut = true;
    boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty());
    File output = null;
    // Build our command
    String[] cmd;
    if (command.length == 1) {
        cmd = command[0].split(" ");
    } else {
        cmd = new String[command.length];
        System.arraycopy(command, 0, cmd, 0, command.length);
    }
    for (int i = 0; i < cmd.length; i++) {
        if (cmd[i].contains(INPUT_FILE_TOKEN)) {
            cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, stream.getFile().getPath());
            inputToStdIn = false;
        }
        if (cmd[i].contains(OUTPUT_FILE_TOKEN)) {
            output = tmp.createTemporaryFile();
            outputFromStdOut = false;
            cmd[i] = cmd[i].replace(OUTPUT_FILE_TOKEN, output.getPath());
        }
    }
    // Execute. A failure to start the process propagates to the caller instead of being
    // printed and ignored, which previously left `process` null and caused a
    // NullPointerException further down.
    final Process process;
    if (cmd.length == 1) {
        process = Runtime.getRuntime().exec(cmd[0]);
    } else {
        process = Runtime.getRuntime().exec(cmd);
    }
    try {
        if (inputToStdIn) {
            sendInput(process, stream);
        } else {
            // The process reads the input file itself; close stdin so it does not wait on it.
            process.getOutputStream().close();
        }
        InputStream out = process.getInputStream();
        InputStream err = process.getErrorStream();
        if (hasPatterns) {
            extractMetadata(err, metadata);
            if (outputFromStdOut) {
                extractOutput(out, xhtml);
            } else {
                extractMetadata(out, metadata);
            }
        } else {
            ignoreStream(err);
            if (outputFromStdOut) {
                extractOutput(out, xhtml);
            } else {
                ignoreStream(out);
            }
        }
    } finally {
        try {
            process.waitFor();
        } catch (InterruptedException interrupted) {
            // Restore the interrupt flag so callers can still observe the interruption.
            Thread.currentThread().interrupt();
        }
    }
    // Grab the output if we haven't already
    if (!outputFromStdOut) {
        // Close the file stream here as well; this is harmless if extractOutput already
        // closes it (not visible from this method).
        try (InputStream fileOutput = new FileInputStream(output)) {
            extractOutput(fileOutput, xhtml);
        }
    }
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) File(java.io.File) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException) FileInputStream(java.io.FileInputStream)

Aggregations

TikaInputStream (org.apache.tika.io.TikaInputStream): 100; Metadata (org.apache.tika.metadata.Metadata): 40; TemporaryResources (org.apache.tika.io.TemporaryResources): 28; IOException (java.io.IOException): 27; TikaException (org.apache.tika.exception.TikaException): 24; XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 23; Test (org.junit.Test): 20; InputStream (java.io.InputStream): 19; File (java.io.File): 15; BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 15; ContentHandler (org.xml.sax.ContentHandler): 14; TikaTest (org.apache.tika.TikaTest): 13; MediaType (org.apache.tika.mime.MediaType): 13; SAXException (org.xml.sax.SAXException): 13; ParseContext (org.apache.tika.parser.ParseContext): 12; ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor): 8; CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream): 6; NPOIFSFileSystem (org.apache.poi.poifs.filesystem.NPOIFSFileSystem): 6; EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException): 6; AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 6