Search in sources :

Example 71 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class RTFParserTest method testBinControlWord.

// TIKA-782
/**
 * Extracts the embedded resources of testBinControlWord.rtf through a
 * {@link ParserContainerExtractor} and checks that exactly one resource of
 * ten bytes was copied out, spot-checking two of its byte values.
 */
@Test
public void testBinControlWord() throws Exception {
    ByteCopyingHandler copier = new ByteCopyingHandler();
    try (TikaInputStream input = TikaInputStream.get(getResourceAsStream("/test-documents/testBinControlWord.rtf"))) {
        ContainerExtractor extractor = new ParserContainerExtractor();
        assertEquals(true, extractor.isSupported(input));
        // The extractor is also handed in as the recursion extractor
        extractor.extract(input, extractor, copier);
    }
    // Exactly one embedded resource is expected
    assertEquals(1, copier.bytes.size());
    byte[] embedded = copier.bytes.get(0);
    assertEquals(10, embedded.length);
    // 125 is the ASCII code for '}'
    assertEquals(125, (int) embedded[4]);
    // make sure that at least the last value is correct
    assertEquals(-1, (int) embedded[9]);
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) ContainerExtractor(org.apache.tika.extractor.ContainerExtractor) ParserContainerExtractor(org.apache.tika.extractor.ParserContainerExtractor) ParserContainerExtractor(org.apache.tika.extractor.ParserContainerExtractor) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 72 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class ExternalEmbedder method embed.

/**
 * Executes the configured external command against the given document
 * stream and copies the command's output into the given output stream.
 * <p>
 * The document is handed to the command as a file when a command segment
 * contains {@link ExternalParser#INPUT_FILE_TOKEN}, otherwise it is sent to
 * the command's standard input. The result is read from a temporary file
 * when a segment contains {@link ExternalParser#OUTPUT_FILE_TOKEN},
 * otherwise from the command's standard output.
 * <p>
 * Metadata arguments are only added to the command if
 * {@link #setMetadataCommandArguments(Map)} has been called to set arguments.
 * They replace the metadata-arguments token, are substituted in serialized
 * form for the serialized token, or are appended to the end of the command
 * when neither token is present.
 *
 * @param metadata     the metadata from which command arguments are built
 * @param inputStream  the document to pass to the external command
 * @param outputStream receives the command's output; it is closed quietly
 *                     before this method returns
 * @param context      the parse context (not read by this method)
 * @throws IOException   if starting the command or copying its output fails
 * @throws TikaException if the command finishes with a non-zero exit value
 */
public void embed(final Metadata metadata, final InputStream inputStream, final OutputStream outputStream, final ParseContext context) throws IOException, TikaException {
    boolean inputToStdIn = true;
    boolean outputFromStdOut = true;
    boolean hasMetadataCommandArguments = (metadataCommandArguments != null && !metadataCommandArguments.isEmpty());
    boolean serializeMetadataCommandArgumentsToken = false;
    boolean replacedMetadataCommandArgumentsToken = false;
    // Remembers an interrupt received while waiting for the command, so the
    // thread's interrupt status can be restored once cleanup is done
    boolean interrupted = false;
    TikaInputStream tikaInputStream = TikaInputStream.get(inputStream);
    File tempOutputFile = null;
    List<String> commandMetadataSegments = null;
    if (hasMetadataCommandArguments) {
        commandMetadataSegments = getCommandMetadataSegments(metadata);
    }
    // Build our command
    List<String> origCmd = Arrays.asList(command);
    List<String> cmd = new ArrayList<String>();
    for (String commandSegment : origCmd) {
        if (commandSegment.indexOf(ExternalParser.INPUT_FILE_TOKEN) != -1) {
            commandSegment = commandSegment.replace(ExternalParser.INPUT_FILE_TOKEN, tikaInputStream.getFile().toString());
            inputToStdIn = false;
        }
        if (commandSegment.indexOf(ExternalParser.OUTPUT_FILE_TOKEN) != -1) {
            tempOutputFile = tmp.createTemporaryFile();
            commandSegment = commandSegment.replace(ExternalParser.OUTPUT_FILE_TOKEN, tempOutputFile.toString());
            outputFromStdOut = false;
        }
        if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
            serializeMetadataCommandArgumentsToken = true;
        }
        if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_TOKEN) != -1) {
            // The token segment itself is dropped; the metadata arguments take its place
            if (hasMetadataCommandArguments) {
                cmd.addAll(commandMetadataSegments);
            }
            replacedMetadataCommandArgumentsToken = true;
        } else {
            cmd.add(commandSegment);
        }
    }
    if (hasMetadataCommandArguments) {
        if (serializeMetadataCommandArgumentsToken) {
            // Find all metadata tokens and replace with encapsulated metadata
            for (int i = 0; i < cmd.size(); i++) {
                String commandSegment = cmd.get(i);
                if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
                    cmd.set(i, commandSegment.replace(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN, serializeMetadata(commandMetadataSegments)));
                }
            }
        } else if (!replacedMetadataCommandArgumentsToken) {
            // Tack metadata onto the end of the cmd as arguments
            cmd.addAll(commandMetadataSegments);
        }
    }
    // Execute
    Process process;
    if (cmd.size() == 1) {
        // A single segment is passed as one command line, which Runtime.exec(String)
        // splits into tokens itself; do not replace with the array form
        process = Runtime.getRuntime().exec(cmd.get(0));
    } else {
        process = Runtime.getRuntime().exec(cmd.toArray(new String[0]));
    }
    ByteArrayOutputStream stdErrOutputStream = new ByteArrayOutputStream();
    try {
        sendStdErrToOutputStream(process, stdErrOutputStream);
        if (inputToStdIn) {
            sendInputStreamToStdIn(inputStream, process);
        } else {
            // We're not writing to std in this case so close
            process.getOutputStream().close();
        }
        if (outputFromStdOut) {
            sendStdOutToOutputStream(process, outputStream);
        } else {
            // NOTE(review): this disposes the temporary resources that created
            // tempOutputFile before the command has finished; presumably the
            // command is expected to create the file itself - confirm
            tmp.dispose();
            try {
                process.waitFor();
            } catch (InterruptedException e) {
                interrupted = true;
            }
            // The command is finished, read the output file into the given output stream.
            // try-with-resources releases the file handle even if the copy fails.
            try (InputStream tempOutputFileInputStream = TikaInputStream.get(tempOutputFile)) {
                IOUtils.copy(tempOutputFileInputStream, outputStream);
            }
        }
    } finally {
        if (outputFromStdOut) {
            try {
                process.waitFor();
            } catch (InterruptedException e) {
                interrupted = true;
            }
        } else {
            try {
                // Clean up temp output files
                tempOutputFile.delete();
            } catch (Exception ignored) {
                // Best-effort cleanup; a failure to delete must not mask the real outcome
            }
        }
        if (!inputToStdIn) {
            // Close input file (and delete if created by up TemporaryResources.createTemporaryFile) 
            IOUtils.closeQuietly(tikaInputStream);
        }
        IOUtils.closeQuietly(outputStream);
        IOUtils.closeQuietly(stdErrOutputStream);
        if (interrupted) {
            // Restore the interrupt status so callers can still observe the interruption
            Thread.currentThread().interrupt();
        }
        if (process.exitValue() != 0) {
            throw new TikaException("There was an error executing the command line" + "\nExecutable Command:\n\n" + cmd + "\nExecutable Error:\n\n" + stdErrOutputStream.toString(UTF_8.name()));
        }
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) ArrayList(java.util.ArrayList) TikaInputStream(org.apache.tika.io.TikaInputStream) ByteArrayOutputStream(java.io.ByteArrayOutputStream) IOException(java.io.IOException) TikaException(org.apache.tika.exception.TikaException) File(java.io.File)

Example 73 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class ParserDecorator method withFallbacks.

/**
 * Combines the given parsers into one virtual parser. On each parse call
 * the parsers are tried in iteration order, and the call returns as soon
 * as one of them completes without throwing.
 * The decorator wraps the first parser of the collection, or
 * {@link EmptyParser#INSTANCE} when the collection is empty.
 * TODO Is this the right name?
 * TODO Is this the right place to put this? Should it be in CompositeParser? Elsewhere?
 * TODO Should we reset the Metadata if we try another parser?
 * TODO Should we reset the ContentHandler if we try another parser?
 * TODO Should we log/report failures anywhere?
 *
 * @param parsers the candidate parsers, in order of preference
 * @param types   the media types the returned parser reports as supported
 * @return a parser that falls back through the given parsers
 * @deprecated Do not use until the TODOs are resolved, see TIKA-1509
 */
public static final Parser withFallbacks(final Collection<? extends Parser> parsers, final Set<MediaType> types) {
    final Parser primary = parsers.isEmpty() ? EmptyParser.INSTANCE : parsers.iterator().next();
    return new ParserDecorator(primary) {

        private static final long serialVersionUID = 1625187131782069683L;

        @Override
        public Set<MediaType> getSupportedTypes(ParseContext context) {
            return types;
        }

        @Override
        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
            // A TikaInputStream is required so the input can be rewound after a failure.
            // Temporary resources are only created (and later disposed) here when the
            // caller did not already supply a TikaInputStream.
            TemporaryResources ownedResources = null;
            if (!TikaInputStream.isTikaInputStream(stream)) {
                ownedResources = new TemporaryResources();
            }
            try {
                TikaInputStream rewindable = TikaInputStream.get(stream, ownedResources);
                rewindable.getFile();
                // Give every parser a turn, stopping at the first success
                for (Parser candidate : parsers) {
                    rewindable.mark(-1);
                    try {
                        candidate.parse(rewindable, handler, metadata, context);
                        return;
                    } catch (Exception e) {
                        // TODO How to log / record this failure?
                    }
                    // Rewind so the next parser, if any, starts from the marked position
                    rewindable.reset();
                }
            } finally {
                if (ownedResources != null) {
                    ownedResources.dispose();
                }
            }
        }

        @Override
        public String getDecorationName() {
            return "With Fallback";
        }
    };
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) TemporaryResources(org.apache.tika.io.TemporaryResources) MediaType(org.apache.tika.mime.MediaType) TikaInputStream(org.apache.tika.io.TikaInputStream) ContentHandler(org.xml.sax.ContentHandler) TikaException(org.apache.tika.exception.TikaException) SAXException(org.xml.sax.SAXException) IOException(java.io.IOException)

Example 74 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class CompositeParser method parse.

/**
 * Hands the document to the component parser selected for the given
 * metadata and context, recording that parser's class name under the
 * {@code X-Parsed-By} metadata key.
 * <p>
 * {@link RuntimeException}s, and any {@link IOException}s or
 * {@link SAXException}s that did not originate from the given input stream
 * or content handler, are rethrown wrapped in a {@link TikaException} to
 * better honor the {@link Parser} contract.
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    Parser delegate = getParser(metadata, context);
    TemporaryResources resources = new TemporaryResources();
    try {
        TikaInputStream input = TikaInputStream.get(stream, resources);
        TaggedContentHandler output = null;
        if (handler != null) {
            output = new TaggedContentHandler(handler);
        }
        // Report the wrapped parser's class rather than the decorator's
        Parser reported = delegate;
        if (delegate instanceof ParserDecorator) {
            reported = ((ParserDecorator) delegate).getWrappedParser();
        }
        metadata.add("X-Parsed-By", reported.getClass().getName());
        try {
            delegate.parse(input, output, metadata, context);
        } catch (RuntimeException e) {
            throw new TikaException("Unexpected RuntimeException from " + delegate, e);
        } catch (IOException e) {
            // Rethrown unchanged when the tagged input stream was the cause
            input.throwIfCauseOf(e);
            throw new TikaException("TIKA-198: Illegal IOException from " + delegate, e);
        } catch (SAXException e) {
            // Rethrown unchanged when the tagged content handler was the cause
            if (output != null) {
                output.throwIfCauseOf(e);
            }
            throw new TikaException("TIKA-237: Illegal SAXException from " + delegate, e);
        }
    } finally {
        resources.dispose();
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream) TaggedContentHandler(org.apache.tika.sax.TaggedContentHandler) IOException(java.io.IOException) SAXException(org.xml.sax.SAXException)

Example 75 with TikaInputStream

use of org.apache.tika.io.TikaInputStream in project tika by apache.

the class NetworkParser method parse.

/**
 * Wraps the given stream in a {@link TikaInputStream} tied to a fresh
 * {@link TemporaryResources} and passes it on for parsing. The temporary
 * resources are disposed whether or not parsing succeeds.
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
    final TemporaryResources resources = new TemporaryResources();
    try {
        // NOTE(review): the static type must stay TikaInputStream; presumably this
        // call binds to an overload declared for that type - confirm
        final TikaInputStream wrapped = TikaInputStream.get(stream, resources);
        parse(wrapped, handler, metadata, context);
    } finally {
        resources.dispose();
    }
}
Also used : TemporaryResources(org.apache.tika.io.TemporaryResources) TikaInputStream(org.apache.tika.io.TikaInputStream)

Aggregations

TikaInputStream (org.apache.tika.io.TikaInputStream)100 Metadata (org.apache.tika.metadata.Metadata)40 TemporaryResources (org.apache.tika.io.TemporaryResources)28 IOException (java.io.IOException)27 TikaException (org.apache.tika.exception.TikaException)24 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)23 Test (org.junit.Test)20 InputStream (java.io.InputStream)19 File (java.io.File)15 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)15 ContentHandler (org.xml.sax.ContentHandler)14 TikaTest (org.apache.tika.TikaTest)13 MediaType (org.apache.tika.mime.MediaType)13 SAXException (org.xml.sax.SAXException)13 ParseContext (org.apache.tika.parser.ParseContext)12 ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor)8 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)6 NPOIFSFileSystem (org.apache.poi.poifs.filesystem.NPOIFSFileSystem)6 EncryptedDocumentException (org.apache.tika.exception.EncryptedDocumentException)6 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)6