Search in sources :

Example 96 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class Seven7ParserTest method testPasswordProtected.

/**
 * Exercises parsing of a password protected 7z archive in three situations:
 * without any password, with a wrong password, and with the correct password
 * (the last one depends on whether {@code isStrongCryptoAvailable()} reports true).
 */
@Test
public void testPasswordProtected() throws Exception {
    final String protectedArchive = "/test-documents/test7Z_protected_passTika.7z";
    Parser autoParser = new AutoDetectParser();
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata meta = new Metadata();

    // Phase 1: no password at all -> EncryptedDocumentException is expected
    boolean rejectedWithoutPassword = false;
    try (InputStream in = Seven7ParserTest.class.getResourceAsStream(protectedArchive)) {
        autoParser.parse(in, bodyHandler, meta, recursingContext);
        fail("Shouldn't be able to read a password protected 7z without the password");
    } catch (EncryptedDocumentException expected) {
        rejectedWithoutPassword = true;
    }
    assertTrue("test no password", rejectedWithoutPassword);

    // Phase 2: a wrong password is expected to surface as a TikaException.
    //   - with JCE installed the cause is
    //     org.tukaani.xz.CorruptedInputException: Compressed data is corrupt
    //   - without JCE the message includes
    //     "(do you have the JCE  Unlimited Strength Jurisdiction Policy Files installed?")
    recursingContext.set(PasswordProvider.class, new PasswordProvider() {

        @Override
        public String getPassword(Metadata md) {
            return "wrong";
        }
    });
    bodyHandler = new BodyContentHandler();
    boolean rejectedWrongPassword = false;
    try (InputStream in = Seven7ParserTest.class.getResourceAsStream(protectedArchive)) {
        autoParser.parse(in, bodyHandler, meta, recursingContext);
        fail("Shouldn't be able to read a password protected 7z with wrong password");
    } catch (TikaException expected) {
        rejectedWrongPassword = true;
    }
    assertTrue("TikaException for bad password", rejectedWrongPassword);
    // Nothing should have been written to the handler
    assertEquals("", bodyHandler.toString());

    // Phase 3: the right password. The outcome depends on strong crypto support.
    final boolean strongCrypto = isStrongCryptoAvailable();
    recursingContext.set(PasswordProvider.class, new PasswordProvider() {

        @Override
        public String getPassword(Metadata md) {
            return "Tika";
        }
    });
    bodyHandler = new BodyContentHandler();
    if (!strongCrypto) {
        // Without JCE the parse is expected to fail with a TikaException
        boolean failedWithoutJce = false;
        try (InputStream in = Seven7ParserTest.class.getResourceAsStream(protectedArchive)) {
            autoParser.parse(in, bodyHandler, meta, recursingContext);
        } catch (TikaException expected) {
            failedWithoutJce = true;
        }
        assertTrue("IOException because JCE was not installed", failedWithoutJce);
        return;
    }

    try (InputStream in = Seven7ParserTest.class.getResourceAsStream(protectedArchive)) {
        autoParser.parse(in, bodyHandler, meta, recursingContext);
    }
    assertEquals(TYPE_7ZIP.toString(), meta.get(Metadata.CONTENT_TYPE));
    String extracted = bodyHandler.toString();
    // The name of the embedded file must be present
    assertContains("text.txt", extracted);
    // ... as well as the text stored in that embedded file
    assertContains("TEST DATA FOR TIKA.", extracted);
    assertContains("This is text inside an encrypted 7zip (7z) file.", extracted);
    assertContains("It should be processed by Tika just fine!", extracted);
    assertContains("TIKA-1521", extracted);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) EncryptedDocumentException(org.apache.tika.exception.EncryptedDocumentException) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) PasswordProvider(org.apache.tika.parser.PasswordProvider) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test)

Example 97 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class PDFParserTest method testSkipBadPage.

/**
 * Parses a PDF containing a bad page twice: once with the default
 * configuration, and once with {@code setCatchIntermediateIOExceptions(false)}.
 * Both runs must end in a {@link TikaException}; they differ in the recorded
 * exception warnings and in whether the text "1309.61" was extracted.
 */
@Test
public void testSkipBadPage() throws Exception {
    // The test file comes from govdocs1.
    // The TikaTest shortcuts cannot be used here because of the exception.
    final String badPagePdf = "/test-documents/testPDF_bad_page_303226.pdf";
    Parser autoParser = new AutoDetectParser();

    // --- first run: default behaviour ---
    ContentHandler lenientHandler = new BodyContentHandler(-1);
    Metadata lenientMetadata = new Metadata();
    ParseContext parseContext = new ParseContext();
    boolean lenientRunThrew = false;
    try (InputStream pdf = getResourceAsStream(badPagePdf)) {
        autoParser.parse(pdf, lenientHandler, lenientMetadata, parseContext);
    } catch (TikaException expected) {
        lenientRunThrew = true;
    }
    String lenientText = lenientHandler.toString();
    assertTrue("Should have thrown exception", lenientRunThrew);
    assertEquals(1, lenientMetadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
    assertContains("Unknown dir", lenientMetadata.get(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING));
    assertContains("1309.61", lenientText);

    // --- second run: throw the exception immediately ---
    PDFParserConfig strictConfig = new PDFParserConfig();
    strictConfig.setCatchIntermediateIOExceptions(false);
    parseContext.set(PDFParserConfig.class, strictConfig);
    ContentHandler strictHandler = new BodyContentHandler(-1);
    Metadata strictMetadata = new Metadata();
    boolean strictRunThrew = false;
    try (InputStream pdf = getResourceAsStream(badPagePdf)) {
        autoParser.parse(pdf, strictHandler, strictMetadata, parseContext);
    } catch (TikaException expected) {
        strictRunThrew = true;
    }
    String strictText = strictHandler.toString();
    assertTrue("Should have thrown exception", strictRunThrew);
    assertEquals(0, strictMetadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING).length);
    assertNotContained("1309.61", strictText);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaException(org.apache.tika.exception.TikaException) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TesseractOCRParser(org.apache.tika.parser.ocr.TesseractOCRParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 98 with TikaException

use of org.apache.tika.exception.TikaException in project stanbol by apache.

the class TikaEngine method computeEnhancements.

/**
 * Parses the content of the given item with the configured parser and adds
 * the results to the item.
 * <p>
 * Nothing is done when the media type cannot be determined, when the base
 * media type is <code>text/plain</code>, or when the parser does not report
 * the base media type as supported. Otherwise the content is written as
 * plain text to a new content sink and, unless the source already is XHTML,
 * also as XHTML to a second sink. The resulting blobs are added as parts of
 * the content item and the extracted metadata are applied to the item's
 * metadata graph while holding its write lock.
 *
 * @param ci the content item to process
 * @throws EngineException if a content sink cannot be created, if the
 *         encoding is reported as unsupported, or if parsing fails with an
 *         {@link IOException}, {@link SAXException} or {@link TikaException}
 */
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
    MediaTypeAndStream mtas = extractMediaType(ci);
    if (mtas.mediaType == null) {
        //unable to parse and detect content type
        return;
    }
    MediaType plainMediaType = mtas.mediaType.getBaseType();
    if (plainMediaType.equals(MediaType.TEXT_PLAIN)) {
        //plain text does not need to be processed
        return;
    }
    final ParseContext context = new ParseContext();
    context.set(Parser.class, parser);
    Set<MediaType> supported = parser.getSupportedTypes(context);
    if (supported.contains(plainMediaType)) {
        //reuse the stream returned by extractMediaType(..) if there is one
        final InputStream in;
        if (mtas.in == null) {
            in = ci.getStream();
        } else {
            in = mtas.in;
        }
        final Metadata metadata = new Metadata();
        //set the already parsed contentType
        metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
        //also explicitly set the charset as contentEncoding
        String charset = mtas.mediaType.getParameters().get("charset");
        if (charset != null) {
            metadata.set(Metadata.CONTENT_ENCODING, charset);
        }
        ContentSink plainTextSink;
        try {
            plainTextSink = ciFactory.createContentSink(TEXT_PLAIN + "; charset=" + UTF8.name());
        } catch (IOException e) {
            //close the input stream
            IOUtils.closeQuietly(in);
            throw new EngineException("Error while initialising Blob for writing the text/plain version of the parsed content", e);
        }
        final Writer plainTextWriter = new OutputStreamWriter(plainTextSink.getOutputStream(), UTF8);
        //BodyContentHandler: only the body; PlainTextHandler: skip ignoreable
        final ContentHandler textHandler = new BodyContentHandler(
                new PlainTextHandler(plainTextWriter, false, skipLinebreaks));
        final ToXMLContentHandler xhtmlHandler;
        final ContentHandler mainHandler;
        ContentSink xhtmlSink = null;
        try {
            if (!plainMediaType.equals(XHTML)) {
                //do not parse XHTML from XHTML
                try {
                    xhtmlSink = ciFactory.createContentSink(XHTML + "; charset=" + UTF8.name());
                } catch (IOException e) {
                    throw new EngineException("Error while initialising Blob for writing the application/xhtml+xml version of the parsed content", e);
                }
                try {
                    xhtmlHandler = new ToXMLContentHandler(xhtmlSink.getOutputStream(), UTF8.name());
                } catch (UnsupportedEncodingException e) {
                    throw new EngineException("This system does not support the encoding " + UTF8, e);
                }
                mainHandler = new MultiHandler(textHandler, xhtmlHandler);
            } else {
                mainHandler = textHandler;
                xhtmlHandler = null;
                xhtmlSink = null;
            }
            try {
                AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {

                    @Override
                    public Object run() throws IOException, SAXException, TikaException {
                        /* 
                         * We need to replace the context Classloader with the Bundle ClassLoader
                         * to ensure that Singleton instances of XML frameworks (such as node4j) 
                         * do not leak into the OSGI environment.
                         * 
                         * Most Java XML libs prefer to load implementations by using the 
                         * {@link Thread#getContextClassLoader()}. However OSGI has no control over
                         * this {@link ClassLoader}. Because of that there can be situations where
                         * Interfaces are loaded via the Bundle Classloader and the implementations
                         * are taken from the context Classloader. What can cause 
                         * {@link ClassCastException}, {@link ExceptionInInitializerError}s, ...
                         * 
                         * Setting the context Classloader to the Bundle classloader helps to avoid
                         * those situations.
                         */
                        ClassLoader contextClassLoader = updateContextClassLoader();
                        try {
                            parser.parse(in, mainHandler, metadata, context);
                        } finally {
                            //reset the previous context ClassLoader
                            Thread.currentThread().setContextClassLoader(contextClassLoader);
                        }
                        return null;
                    }
                });
            } catch (PrivilegedActionException pae) {
                Exception e = pae.getException();
                if (e instanceof IOException || e instanceof SAXException || e instanceof TikaException) {
                    throw new EngineException("Unable to convert ContentItem " + ci.getUri() + " with mimeType '" + ci.getMimeType() + "' to " + "plain text!", e);
                } else {
                    //runtime exception
                    throw RuntimeException.class.cast(e);
                }
            }
        } finally {
            //ensure that the input stream and the writers are closed
            IOUtils.closeQuietly(in);
            IOUtils.closeQuietly(plainTextWriter);
            if (xhtmlSink != null) {
                IOUtils.closeQuietly(xhtmlSink.getOutputStream());
            }
        }
        //both parts share the same random suffix in their URIs
        String random = randomUUID().toString();
        IRI textBlobUri = new IRI("urn:tika:text:" + random);
        ci.addPart(textBlobUri, plainTextSink.getBlob());
        if (xhtmlHandler != null) {
            IRI xhtmlBlobUri = new IRI("urn:tika:xhtml:" + random);
            ci.addPart(xhtmlBlobUri, xhtmlSink.getBlob());
        }
        //log the extracted metadata
        if (log.isInfoEnabled()) {
            for (String name : metadata.names()) {
                log.info("{}: {}", name, Arrays.toString(metadata.getValues(name)));
            }
        }
        //add the extracted metadata to the graph of the content item
        ci.getLock().writeLock().lock();
        try {
            Graph graph = ci.getMetadata();
            IRI id = ci.getUri();
            Set<String> mapped = ontologyMappings.apply(graph, id, metadata);
            if (includeUnmappedProperties) {
                Set<String> unmapped = new HashSet<String>(Arrays.asList(metadata.names()));
                unmapped.removeAll(mapped);
                for (String name : unmapped) {
                    //names without a ':' are only added if all unmapped
                    //properties are requested
                    if (name.indexOf(':') >= 0 || includeAllUnmappedProperties) {
                        IRI prop = new IRI(new StringBuilder(TIKA_URN_PREFIX).append(name).toString());
                        for (String value : metadata.getValues(name)) {
                            //TODO: without the Property for the name we have no datatype
                            //      information ... so we add PlainLiterals for now
                            graph.add(new TripleImpl(id, prop, new PlainLiteralImpl(value)));
                        }
                    }
                }
            }
        } finally {
            ci.getLock().writeLock().unlock();
        }
    }
    //else not supported format
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) IRI(org.apache.clerezza.commons.rdf.IRI) Metadata(org.apache.tika.metadata.Metadata) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) SAXException(org.xml.sax.SAXException) MediaType(org.apache.tika.mime.MediaType) TripleImpl(org.apache.clerezza.commons.rdf.impl.utils.TripleImpl) HashSet(java.util.HashSet) MultiHandler(org.apache.stanbol.enhancer.engines.tika.handler.MultiHandler) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) TikaException(org.apache.tika.exception.TikaException) PrivilegedActionException(java.security.PrivilegedActionException) PlainLiteralImpl(org.apache.clerezza.commons.rdf.impl.utils.PlainLiteralImpl) BufferedInputStream(java.io.BufferedInputStream) InputStream(java.io.InputStream) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) EngineException(org.apache.stanbol.enhancer.servicesapi.EngineException) ConfigurationException(org.osgi.service.cm.ConfigurationException) SAXException(org.xml.sax.SAXException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) TikaException(org.apache.tika.exception.TikaException) PrivilegedActionException(java.security.PrivilegedActionException) IOException(java.io.IOException) Graph(org.apache.clerezza.commons.rdf.Graph) PlainTextHandler(org.apache.stanbol.enhancer.engines.tika.handler.PlainTextHandler) ParseContext(org.apache.tika.parser.ParseContext) OutputStreamWriter(java.io.OutputStreamWriter) ContentSink(org.apache.stanbol.enhancer.servicesapi.ContentSink) Writer(java.io.Writer) OutputStreamWriter(java.io.OutputStreamWriter)

Example 99 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class ExternalEmbedder method embed.

/**
 * Executes the configured external command for the given document stream
 * and writes the command's output to the given output stream.
 * <p>
 * The input is handed to the command on standard input unless a command
 * segment contains {@code ExternalParser.INPUT_FILE_TOKEN}, in which case
 * the token is replaced with the path of a file holding the input. The
 * output is read from standard output unless a segment contains
 * {@code ExternalParser.OUTPUT_FILE_TOKEN}, in which case the token is
 * replaced with a temporary file whose content is copied to
 * {@code outputStream} once the command has finished.
 * <p>
 * Metadata arguments are only passed to the command if
 * {@link #setMetadataCommandArguments(Map)} has been called with a
 * non-empty map. The given output stream is closed by this method.
 *
 * @param metadata     the metadata to hand to the command
 * @param inputStream  the document to process
 * @param outputStream receives the output of the command; closed on return
 * @param context      the parse context (not used by this method)
 * @throws IOException   if starting the command or copying data fails
 * @throws TikaException if the command exits with a non-zero exit value;
 *                       the message contains the command and its stderr output
 */
public void embed(final Metadata metadata, final InputStream inputStream, final OutputStream outputStream, final ParseContext context) throws IOException, TikaException {
    boolean inputToStdIn = true;
    boolean outputFromStdOut = true;
    boolean hasMetadataCommandArguments = (metadataCommandArguments != null && !metadataCommandArguments.isEmpty());
    boolean serializeMetadataCommandArgumentsToken = false;
    boolean replacedMetadataCommandArgumentsToken = false;
    TikaInputStream tikaInputStream = TikaInputStream.get(inputStream);
    File tempOutputFile = null;
    List<String> commandMetadataSegments = null;
    if (hasMetadataCommandArguments) {
        commandMetadataSegments = getCommandMetadataSegments(metadata);
    }
    // Build our command
    List<String> origCmd = Arrays.asList(command);
    List<String> cmd = new ArrayList<String>();
    for (String commandSegment : origCmd) {
        if (commandSegment.indexOf(ExternalParser.INPUT_FILE_TOKEN) != -1) {
            commandSegment = commandSegment.replace(ExternalParser.INPUT_FILE_TOKEN, tikaInputStream.getFile().toString());
            inputToStdIn = false;
        }
        if (commandSegment.indexOf(ExternalParser.OUTPUT_FILE_TOKEN) != -1) {
            tempOutputFile = tmp.createTemporaryFile();
            commandSegment = commandSegment.replace(ExternalParser.OUTPUT_FILE_TOKEN, tempOutputFile.toString());
            outputFromStdOut = false;
        }
        if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
            serializeMetadataCommandArgumentsToken = true;
        }
        if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_TOKEN) != -1) {
            // The segment holding the token is dropped and replaced by the
            // metadata segments (if there are any)
            if (hasMetadataCommandArguments) {
                cmd.addAll(commandMetadataSegments);
            }
            replacedMetadataCommandArgumentsToken = true;
        } else {
            cmd.add(commandSegment);
        }
    }
    if (hasMetadataCommandArguments) {
        if (serializeMetadataCommandArgumentsToken) {
            // Find all metadata tokens and replace with encapsulated metadata
            for (int i = 0; i < cmd.size(); i++) {
                String commandSegment = cmd.get(i);
                if (commandSegment.indexOf(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN) != -1) {
                    cmd.set(i, commandSegment.replace(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN, serializeMetadata(commandMetadataSegments)));
                }
            }
        } else if (!replacedMetadataCommandArgumentsToken) {
            // Tack metadata onto the end of the cmd as arguments
            cmd.addAll(commandMetadataSegments);
        }
    }
    // Execute. A single segment is passed as a command string, several
    // segments are passed as an argument array.
    Process process;
    if (cmd.size() == 1) {
        process = Runtime.getRuntime().exec(cmd.get(0));
    } else {
        process = Runtime.getRuntime().exec(cmd.toArray(new String[0]));
    }
    ByteArrayOutputStream stdErrOutputStream = new ByteArrayOutputStream();
    // Remember an interrupt received while waiting so that the interrupt
    // status can be restored once the clean-up is done.
    boolean interrupted = false;
    try {
        sendStdErrToOutputStream(process, stdErrOutputStream);
        if (inputToStdIn) {
            sendInputStreamToStdIn(inputStream, process);
        } else {
            // We're not writing to std in this case so close
            process.getOutputStream().close();
        }
        if (outputFromStdOut) {
            sendStdOutToOutputStream(process, outputStream);
        } else {
            tmp.dispose();
            try {
                process.waitFor();
            } catch (InterruptedException e) {
                interrupted = true;
            }
            // The command is finished, read the output file into the given output stream
            try (InputStream tempOutputFileInputStream = TikaInputStream.get(tempOutputFile)) {
                IOUtils.copy(tempOutputFileInputStream, outputStream);
            }
        }
    } finally {
        if (outputFromStdOut) {
            try {
                process.waitFor();
            } catch (InterruptedException e) {
                interrupted = true;
            }
        } else {
            try {
                // Clean up temp output files
                tempOutputFile.delete();
            } catch (Exception ignored) {
                // best effort: a failed delete must not hide the actual result
            }
        }
        if (!inputToStdIn) {
            // Close input file (and delete if created by up TemporaryResources.createTemporaryFile) 
            IOUtils.closeQuietly(tikaInputStream);
        }
        IOUtils.closeQuietly(outputStream);
        IOUtils.closeQuietly(stdErrOutputStream);
        if (interrupted) {
            // do not hide the interruption from the caller
            Thread.currentThread().interrupt();
        }
        if (process.exitValue() != 0) {
            throw new TikaException("There was an error executing the command line" + "\nExecutable Command:\n\n" + cmd + "\nExecutable Error:\n\n" + stdErrOutputStream.toString(UTF_8.name()));
        }
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) ArrayList(java.util.ArrayList) TikaInputStream(org.apache.tika.io.TikaInputStream) ByteArrayOutputStream(java.io.ByteArrayOutputStream) IOException(java.io.IOException) TikaException(org.apache.tika.exception.TikaException) File(java.io.File)

Example 100 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

the class ParserContainerExtractor method extract.

/**
 * Parses the given stream with a {@link ParseContext} whose {@link Parser}
 * entry is a {@code RecursiveParser} created from the given extractor and
 * handler. The SAX events and the metadata of the parse itself are
 * discarded.
 *
 * @param stream           the stream to parse
 * @param recurseExtractor passed on to the {@code RecursiveParser}
 * @param handler          passed on to the {@code RecursiveParser}
 * @throws IOException   if the parser reports an I/O problem
 * @throws TikaException if parsing fails; a {@link SAXException} raised
 *                       during the parse is wrapped into this type
 */
public void extract(TikaInputStream stream, ContainerExtractor recurseExtractor, EmbeddedResourceHandler handler) throws IOException, TikaException {
    final ParseContext parseContext = new ParseContext();
    final Parser embeddedParser = new RecursiveParser(recurseExtractor, handler);
    parseContext.set(Parser.class, embeddedParser);
    try {
        parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext);
    } catch (SAXException saxFailure) {
        throw new TikaException("Unexpected SAX exception", saxFailure);
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) DefaultHandler(org.xml.sax.helpers.DefaultHandler) SAXException(org.xml.sax.SAXException)

Aggregations

TikaException (org.apache.tika.exception.TikaException)142 IOException (java.io.IOException)54 SAXException (org.xml.sax.SAXException)42 InputStream (java.io.InputStream)37 TikaInputStream (org.apache.tika.io.TikaInputStream)33 Metadata (org.apache.tika.metadata.Metadata)33 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)29 Test (org.junit.Test)19 ParseContext (org.apache.tika.parser.ParseContext)18 ContentHandler (org.xml.sax.ContentHandler)17 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)16 CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream)15 TemporaryResources (org.apache.tika.io.TemporaryResources)15 MediaType (org.apache.tika.mime.MediaType)13 Parser (org.apache.tika.parser.Parser)13 ByteArrayInputStream (java.io.ByteArrayInputStream)12 ArrayList (java.util.ArrayList)11 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)11 File (java.io.File)8 EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler)8