Search in sources :

Example 6 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

The getDocumentBuilder method of the ParseContext class.

/**
 * Supplies the DOM {@link DocumentBuilder} for this parsing context.
 * A builder registered in the context is handed back unchanged. When
 * none is registered, a new builder is obtained from the factory
 * returned by {@code getDocumentBuilderFactory()}, configured to use
 * {@link #IGNORING_SAX_ENTITY_RESOLVER}, and given a <code>null</code>
 * ErrorHandler. The newly created builder is not stored back into the
 * context by this method.
 *
 * @since Apache Tika 1.13
 * @return the registered builder, or a freshly configured one
 * @throws TikaException if the factory fails with a
 *         {@link ParserConfigurationException}
 */
public DocumentBuilder getDocumentBuilder() throws TikaException {
    DocumentBuilder registered = get(DocumentBuilder.class);
    if (registered == null) {
        final DocumentBuilder created;
        try {
            created = getDocumentBuilderFactory().newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            throw new TikaException("XML parser not available", e);
        }
        // Only builders created here are reconfigured; a registered one is left as-is.
        created.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
        created.setErrorHandler(null);
        return created;
    }
    return registered;
}
Also used : DocumentBuilderFactory(javax.xml.parsers.DocumentBuilderFactory) TikaException(org.apache.tika.exception.TikaException) DocumentBuilder(javax.xml.parsers.DocumentBuilder) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException)

Example 7 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

The getXMLReader method of the ParseContext class.

/**
 * Supplies the {@link XMLReader} for this parsing context. A reader
 * registered in the context is handed back unchanged. When none is
 * registered, a reader is taken from the parser returned by
 * {@link #getSAXParser()} and configured to use
 * {@link #IGNORING_SAX_ENTITY_RESOLVER} before it is returned.
 *
 * @see #getSAXParser()
 * @since Apache Tika 1.13
 * @return the registered reader, or a freshly configured one
 * @throws TikaException if obtaining the reader from the SAX parser
 *         fails with a {@link SAXException}
 */
public XMLReader getXMLReader() throws TikaException {
    XMLReader registered = get(XMLReader.class);
    if (registered == null) {
        final XMLReader created;
        try {
            created = getSAXParser().getXMLReader();
        } catch (SAXException e) {
            throw new TikaException("Unable to create an XMLReader", e);
        }
        // Only readers created here get the ignoring resolver; a registered one is left as-is.
        created.setEntityResolver(IGNORING_SAX_ENTITY_RESOLVER);
        return created;
    }
    return registered;
}
Also used : TikaException(org.apache.tika.exception.TikaException) XMLReader(org.xml.sax.XMLReader) SAXException(org.xml.sax.SAXException)

Example 8 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

The testUnknownParser method of the TikaConfigTest class.

/**
 * Loads the same configuration (TIKA-1700-unknown-parser.xml) through
 * service loaders that differ only in their {@code LoadErrorHandler}
 * and checks how each reacts to a parser class that cannot be found:
 * IGNORE and WARN must still yield a composite parser with exactly one
 * component parser, while THROW must fail with a TikaException.
 */
@Test
public void testUnknownParser() throws Exception {
    ClassLoader classLoader = getClass().getClassLoader();
    ServiceLoader ignoringLoader = new ServiceLoader(classLoader, LoadErrorHandler.IGNORE);
    ServiceLoader warningLoader = new ServiceLoader(classLoader, LoadErrorHandler.WARN);
    ServiceLoader throwingLoader = new ServiceLoader(classLoader, LoadErrorHandler.THROW);
    Path configFile = Paths.get(new URI(getConfigPath("TIKA-1700-unknown-parser.xml")));

    // IGNORE: the config loads and one component parser is available
    TikaConfig ignoringConfig = new TikaConfig(configFile, ignoringLoader);
    assertNotNull(ignoringConfig);
    assertNotNull(ignoringConfig.getParser());
    CompositeParser ignoringParser = (CompositeParser) ignoringConfig.getParser();
    assertEquals(1, ignoringParser.getAllComponentParsers().size());

    // WARN: same outcome as IGNORE as far as the resulting parsers go
    TikaConfig warningConfig = new TikaConfig(configFile, warningLoader);
    assertNotNull(warningConfig);
    assertNotNull(warningConfig.getParser());
    CompositeParser warningParser = (CompositeParser) warningConfig.getParser();
    assertEquals(1, warningParser.getAllComponentParsers().size());

    // THROW: constructing the config itself must be rejected
    try {
        new TikaConfig(configFile, throwingLoader);
        fail("Shouldn't get here, invalid parser class");
    } catch (TikaException expected) {
        // expected: the unknown parser class is reported as a TikaException
    }
}
Also used : Path(java.nio.file.Path) TikaException(org.apache.tika.exception.TikaException) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) URI(java.net.URI) Test(org.junit.Test) TikaConfigTest(org.apache.tika.config.TikaConfigTest)

Example 9 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

The defaultParserWithExcludes method of the TikaConfigTest class.

/**
 * TIKA-1445 It should be possible to exclude DefaultParser from
 *  certain types, so another parser explicitly listed will take them.
 * <p>
 * Loads TIKA-1445-default-except.xml and checks that the composite
 * parser holds three decorated parsers, in order: a wrapped
 * DefaultParser, an EmptyParser claiming "hello/world" and an
 * ErrorParser claiming "fail/world".
 * <p>
 * A TikaException raised while running the checks is reported as a test
 * failure with the exception attached as the cause, so its stack trace
 * is visible in the test report.
 */
@Test
public void defaultParserWithExcludes() throws Exception {
    try {
        TikaConfig config = getConfig("TIKA-1445-default-except.xml");
        CompositeParser cp = (CompositeParser) config.getParser();
        List<Parser> parsers = cp.getAllComponentParsers();
        // Will be the three parsers defined in the xml
        assertEquals(3, parsers.size());
        // Should have a wrapped DefaultParser, not the main DefaultParser,
        //  as it is excluded from handling certain classes
        Parser wrappedDefault = parsers.get(0);
        assertTrue(wrappedDefault.toString(), wrappedDefault instanceof ParserDecorator);
        assertEquals(DefaultParser.class, ((ParserDecorator) wrappedDefault).getWrappedParser().getClass());
        // Should have two others which claim things, which they wouldn't
        //  otherwise handle
        Parser wrappedEmpty = parsers.get(1);
        assertTrue(wrappedEmpty.toString(), wrappedEmpty instanceof ParserDecorator);
        assertEquals(EmptyParser.class, ((ParserDecorator) wrappedEmpty).getWrappedParser().getClass());
        assertEquals("hello/world", wrappedEmpty.getSupportedTypes(null).iterator().next().toString());
        Parser wrappedError = parsers.get(2);
        assertTrue(wrappedError.toString(), wrappedError instanceof ParserDecorator);
        assertEquals(ErrorParser.class, ((ParserDecorator) wrappedError).getWrappedParser().getClass());
        assertEquals("fail/world", wrappedError.getSupportedTypes(null).iterator().next().toString());
    } catch (TikaException e) {
        // Same failure message as before, but keep the exception as the cause
        // so the stack trace is not lost.
        throw new AssertionError("Unexpected TikaException: " + e, e);
    }
}
Also used : TikaException(org.apache.tika.exception.TikaException) TikaConfig(org.apache.tika.config.TikaConfig) CompositeParser(org.apache.tika.parser.CompositeParser) ParserDecorator(org.apache.tika.parser.ParserDecorator) Parser(org.apache.tika.parser.Parser) CompositeParser(org.apache.tika.parser.CompositeParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultParser(org.apache.tika.parser.DefaultParser) EmptyParser(org.apache.tika.parser.EmptyParser) ErrorParser(org.apache.tika.parser.ErrorParser) Test(org.junit.Test) TikaConfigTest(org.apache.tika.config.TikaConfigTest)

Example 10 with TikaException

use of org.apache.tika.exception.TikaException in project tika by apache.

The parse method of the RTFObjDataParser class.

/**
 * Parses the embedded object/pict data and returns the bytes of the
 * embedded content.
 * <p>
 * The data is read in this order: version, format id, class name,
 * topic name, item name, data size, then the object bytes. The version
 * is added to the metadata, as are the class, topic and item names
 * when they are non-empty after trimming.
 *
 * @param bytes actual bytes (already converted from the
 *  hex pair string stored in the embedded object data into actual bytes or read
 *  as raw binary bytes)
 * @param metadata             incoming metadata; entries are added to it
 * @param unknownFilenameCount passed through to the embedded POIFS handling
 * @return <code>null</code> if the format id is not 2 (embedded object);
 *  the result of the package handling for class "package"; the raw object
 *  bytes for class "pbrush"; otherwise the result of the embedded POIFS
 *  handling when the object bytes carry a POIFS header and that handling
 *  succeeds, and the raw object bytes in every other case
 * @throws IOException if there are any surprises during parsing
 * @throws TikaException declared by this method; presumably propagated
 *  from the read/handle helpers (not visible here)
 */
protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount) throws IOException, TikaException {
    ByteArrayInputStream is = new ByteArrayInputStream(bytes);
    long version = readUInt(is);
    metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(version));
    long formatId = readUInt(is);
    //2 is an embedded object. 1 is a link.
    if (formatId != 2L) {
        return null;
    }
    // trim() never returns null, so only emptiness needs checking below
    String className = readLengthPrefixedAnsiString(is).trim();
    String topicName = readLengthPrefixedAnsiString(is).trim();
    String itemName = readLengthPrefixedAnsiString(is).trim();
    if (!className.isEmpty()) {
        metadata.add(RTFMetadata.EMB_CLASS, className);
    }
    if (!topicName.isEmpty()) {
        metadata.add(RTFMetadata.EMB_TOPIC, topicName);
    }
    if (!itemName.isEmpty()) {
        metadata.add(RTFMetadata.EMB_ITEM, itemName);
    }
    long dataSz = readUInt(is);
    //readBytes tests for reading too many bytes
    byte[] embObjBytes = readBytes(is, dataSz);

    String lowerClassName = className.toLowerCase(Locale.ROOT);
    if ("package".equals(lowerClassName)) {
        return handlePackage(embObjBytes, metadata);
    }
    if ("pbrush".equals(lowerClassName)) {
        //simple bitmap bytes
        return embObjBytes;
    }

    ByteArrayInputStream embIs = new ByteArrayInputStream(embObjBytes);
    boolean hasPoifs;
    try {
        hasPoifs = NPOIFSFileSystem.hasPOIFSHeader(embIs);
    } catch (IOException e) {
        // Could not sniff the header: record the problem and return the raw bytes
        EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        return embObjBytes;
    }
    if (hasPoifs) {
        try {
            return handleEmbeddedPOIFS(embIs, metadata, unknownFilenameCount);
        } catch (Exception e) {
            // Best effort: record the failure and fall through to the raw bytes
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
        }
    }
    return embObjBytes;
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) IOException(java.io.IOException) Ole10NativeException(org.apache.poi.poifs.filesystem.Ole10NativeException) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) TikaMemoryLimitException(org.apache.tika.exception.TikaMemoryLimitException)

Aggregations

TikaException (org.apache.tika.exception.TikaException): 142; IOException (java.io.IOException): 54; SAXException (org.xml.sax.SAXException): 42; InputStream (java.io.InputStream): 37; TikaInputStream (org.apache.tika.io.TikaInputStream): 33; Metadata (org.apache.tika.metadata.Metadata): 33; XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 29; Test (org.junit.Test): 19; ParseContext (org.apache.tika.parser.ParseContext): 18; ContentHandler (org.xml.sax.ContentHandler): 17; BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 16; CloseShieldInputStream (org.apache.commons.io.input.CloseShieldInputStream): 15; TemporaryResources (org.apache.tika.io.TemporaryResources): 15; MediaType (org.apache.tika.mime.MediaType): 13; Parser (org.apache.tika.parser.Parser): 13; ByteArrayInputStream (java.io.ByteArrayInputStream): 12; ArrayList (java.util.ArrayList): 11; AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 11; File (java.io.File): 8; EmbeddedContentHandler (org.apache.tika.sax.EmbeddedContentHandler): 8