Search in sources :

Example 31 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class Pkcs7ParserTest method testDetachedSignature.

/**
 * Feeds the detached signature test document to {@link Pkcs7Parser}.
 *
 * <p>A {@link NullPointerException} is reported as an explicit test failure.
 * If a {@link TikaException} is raised instead, its string form must mention
 * that a detached pkcs7 signature cannot be parsed.
 */
public void testDetachedSignature() throws Exception {
    try (InputStream signature = Pkcs7ParserTest.class.getResourceAsStream("/test-documents/testDetached.p7s")) {
        ContentHandler sink = new BodyContentHandler();
        Metadata meta = new Metadata();
        Pkcs7Parser pkcs7 = new Pkcs7Parser();
        pkcs7.parse(signature, sink, meta, new ParseContext());
    } catch (NullPointerException unexpected) {
        // An NPE means the parser did not guard against the missing content.
        fail("should not get NPE");
    } catch (TikaException expected) {
        String description = expected.toString();
        assertTrue(description.contains("cannot parse detached pkcs7 signature"));
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Example 32 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class ForkParserIntegrationTest method testParsingErrorInForkedParserShouldBeReported.

/**
 * TIKA-831 Parsers throwing errors should be caught and
 * properly reported.
 *
 * <p>Runs a {@code BrokenParser} through a {@link ForkParser} and checks that
 * the resulting {@link TikaException} carries the parser's error as its cause.
 * The test document stream and both fork parsers are released before the
 * method returns.
 */
@Test
public void testParsingErrorInForkedParserShouldBeReported() throws Exception {
    BrokenParser brokenParser = new BrokenParser();
    ForkParser parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
    // With a serializable error, we'll get that back.
    // The stream is opened in try-with-resources so it is closed even when
    // the parse call throws (it used to be left open).
    try (InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.txt")) {
        ContentHandler output = new BodyContentHandler();
        ParseContext context = new ParseContext();
        parser.parse(stream, output, new Metadata(), context);
        fail("Expected TikaException caused by Error");
    } catch (TikaException e) {
        assertEquals(brokenParser.err, e.getCause());
    } finally {
        parser.close();
    }
    // With a non serializable one, we'll get something else
    // TODO Fix this test. When re-enabling it, open a fresh stream: the one
    // used above is scoped to (and closed by) the first try block.
    brokenParser = new BrokenParser();
    brokenParser.re = new WontBeSerializedError("Can't Serialize");
    parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
    try {
//        try {
//           ContentHandler output = new BodyContentHandler();
//           ParseContext context = new ParseContext();
//           parser.parse(stream, output, new Metadata(), context);
//           fail("Expected TikaException caused by Error");
//       } catch (TikaException e) {
//           assertEquals(TikaException.class, e.getCause().getClass());
//           assertEquals("Bang!", e.getCause().getMessage());
//       }
    } finally {
        // The second fork parser was previously never closed.
        parser.close();
    }
}
Also used : ForkParser(org.apache.tika.fork.ForkParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Example 33 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class ForkParserIntegrationTest method testParserHandlingOfNonSerializable.

/**
 * If we supply a non serializable object on the ParseContext,
 * check we get a helpful exception back.
 *
 * <p>An anonymous {@link Detector} is placed on the context; the parse call
 * is then expected to fail with a {@link TikaException} whose cause is a
 * {@link NotSerializableException} and whose message names the ParseContext.
 */
@Test
public void testParserHandlingOfNonSerializable() throws Exception {
    ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
    ParseContext context = new ParseContext();
    context.set(Detector.class, new Detector() {

        @Override
        public MediaType detect(InputStream input, Metadata metadata) {
            return MediaType.OCTET_STREAM;
        }
    });
    // try-with-resources closes the test document even though the parse call
    // is expected to throw (the stream used to be left open).
    try (InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt")) {
        ContentHandler output = new BodyContentHandler();
        parser.parse(stream, output, new Metadata(), context);
        fail("Should have blown up with a non serializable ParseContext");
    } catch (TikaException e) {
        // Check the right details
        assertNotNull(e.getCause());
        assertEquals(NotSerializableException.class, e.getCause().getClass());
        assertEquals("Unable to serialize ParseContext to pass to the Forked Parser", e.getMessage());
    } finally {
        parser.close();
    }
}
Also used : ForkParser(org.apache.tika.fork.ForkParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) NotSerializableException(java.io.NotSerializableException) Detector(org.apache.tika.detect.Detector) TikaException(org.apache.tika.exception.TikaException) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) MediaType(org.apache.tika.mime.MediaType) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Example 34 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class ForkParserIntegrationTest method testAttachingADebuggerOnTheForkedParserShouldWork.

/**
 * TIKA-832
 *
 * <p>Configures the {@link ForkParser} with a java command that enables a
 * JDWP debug socket, then checks that the text test document is still parsed
 * and that its expected content is extracted.
 */
@Test
public void testAttachingADebuggerOnTheForkedParserShouldWork() throws Exception {
    ParseContext context = new ParseContext();
    context.set(Parser.class, tika.getParser());
    ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
    parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug", "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n"));
    // try-with-resources closes the test document once parsing is done
    // (the stream used to be left open).
    try (InputStream stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt")) {
        ContentHandler body = new BodyContentHandler();
        parser.parse(stream, body, new Metadata(), context);
        String content = body.toString();
        assertContains("Test d'indexation", content);
        assertContains("http://www.apache.org", content);
    } finally {
        parser.close();
    }
}
Also used : ForkParser(org.apache.tika.fork.ForkParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test)

Example 35 with ParseContext

use of org.apache.tika.parser.ParseContext in project tika by apache.

the class TikaResource method createParser.

/**
 * Builds the parser used by this resource.
 *
 * <p>An {@link AutoDetectParser} is created from {@code tikaConfig}, its
 * {@code application/xml} entry is replaced with an {@link HtmlParser}, and a
 * fallback is installed that answers with HTTP "unsupported media type".
 * When {@code digester} is set, the result is wrapped in a
 * {@link DigestingParser}.
 *
 * @return the configured parser, wrapped in a digesting parser if a digester is present
 */
@SuppressWarnings("serial")
public static Parser createParser() {
    final AutoDetectParser autoDetect = new AutoDetectParser(tikaConfig);

    // Override the XML mapping, then hand the modified map back to the parser.
    Map<MediaType, Parser> byMediaType = autoDetect.getParsers();
    byMediaType.put(MediaType.APPLICATION_XML, new HtmlParser());
    autoDetect.setParsers(byMediaType);

    // Fallback: advertises the same types as the auto-detect parser, but
    // rejects any document that actually reaches it.
    Parser rejectingFallback = new Parser() {

        @Override
        public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
            return autoDetect.getSupportedTypes(parseContext);
        }

        @Override
        public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) {
            throw new WebApplicationException(Response.Status.UNSUPPORTED_MEDIA_TYPE);
        }
    };
    autoDetect.setFallback(rejectingFallback);

    if (digester == null) {
        return autoDetect;
    }
    return new DigestingParser(autoDetect, digester);
}
Also used : HtmlParser(org.apache.tika.parser.html.HtmlParser) Set(java.util.Set) WebApplicationException(javax.ws.rs.WebApplicationException) InputStream(java.io.InputStream) ParseContext(org.apache.tika.parser.ParseContext) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MediaType(org.apache.tika.mime.MediaType) DigestingParser(org.apache.tika.parser.DigestingParser) BoilerpipeContentHandler(org.apache.tika.parser.html.BoilerpipeContentHandler) ExpandedTitleContentHandler(org.apache.tika.sax.ExpandedTitleContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) RichTextContentHandler(org.apache.tika.sax.RichTextContentHandler) Parser(org.apache.tika.parser.Parser) HtmlParser(org.apache.tika.parser.html.HtmlParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DigestingParser(org.apache.tika.parser.DigestingParser)

Aggregations

ParseContext (org.apache.tika.parser.ParseContext)338 Metadata (org.apache.tika.metadata.Metadata)283 Test (org.junit.Test)260 InputStream (java.io.InputStream)195 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)195 TikaTest (org.apache.tika.TikaTest)186 ContentHandler (org.xml.sax.ContentHandler)164 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)118 Parser (org.apache.tika.parser.Parser)109 ByteArrayInputStream (java.io.ByteArrayInputStream)92 TikaInputStream (org.apache.tika.io.TikaInputStream)77 DefaultHandler (org.xml.sax.helpers.DefaultHandler)52 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)31 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)31 TikaException (org.apache.tika.exception.TikaException)30 StringWriter (java.io.StringWriter)26 IOException (java.io.IOException)25 SAXException (org.xml.sax.SAXException)25 CompositeParser (org.apache.tika.parser.CompositeParser)22 TeeContentHandler (org.apache.tika.sax.TeeContentHandler)20