Search in sources :

Example 81 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class ForkParserIntegrationTest method testParsingErrorInForkedParserShouldBeReported.

/**
 * TIKA-831: errors thrown by a parser that is wrapped in a {@code ForkParser}
 * should be caught and properly reported to the caller.
 * <p>
 * The first scenario parses with a {@code BrokenParser} and expects a
 * {@code TikaException} whose cause equals {@code brokenParser.err}. The second
 * scenario (an error that cannot be serialized) is still disabled, see the TODO.
 *
 * @throws Exception if anything other than the expected {@code TikaException} is thrown
 */
@Test
public void testParsingErrorInForkedParserShouldBeReported() throws Exception {
    BrokenParser brokenParser = new BrokenParser();
    ForkParser parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
    InputStream stream = getClass().getResourceAsStream("/test-documents/testTXT.txt");
    try {
        // With a serializable error, we'll get that back
        try {
            ContentHandler output = new BodyContentHandler();
            ParseContext context = new ParseContext();
            parser.parse(stream, output, new Metadata(), context);
            fail("Expected TikaException caused by Error");
        } catch (TikaException e) {
            assertEquals(brokenParser.err, e.getCause());
        } finally {
            parser.close();
        }
        // With a non serializable one, we'll get something else
        // TODO Fix this test
        brokenParser = new BrokenParser();
        brokenParser.re = new WontBeSerializedError("Can't Serialize");
        parser = new ForkParser(ForkParser.class.getClassLoader(), brokenParser);
        try {
            // Disabled until the TODO above is resolved; the surrounding try/finally
            // guarantees this second parser is closed whether or not the body runs.
//            ContentHandler output = new BodyContentHandler();
//            ParseContext context = new ParseContext();
//            parser.parse(stream, output, new Metadata(), context);
//            fail("Expected TikaException caused by Error");
//        } catch (TikaException e) {
//            assertEquals(TikaException.class, e.getCause().getClass());
//            assertEquals("Bang!", e.getCause().getMessage());
        } finally {
            parser.close();
        }
    } finally {
        // The test resource stream was previously never released.
        if (stream != null) {
            stream.close();
        }
    }
}
Also used: ForkParser (org.apache.tika.fork.ForkParser), BodyContentHandler (org.apache.tika.sax.BodyContentHandler), TikaException (org.apache.tika.exception.TikaException), InputStream (java.io.InputStream), ParseContext (org.apache.tika.parser.ParseContext), Metadata (org.apache.tika.metadata.Metadata), ContentHandler (org.xml.sax.ContentHandler), Test (org.junit.Test)

Example 82 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class ForkParserIntegrationTest method testParserHandlingOfNonSerializable.

/**
 * If we supply a non serializable object on the ParseContext,
 * check we get a helpful exception back.
 * <p>
 * The non serializable object is an anonymous {@code Detector} placed on the
 * context; the parse is expected to fail with a {@code TikaException} whose
 * cause is a {@code NotSerializableException}.
 *
 * @throws Exception if anything other than the expected {@code TikaException} is thrown
 */
@Test
public void testParserHandlingOfNonSerializable() throws Exception {
    ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
    ParseContext context = new ParseContext();
    context.set(Detector.class, new Detector() {

        @Override
        public MediaType detect(InputStream input, Metadata metadata) {
            return MediaType.OCTET_STREAM;
        }
    });
    InputStream stream = null;
    try {
        ContentHandler output = new BodyContentHandler();
        stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt");
        parser.parse(stream, output, new Metadata(), context);
        fail("Should have blown up with a non serializable ParseContext");
    } catch (TikaException e) {
        // Check the right details
        assertNotNull(e.getCause());
        assertEquals(NotSerializableException.class, e.getCause().getClass());
        assertEquals("Unable to serialize ParseContext to pass to the Forked Parser", e.getMessage());
    } finally {
        try {
            parser.close();
        } finally {
            // Release the test resource even though the parse is expected to fail.
            if (stream != null) {
                stream.close();
            }
        }
    }
}
Also used: ForkParser (org.apache.tika.fork.ForkParser), BodyContentHandler (org.apache.tika.sax.BodyContentHandler), NotSerializableException (java.io.NotSerializableException), Detector (org.apache.tika.detect.Detector), TikaException (org.apache.tika.exception.TikaException), InputStream (java.io.InputStream), ParseContext (org.apache.tika.parser.ParseContext), Metadata (org.apache.tika.metadata.Metadata), MediaType (org.apache.tika.mime.MediaType), ContentHandler (org.xml.sax.ContentHandler), Test (org.junit.Test)

Example 83 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class ForkParserIntegrationTest method testAttachingADebuggerOnTheForkedParserShouldWork.

/**
 * TIKA-832: a {@code ForkParser} whose Java command carries JDWP debugger
 * options ({@code -Xdebug -Xrunjdwp:...}) should still parse the document and
 * return its text.
 * <p>
 * NOTE(review): the debugger address 54321 is hard-coded; presumably the test
 * fails when that port is already in use - confirm.
 *
 * @throws Exception if parsing fails
 */
@Test
public void testAttachingADebuggerOnTheForkedParserShouldWork() throws Exception {
    ParseContext context = new ParseContext();
    context.set(Parser.class, tika.getParser());
    ForkParser parser = new ForkParser(ForkParserIntegrationTest.class.getClassLoader(), tika.getParser());
    parser.setJavaCommand(Arrays.asList("java", "-Xmx32m", "-Xdebug", "-Xrunjdwp:transport=dt_socket,address=54321,server=y,suspend=n"));
    InputStream stream = null;
    try {
        ContentHandler body = new BodyContentHandler();
        stream = ForkParserIntegrationTest.class.getResourceAsStream("/test-documents/testTXT.txt");
        parser.parse(stream, body, new Metadata(), context);
        String content = body.toString();
        assertContains("Test d'indexation", content);
        assertContains("http://www.apache.org", content);
    } finally {
        try {
            parser.close();
        } finally {
            // The test resource stream was previously never released.
            if (stream != null) {
                stream.close();
            }
        }
    }
}
Also used: ForkParser (org.apache.tika.fork.ForkParser), BodyContentHandler (org.apache.tika.sax.BodyContentHandler), InputStream (java.io.InputStream), ParseContext (org.apache.tika.parser.ParseContext), Metadata (org.apache.tika.metadata.Metadata), ContentHandler (org.xml.sax.ContentHandler), Test (org.junit.Test)

Example 84 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class RecursiveParserWrapperTest method testMaxEmbedded.

/**
 * Checks the embedded-resource limit of {@code RecursiveParserWrapper} in three
 * scenarios against the same test document: the default (no limit set), an
 * explicit limit, and a negative limit, which is expected to give the same
 * result as the default.
 *
 * @throws Exception if parsing fails
 */
@Test
public void testMaxEmbedded() throws Exception {
    int maxEmbedded = 4;
    //including outer container file
    int totalNoLimit = 12;
    ParseContext context = new ParseContext();
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));

    //test default
    List<Metadata> list = parseRecursiveEmbeddedDocx(wrapper, context);
    assertEquals(totalNoLimit, list.size());
    assertNull(list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED));
    wrapper.reset();

    //test setting value
    wrapper.setMaxEmbeddedResources(maxEmbedded);
    list = parseRecursiveEmbeddedDocx(wrapper, context);
    //add 1 for outer container file
    assertEquals(maxEmbedded + 1, list.size());
    assertEquals("true", list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED));
    wrapper.reset();

    //test setting value < 0
    wrapper.setMaxEmbeddedResources(-2);
    // Re-fetch the metadata list after this parse as well, instead of asserting
    // on the reference obtained from the previous scenario.
    list = parseRecursiveEmbeddedDocx(wrapper, context);
    assertEquals(totalNoLimit, list.size());
    assertNull(list.get(0).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_LIMIT_REACHED));
}

/**
 * Parses the recursive-embedded test document with the given wrapper, always
 * closing the resource stream, and returns the wrapper's metadata list.
 */
private static List<Metadata> parseRecursiveEmbeddedDocx(RecursiveParserWrapper wrapper, ParseContext context) throws Exception {
    InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream("/test-documents/test_recursive_embedded.docx");
    try {
        wrapper.parse(stream, new DefaultHandler(), new Metadata(), context);
    } finally {
        if (stream != null) {
            stream.close();
        }
    }
    return wrapper.getMetadata();
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Example 85 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class RecursiveParserWrapperTest method testPrimaryExcWEmbedded.

@Test
public void testPrimaryExcWEmbedded() throws Exception {
    // Scenario: the embedded content is handled first, and only afterwards does
    // the parser hit an exception in the container document. The returned list
    // must then hold the container document at index 0 and the embedded
    // content at index 1.
    final String resource = "/test-documents/mock/embedded_then_npe.xml";
    Parser delegate = new AutoDetectParser();
    RecursiveParserWrapper recursive = new RecursiveParserWrapper(delegate, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true);
    Metadata input = new Metadata();
    input.set(Metadata.RESOURCE_NAME_KEY, "embedded_then_npe.xml");

    boolean sawNullPointer = false;
    InputStream in = null;
    try {
        in = RecursiveParserWrapperTest.class.getResourceAsStream(resource);
        recursive.parse(in, new DefaultHandler(), input, new ParseContext());
    } catch (TikaException e) {
        // Only an exact NullPointerException cause counts as the expected failure.
        Class<?> causeType = e.getCause().getClass();
        if (causeType.equals(NullPointerException.class)) {
            sawNullPointer = true;
        }
    } finally {
        IOUtils.closeQuietly(in);
    }
    assertTrue("npe", sawNullPointer);

    List<Metadata> results = recursive.getMetadata();
    assertEquals(2, results.size());

    // Container document comes first.
    Metadata container = results.get(0);
    assertContains("main_content", container.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embedded_then_npe.xml", container.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("Nikolai Lobachevsky", container.get("author"));

    // Embedded document follows.
    Metadata embedded = results.get(1);
    assertContains("some_embedded_content", embedded.get(RecursiveParserWrapper.TIKA_CONTENT));
    assertEquals("embed1.xml", embedded.get(TikaMetadataKeys.RESOURCE_NAME_KEY));
    assertEquals("embeddedAuthor", embedded.get("author"));
}
Also used : TikaException(org.apache.tika.exception.TikaException) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Aggregations

Metadata (org.apache.tika.metadata.Metadata): 651, Test (org.junit.Test): 467, InputStream (java.io.InputStream): 320, ParseContext (org.apache.tika.parser.ParseContext): 283, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 269, TikaTest (org.apache.tika.TikaTest): 257, ContentHandler (org.xml.sax.ContentHandler): 229, AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 154, ByteArrayInputStream (java.io.ByteArrayInputStream): 143, Parser (org.apache.tika.parser.Parser): 136, TikaInputStream (org.apache.tika.io.TikaInputStream): 133, IOException (java.io.IOException): 66, DefaultHandler (org.xml.sax.helpers.DefaultHandler): 59, TikaException (org.apache.tika.exception.TikaException): 48, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 36, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 36, StringWriter (java.io.StringWriter): 33, Tika (org.apache.tika.Tika): 29, MediaType (org.apache.tika.mime.MediaType): 29, SAXException (org.xml.sax.SAXException): 29