Search in sources :

Example 11 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class RecursiveParserWrapperTest method testBasicXML.

/**
 * Requests XML-typed content handlers from {@code getMetadata} and checks that
 * the content stored on the first returned {@link Metadata} entry contains a
 * self-closed header paragraph element.
 */
@Test
public void testBasicXML() throws Exception {
    BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    List<Metadata> results = getMetadata(new Metadata(), xmlFactory);
    String xml = results.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
    // Little separates HTML from XML output for this test file; the
    // self-closed empty element is the marker being checked.
    assertTrue(xml.contains("<p class=\"header\" />"));
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) Test(org.junit.Test)

Example 12 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class RecursiveParserWrapperTest method testEmbeddedNPE.

/**
 * Runs {@code getMetadata} twice on the same resource name: once through the
 * two-argument overload and once through the four-argument overload with
 * {@code false} and {@code null}, then compares the number of entries each
 * run returns.
 */
@Test
public void testEmbeddedNPE() throws Exception {
    final String resourceName = "test_recursive_embedded_npe.docx";

    // Default behavior (the user doesn't specify whether or not to catch
    // embedded exceptions) is to catch the exception.
    Metadata defaultRunMetadata = new Metadata();
    defaultRunMetadata.set(Metadata.RESOURCE_NAME_KEY, resourceName);
    List<Metadata> results = getMetadata(defaultRunMetadata,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
    assertEquals(13, results.size());
    String embeddedException = results.get(10).get(RecursiveParserWrapper.EMBEDDED_EXCEPTION);
    assertContains("java.lang.NullPointerException", embeddedException);

    // The composite parser swallows caught TikaExceptions, IOExceptions and
    // SAXExceptions, and simply does not report that an exception occurred.
    Metadata secondRunMetadata = new Metadata();
    secondRunMetadata.set(Metadata.RESOURCE_NAME_KEY, resourceName);
    results = getMetadata(secondRunMetadata,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
            false, null);
    assertEquals(12, results.size());
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) Test(org.junit.Test)

Example 13 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class RecursiveParserWrapperTest method testBasicText.

/**
 * Requests TEXT-typed content handlers from {@code getMetadata} and checks that
 * the content stored on the first returned {@link Metadata} entry has no
 * paragraph markup but does contain the string {@code embed_0}.
 */
@Test
public void testBasicText() throws Exception {
    BasicContentHandlerFactory textFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
    List<Metadata> results = getMetadata(new Metadata(), textFactory);
    String text = results.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
    // No opening paragraph tag may appear in the text output.
    assertTrue(!text.contains("<p "));
    assertTrue(text.contains("embed_0"));
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) Test(org.junit.Test)

Example 14 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class RecursiveParserWrapperTest method testDigesters.

/**
 * Runs {@code getMetadata} on {@code test_recursive_embedded.docx} with a
 * {@link CommonsDigester} configured for MD5 and checks the digest stored
 * under {@code X-TIKA:digest:MD5} for the entries at indices 0, 6 and 7.
 */
@Test
public void testDigesters() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
    // NOTE(review): 100000 is presumably the digester's mark limit and `true`
    // presumably asks for embedded exceptions to be caught - confirm against
    // CommonsDigester and the getMetadata helper.
    List<Metadata> list = getMetadata(metadata, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1), true, new CommonsDigester(100000, CommonsDigester.DigestAlgorithm.MD5));
    String md5Key = "X-TIKA:digest:MD5";
    // Each row pairs an index into the result list with its expected MD5 digest.
    String[][] expectedDigests = {
        {"0", "59f626e09a8c16ab6dbc2800c685f772"},
        {"6", "ccdf3882e7e4c2454e28884db9b0a54d"},
        {"7", "a869bf6432ebd14e19fc79416274e0c9"},
    };
    for (String[] expected : expectedDigests) {
        int index = Integer.parseInt(expected[0]);
        assertEquals(expected[1], list.get(index).get(md5Key));
    }
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) CommonsDigester(org.apache.tika.parser.utils.CommonsDigester) Test(org.junit.Test)

Example 15 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class SQLite3ParserTest method testRecursiveParserWrapper.

/**
 * Parses {@code TEST_FILE1} through a {@link RecursiveParserWrapper} built on
 * an {@link AutoDetectParser} with BODY-typed content handlers, then checks
 * the number of metadata entries, the tab and newline separators in the first
 * entry's content, the text of entries 2 and 4, and the embedded resource
 * path of entry 1.
 */
@Test
public void testRecursiveParserWrapper() throws Exception {
    Parser p = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
    Metadata metadata = new Metadata();
    // try-with-resources closes the test stream even if parsing throws.
    try (InputStream is = getResourceAsStream(TEST_FILE1)) {
        metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
        wrapper.parse(is, new BodyContentHandler(-1), metadata, new ParseContext());
    }
    List<Metadata> metadataList = wrapper.getMetadata();
    assertEquals(5, metadataList.size());
    // The first entry's content is fetched once and used for both the
    // tab and newline checks below.
    String table = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
    //make sure the \t are inserted in a body handler
    assertContains("0\t2.3\t2.4\tlorem", table);
    assertContains("普林斯顿大学", table);
    //make sure the \n is inserted
    assertContains("do eiusmod tempor\n", table);
    assertContains("The quick brown fox", metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
    assertContains("The quick brown fox", metadataList.get(4).get(RecursiveParserWrapper.TIKA_CONTENT));
    //confirm .doc was added to blob
    assertEquals("/BYTES_COL_0.doc/image1.png", metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) Parser(org.apache.tika.parser.Parser) EmptyParser(org.apache.tika.parser.EmptyParser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

BasicContentHandlerFactory (org.apache.tika.sax.BasicContentHandlerFactory)22 Metadata (org.apache.tika.metadata.Metadata)21 Test (org.junit.Test)16 InputStream (java.io.InputStream)10 TikaInputStream (org.apache.tika.io.TikaInputStream)9 RecursiveParserWrapper (org.apache.tika.parser.RecursiveParserWrapper)9 ParseContext (org.apache.tika.parser.ParseContext)8 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)7 Parser (org.apache.tika.parser.Parser)7 DefaultHandler (org.xml.sax.helpers.DefaultHandler)7 TikaTest (org.apache.tika.TikaTest)6 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)4 ByteArrayInputStream (java.io.ByteArrayInputStream)3 IOException (java.io.IOException)3 InputStreamReader (java.io.InputStreamReader)2 ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)2 RecursiveParserWrapperFSConsumer (org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer)2 TikaConfig (org.apache.tika.config.TikaConfig)2 EmptyParser (org.apache.tika.parser.EmptyParser)2 ContentHandler (org.xml.sax.ContentHandler)2