Search in sources :

Example 1 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class TikaTest method getRecursiveMetadata.

/**
 * Parses a test document through a {@link RecursiveParserWrapper} that wraps an
 * {@link AutoDetectParser}, and returns the metadata list collected by the wrapper.
 *
 * @param filePath resource name, resolved relative to {@code /test-documents/}
 * @param context  parse context handed to the wrapper's {@code parse} call
 * @return the list reported by {@code wrapper.getMetadata()} after parsing
 * @throws Exception if the resource cannot be read or parsing fails
 */
protected List<Metadata> getRecursiveMetadata(String filePath, ParseContext context) throws Exception {
    Parser autoDetect = new AutoDetectParser();
    // XML handler type; -1 is presumably "no write limit" -- confirm against BasicContentHandlerFactory
    BasicContentHandlerFactory xmlFactory = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    RecursiveParserWrapper recursiveParser = new RecursiveParserWrapper(autoDetect, xmlFactory);
    String resourcePath = "/test-documents/" + filePath;
    try (InputStream input = getResourceAsStream(resourcePath)) {
        recursiveParser.parse(input, new DefaultHandler(), new Metadata(), context);
    }
    return recursiveParser.getMetadata();
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) Parser(org.apache.tika.parser.Parser) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 2 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class RecursiveParserWrapperTest method testCharLimit.

/**
 * Parses {@code test_recursive_embedded.docx} with a TEXT handler factory
 * constructed with the value 60, then checks that five metadata entries are
 * produced and that exactly one of them carries
 * {@link RecursiveParserWrapper#WRITE_LIMIT_REACHED} set to {@code "true"}.
 *
 * @throws Exception if the resource cannot be read or parsing fails
 */
@Test
public void testCharLimit() throws Exception {
    ParseContext context = new ParseContext();
    Metadata metadata = new Metadata();
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
    // Close the resource stream deterministically, even when parse() throws;
    // previously the stream was opened and never closed.
    try (InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream("/test-documents/test_recursive_embedded.docx")) {
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    }
    List<Metadata> list = wrapper.getMetadata();
    assertEquals(5, list.size());
    int wlr = 0;
    for (Metadata m : list) {
        // Constant-first equals is null-safe, so no separate null check is needed.
        if ("true".equals(m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED))) {
            wlr++;
        }
    }
    assertEquals(1, wlr);
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Example 3 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class RecursiveParserWrapperTest method testBasicHTML.

/**
 * Runs the shared {@code getMetadata} helper with an HTML handler factory and
 * checks that the first entry's content contains an empty header paragraph
 * written with an explicit closing tag.
 *
 * @throws Exception if the helper fails
 */
@Test
public void testBasicHTML() throws Exception {
    Metadata rootMetadata = new Metadata();
    BasicContentHandlerFactory htmlFactory = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
    List<Metadata> metadataList = getMetadata(rootMetadata, htmlFactory);
    String containerContent = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
    // not much differentiates html from xml in this test file
    assertTrue(containerContent.contains("<p class=\"header\"></p>"));
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) Test(org.junit.Test)

Example 4 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class RecursiveParserWrapperTest method testIgnoreContent.

/**
 * Runs the shared {@code getMetadata} helper with an IGNORE handler factory and
 * checks that the first entry has no {@link RecursiveParserWrapper#TIKA_CONTENT} value.
 *
 * @throws Exception if the helper fails
 */
@Test
public void testIgnoreContent() throws Exception {
    Metadata rootMetadata = new Metadata();
    BasicContentHandlerFactory ignoreFactory = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1);
    List<Metadata> metadataList = getMetadata(rootMetadata, ignoreFactory);
    Metadata first = metadataList.get(0);
    assertNull(first.get(RecursiveParserWrapper.TIKA_CONTENT));
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) Test(org.junit.Test)

Example 5 with BasicContentHandlerFactory

use of org.apache.tika.sax.BasicContentHandlerFactory in project tika by apache.

the class RecursiveParserWrapperTest method testEmbeddedResourcePath.

/**
 * Checks the embedded resource paths recorded for the document named
 * {@code test_recursive_embedded.docx}: the set of non-null
 * {@link RecursiveParserWrapper#EMBEDDED_RESOURCE_PATH} values across all
 * metadata entries must equal the expected set, and the first entry's XML
 * content must contain a self-closed header paragraph.
 *
 * @throws Exception if the shared {@code getMetadata} helper fails
 */
@Test
public void testEmbeddedResourcePath() throws Exception {
    Set<String> expectedPaths = new HashSet<>();
    expectedPaths.add("/embed1.zip");
    expectedPaths.add("/embed1.zip/embed2.zip");
    expectedPaths.add("/embed1.zip/embed2.zip/embed3.zip");
    expectedPaths.add("/embed1.zip/embed2.zip/embed3.zip/embed4.zip");
    expectedPaths.add("/embed1.zip/embed2.zip/embed3.zip/embed4.zip/embed4.txt");
    expectedPaths.add("/embed1.zip/embed2.zip/embed3.zip/embed3.txt");
    expectedPaths.add("/embed1.zip/embed2.zip/embed2a.txt");
    expectedPaths.add("/embed1.zip/embed2.zip/embed2b.txt");
    expectedPaths.add("/embed1.zip/embed1b.txt");
    expectedPaths.add("/embed1.zip/embed1a.txt");
    expectedPaths.add("/image1.emf");

    Metadata rootMetadata = new Metadata();
    rootMetadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
    BasicContentHandlerFactory xmlFactory = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    List<Metadata> metadataList = getMetadata(rootMetadata, xmlFactory);

    String containerContent = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
    assertTrue(containerContent.contains("<p class=\"header\" />"));

    // Gather every path that was actually recorded; entries without one are skipped.
    Set<String> observedPaths = new HashSet<>();
    for (Metadata entry : metadataList) {
        String embeddedPath = entry.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
        if (embeddedPath == null) {
            continue;
        }
        observedPaths.add(embeddedPath);
    }
    assertEquals(expectedPaths, observedPaths);
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

BasicContentHandlerFactory (org.apache.tika.sax.BasicContentHandlerFactory)22 Metadata (org.apache.tika.metadata.Metadata)21 Test (org.junit.Test)16 InputStream (java.io.InputStream)10 TikaInputStream (org.apache.tika.io.TikaInputStream)9 RecursiveParserWrapper (org.apache.tika.parser.RecursiveParserWrapper)9 ParseContext (org.apache.tika.parser.ParseContext)8 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)7 Parser (org.apache.tika.parser.Parser)7 DefaultHandler (org.xml.sax.helpers.DefaultHandler)7 TikaTest (org.apache.tika.TikaTest)6 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)4 ByteArrayInputStream (java.io.ByteArrayInputStream)3 IOException (java.io.IOException)3 InputStreamReader (java.io.InputStreamReader)2 ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)2 RecursiveParserWrapperFSConsumer (org.apache.tika.batch.fs.RecursiveParserWrapperFSConsumer)2 TikaConfig (org.apache.tika.config.TikaConfig)2 EmptyParser (org.apache.tika.parser.EmptyParser)2 ContentHandler (org.xml.sax.ContentHandler)2