Search in sources :

Example 71 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class RecursiveParserWrapperTest method testCharLimit.

/**
 * Parses the recursive-embedded test document through a {@link RecursiveParserWrapper}
 * whose TEXT content handler factory is built with a limit argument of 60, then checks
 * that five {@link Metadata} objects are produced and that exactly one of them carries
 * {@code WRITE_LIMIT_REACHED} set to {@code "true"}.
 *
 * @throws Exception if parsing fails or the test resource cannot be read
 */
@Test
public void testCharLimit() throws Exception {
    ParseContext context = new ParseContext();
    Metadata metadata = new Metadata();
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
    InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream("/test-documents/test_recursive_embedded.docx");
    try {
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    } finally {
        // The parser does not close the stream it is handed, so release it here.
        // getResourceAsStream returns null when the resource is missing.
        if (stream != null) {
            stream.close();
        }
    }
    List<Metadata> list = wrapper.getMetadata();
    assertEquals(5, list.size());
    int wlr = 0;
    for (Metadata m : list) {
        // Constant-first equals is null-safe, so no separate null check is needed.
        if ("true".equals(m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED))) {
            wlr++;
        }
    }
    assertEquals(1, wlr);
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) DefaultHandler(org.xml.sax.helpers.DefaultHandler) Test(org.junit.Test)

Example 72 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class RecursiveParserWrapperTest method testBasicHTML.

/**
 * Runs the shared {@code getMetadata} helper with an HTML content handler factory and
 * checks that the container's extracted content includes the empty header paragraph
 * written with a separate closing tag.
 *
 * @throws Exception if the helper fails to produce metadata
 */
@Test
public void testBasicHTML() throws Exception {
    Metadata input = new Metadata();
    BasicContentHandlerFactory htmlFactory = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
    List<Metadata> results = getMetadata(input, htmlFactory);
    // The first entry is treated as the container document.
    String html = results.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
    // not much differentiates html from xml in this test file
    assertTrue(html.contains("<p class=\"header\"></p>"));
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) Test(org.junit.Test)

Example 73 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class RecursiveParserWrapperTest method testIgnoreContent.

/**
 * Runs the shared {@code getMetadata} helper with the IGNORE handler type and checks
 * that the first returned {@link Metadata} has no {@code TIKA_CONTENT} value.
 *
 * @throws Exception if the helper fails to produce metadata
 */
@Test
public void testIgnoreContent() throws Exception {
    Metadata input = new Metadata();
    BasicContentHandlerFactory ignoreFactory = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1);
    List<Metadata> results = getMetadata(input, ignoreFactory);
    Metadata first = results.get(0);
    assertNull(first.get(RecursiveParserWrapper.TIKA_CONTENT));
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) Test(org.junit.Test)

Example 74 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class ClassParserTest method testClassParsing.

/**
 * Parses the compiled {@code AutoDetectParser.class} test resource to a string via the
 * {@link Tika} facade, then checks the title and resource-name metadata and that the
 * extracted text contains the expected declarations.
 *
 * @throws Exception if the resource cannot be parsed
 */
@Test
public void testClassParsing() throws Exception {
    Metadata parsedMetadata = new Metadata();
    String text = new Tika().parseToString(
            ClassParserTest.class.getResourceAsStream("/test-documents/AutoDetectParser.class"),
            parsedMetadata);

    assertEquals("AutoDetectParser", parsedMetadata.get(TikaCoreProperties.TITLE));
    assertEquals("AutoDetectParser.class", parsedMetadata.get(Metadata.RESOURCE_NAME_KEY));

    // Declarations expected in the extracted text, checked in this order.
    String[] expectedFragments = {
        "package org.apache.tika.parser;",
        "class AutoDetectParser extends CompositeParser",
        "private org.apache.tika.mime.MimeTypes types",
        "public void parse(java.io.InputStream, org.xml.sax.ContentHandler, org.apache.tika.metadata.Metadata) throws java.io.IOException, org.xml.sax.SAXException, org.apache.tika.exception.TikaException;",
        "private byte[] getPrefix(java.io.InputStream, int) throws java.io.IOException;"
    };
    for (String fragment : expectedFragments) {
        assertTrue(text.contains(fragment));
    }
}
Also used : Metadata(org.apache.tika.metadata.Metadata) Tika(org.apache.tika.Tika) Test(org.junit.Test)

Example 75 with Metadata

use of org.apache.tika.metadata.Metadata in project tika by apache.

the class RecursiveParserWrapperTest method testEmbeddedResourcePath.

/**
 * Runs the shared {@code getMetadata} helper with an XML content handler factory and a
 * resource name of {@code test_recursive_embedded.docx}, then checks that the set of
 * non-null {@code EMBEDDED_RESOURCE_PATH} values across all returned {@link Metadata}
 * objects equals the expected set of embedded paths.
 *
 * @throws Exception if the helper fails to produce metadata
 */
@Test
public void testEmbeddedResourcePath() throws Exception {
    String[] expectedPaths = {
        "/embed1.zip",
        "/embed1.zip/embed2.zip",
        "/embed1.zip/embed2.zip/embed3.zip",
        "/embed1.zip/embed2.zip/embed3.zip/embed4.zip",
        "/embed1.zip/embed2.zip/embed3.zip/embed4.zip/embed4.txt",
        "/embed1.zip/embed2.zip/embed3.zip/embed3.txt",
        "/embed1.zip/embed2.zip/embed2a.txt",
        "/embed1.zip/embed2.zip/embed2b.txt",
        "/embed1.zip/embed1b.txt",
        "/embed1.zip/embed1a.txt",
        "/image1.emf"
    };
    Set<String> expected = new HashSet<String>();
    for (String expectedPath : expectedPaths) {
        expected.add(expectedPath);
    }

    Metadata input = new Metadata();
    input.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
    BasicContentHandlerFactory xmlFactory = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    List<Metadata> results = getMetadata(input, xmlFactory);

    // With the XML handler the empty header paragraph appears as a self-closing tag.
    String xml = results.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
    assertTrue(xml.contains("<p class=\"header\" />"));

    // Collect every embedded path that was recorded; entries without one are skipped.
    Set<String> actual = new HashSet<String>();
    for (Metadata entry : results) {
        String embeddedPath = entry.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
        if (embeddedPath == null) {
            continue;
        }
        actual.add(embeddedPath);
    }
    assertEquals(expected, actual);
}
Also used : BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

Metadata (org.apache.tika.metadata.Metadata)651 Test (org.junit.Test)467 InputStream (java.io.InputStream)320 ParseContext (org.apache.tika.parser.ParseContext)283 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)269 TikaTest (org.apache.tika.TikaTest)257 ContentHandler (org.xml.sax.ContentHandler)229 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)154 ByteArrayInputStream (java.io.ByteArrayInputStream)143 Parser (org.apache.tika.parser.Parser)136 TikaInputStream (org.apache.tika.io.TikaInputStream)133 IOException (java.io.IOException)66 DefaultHandler (org.xml.sax.helpers.DefaultHandler)59 TikaException (org.apache.tika.exception.TikaException)48 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)36 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)36 StringWriter (java.io.StringWriter)33 Tika (org.apache.tika.Tika)29 MediaType (org.apache.tika.mime.MediaType)29 SAXException (org.xml.sax.SAXException)29