Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class RecursiveParserWrapperTest, method testCharLimit.
/**
 * Parses {@code test_recursive_embedded.docx} through a
 * {@link RecursiveParserWrapper} whose text handler factory is constructed
 * with a limit argument of 60, then checks that five metadata objects are
 * produced and that exactly one of them carries
 * {@link RecursiveParserWrapper#WRITE_LIMIT_REACHED} set to {@code "true"}.
 *
 * @throws Exception if parsing the test document fails
 */
@Test
public void testCharLimit() throws Exception {
    ParseContext context = new ParseContext();
    Metadata metadata = new Metadata();
    Parser wrapped = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(wrapped, new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
    InputStream stream = RecursiveParserWrapperTest.class.getResourceAsStream("/test-documents/test_recursive_embedded.docx");
    try {
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    } finally {
        // Release the classpath resource even when parsing throws;
        // getResourceAsStream returns null when the resource is missing.
        if (stream != null) {
            stream.close();
        }
    }
    List<Metadata> list = wrapper.getMetadata();
    assertEquals(5, list.size());
    // Count the metadata objects flagged as having hit the write limit.
    int wlr = 0;
    for (Metadata m : list) {
        // Constant-first equals is null-safe: absent keys simply do not match.
        if ("true".equals(m.get(RecursiveParserWrapper.WRITE_LIMIT_REACHED))) {
            wlr++;
        }
    }
    assertEquals(1, wlr);
}
Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class RecursiveParserWrapperTest, method testBasicHTML.
/**
 * Runs the shared {@code getMetadata} helper with an HTML handler factory
 * (limit argument {@code -1}) and checks that the content stored on the
 * first returned metadata object contains the empty header paragraph
 * written with an explicit closing tag.
 *
 * @throws Exception if the helper fails to produce metadata
 */
@Test
public void testBasicHTML() throws Exception {
    Metadata input = new Metadata();
    BasicContentHandlerFactory htmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
    List<Metadata> results = getMetadata(input, htmlFactory);
    String html = results.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
    // Not much differentiates HTML from XML in this test file; the
    // non-self-closing <p> element is the marker checked here.
    assertTrue(html.contains("<p class=\"header\"></p>"));
}
Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class RecursiveParserWrapperTest, method testIgnoreContent.
/**
 * Runs the shared {@code getMetadata} helper with an IGNORE handler factory
 * (limit argument {@code -1}) and checks that the first returned metadata
 * object has no value stored under
 * {@link RecursiveParserWrapper#TIKA_CONTENT}.
 *
 * @throws Exception if the helper fails to produce metadata
 */
@Test
public void testIgnoreContent() throws Exception {
    Metadata input = new Metadata();
    BasicContentHandlerFactory ignoringFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1);
    List<Metadata> results = getMetadata(input, ignoringFactory);
    Metadata first = results.get(0);
    assertNull(first.get(RecursiveParserWrapper.TIKA_CONTENT));
}
Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class ClassParserTest, method testClassParsing.
/**
 * Extracts text from the compiled {@code AutoDetectParser.class} test
 * resource via the {@code Tika} facade and checks both the populated
 * metadata (title and resource name) and that the extracted text contains
 * the expected package, class, field and method declarations.
 *
 * @throws Exception if the resource cannot be parsed
 */
@Test
public void testClassParsing() throws Exception {
    String resource = "/test-documents/AutoDetectParser.class";
    Metadata md = new Metadata();
    String text = new Tika().parseToString(
            ClassParserTest.class.getResourceAsStream(resource), md);

    assertEquals("AutoDetectParser", md.get(TikaCoreProperties.TITLE));
    assertEquals("AutoDetectParser.class", md.get(Metadata.RESOURCE_NAME_KEY));

    // Declarations that must appear in the extracted text, checked in order.
    String[] expectedFragments = {
        "package org.apache.tika.parser;",
        "class AutoDetectParser extends CompositeParser",
        "private org.apache.tika.mime.MimeTypes types",
        "public void parse(" + "java.io.InputStream, org.xml.sax.ContentHandler," + " org.apache.tika.metadata.Metadata) throws" + " java.io.IOException, org.xml.sax.SAXException," + " org.apache.tika.exception.TikaException;",
        "private byte[] getPrefix(java.io.InputStream, int)" + " throws java.io.IOException;"
    };
    for (String fragment : expectedFragments) {
        assertTrue(text.contains(fragment));
    }
}
Use of org.apache.tika.metadata.Metadata in the Apache Tika project.
Class RecursiveParserWrapperTest, method testEmbeddedResourcePath.
/**
 * Runs the shared {@code getMetadata} helper with an XML handler factory
 * and a metadata object whose resource name is
 * {@code test_recursive_embedded.docx}. Checks that the first returned
 * metadata object's content contains the self-closing header paragraph and
 * that the set of non-null
 * {@link RecursiveParserWrapper#EMBEDDED_RESOURCE_PATH} values across all
 * returned metadata objects equals the expected set of embedded paths.
 *
 * @throws Exception if the helper fails to produce metadata
 */
@Test
public void testEmbeddedResourcePath() throws Exception {
    String[] expectedPaths = {
        "/embed1.zip",
        "/embed1.zip/embed2.zip",
        "/embed1.zip/embed2.zip/embed3.zip",
        "/embed1.zip/embed2.zip/embed3.zip/embed4.zip",
        "/embed1.zip/embed2.zip/embed3.zip/embed4.zip/embed4.txt",
        "/embed1.zip/embed2.zip/embed3.zip/embed3.txt",
        "/embed1.zip/embed2.zip/embed2a.txt",
        "/embed1.zip/embed2.zip/embed2b.txt",
        "/embed1.zip/embed1b.txt",
        "/embed1.zip/embed1a.txt",
        "/image1.emf"
    };
    Set<String> targets = new HashSet<String>();
    for (String expected : expectedPaths) {
        targets.add(expected);
    }

    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
    BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    List<Metadata> list = getMetadata(metadata, xmlFactory);

    // The XML handler output is expected to use the self-closing form.
    String xml = list.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
    assertTrue(xml.contains("<p class=\"header\" />"));

    // Collect every embedded resource path that was actually reported.
    Set<String> seen = new HashSet<String>();
    for (Metadata entry : list) {
        String embeddedPath = entry.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH);
        if (embeddedPath == null) {
            continue;
        }
        seen.add(embeddedPath);
    }
    assertEquals(targets, seen);
}
Aggregations