Use of org.apache.tika.sax.BasicContentHandlerFactory in the Apache Tika project.
Class RecursiveParserWrapperTest, method testBasicXML.
/**
 * Runs the shared {@code getMetadata} helper with an XML content handler factory and
 * checks that the content stored on the first metadata entry contains XML markup.
 */
@Test
public void testBasicXML() throws Exception {
    BasicContentHandlerFactory xmlFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1);
    List<Metadata> results = getMetadata(new Metadata(), xmlFactory);
    String xml = results.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
    // Not much differentiates html from xml in this test file, so a single
    // self-closed paragraph tag is the marker we look for.
    assertTrue(xml.contains("<p class=\"header\" />"));
}
Use of org.apache.tika.sax.BasicContentHandlerFactory in the Apache Tika project.
Class RecursiveParserWrapperTest, method testEmbeddedNPE.
/**
 * Parses the same resource name twice through the {@code getMetadata} helper: once with
 * the two-argument form and once with the four-argument form passing {@code false} and
 * {@code null}. Expects an embedded NullPointerException to be recorded in the first run
 * and one fewer metadata entry in the second.
 */
@Test
public void testEmbeddedNPE() throws Exception {
    final String resourceName = "test_recursive_embedded_npe.docx";

    // Default behavior (the user doesn't specify whether or not to catch embedded
    // exceptions) is to catch the exception.
    Metadata defaultRun = new Metadata();
    defaultRun.set(Metadata.RESOURCE_NAME_KEY, resourceName);
    List<Metadata> withCaught = getMetadata(defaultRun,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
    assertEquals(13, withCaught.size());
    String embeddedException = withCaught.get(10).get(RecursiveParserWrapper.EMBEDDED_EXCEPTION);
    assertContains("java.lang.NullPointerException", embeddedException);

    // Composite parser swallows caught TikaExceptions, IOExceptions and SAXExceptions
    // and just doesn't bother to report that there was an exception.
    Metadata explicitRun = new Metadata();
    explicitRun.set(Metadata.RESOURCE_NAME_KEY, resourceName);
    List<Metadata> withoutCaught = getMetadata(explicitRun,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
            false, null);
    assertEquals(12, withoutCaught.size());
}
Use of org.apache.tika.sax.BasicContentHandlerFactory in the Apache Tika project.
Class RecursiveParserWrapperTest, method testBasicText.
/**
 * Runs the shared {@code getMetadata} helper with a TEXT content handler factory and
 * checks that the content on the first metadata entry carries no paragraph markup but
 * does mention {@code embed_0}.
 */
@Test
public void testBasicText() throws Exception {
    BasicContentHandlerFactory textFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1);
    List<Metadata> results = getMetadata(new Metadata(), textFactory);
    String text = results.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
    // Plain-text output must not contain an opening <p ...> tag.
    assertTrue(!text.contains("<p "));
    assertTrue(text.contains("embed_0"));
}
Use of org.apache.tika.sax.BasicContentHandlerFactory in the Apache Tika project.
Class RecursiveParserWrapperTest, method testDigesters.
/**
 * Parses {@code test_recursive_embedded.docx} through the four-argument
 * {@code getMetadata} helper with an MD5 {@link CommonsDigester} and checks the digest
 * recorded on the metadata entries at indexes 0, 6 and 7.
 */
@Test
public void testDigesters() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, "test_recursive_embedded.docx");
    List<Metadata> list = getMetadata(metadata,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1),
            true, new CommonsDigester(100000, CommonsDigester.DigestAlgorithm.MD5));
    // Metadata key under which the MD5 digest is expected to be stored.
    String md5Key = "X-TIKA:digest:MD5";
    Metadata m0 = list.get(0);
    Metadata m6 = list.get(6);
    assertEquals("59f626e09a8c16ab6dbc2800c685f772", m0.get(md5Key));
    assertEquals("ccdf3882e7e4c2454e28884db9b0a54d", m6.get(md5Key));
    assertEquals("a869bf6432ebd14e19fc79416274e0c9", list.get(7).get(md5Key));
}
Use of org.apache.tika.sax.BasicContentHandlerFactory in the Apache Tika project.
Class SQLite3ParserTest, method testRecursiveParserWrapper.
/**
 * Parses the SQLite test file through a {@link RecursiveParserWrapper} built on an
 * {@link AutoDetectParser} with a BODY content handler factory, then checks the number
 * of metadata entries, the extracted content of entries 0, 2 and 4, and the embedded
 * resource path of entry 1.
 */
@Test
public void testRecursiveParserWrapper() throws Exception {
    Parser p = new AutoDetectParser();
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p,
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.BODY, -1));
    Metadata metadata = new Metadata();
    try (InputStream is = getResourceAsStream(TEST_FILE1)) {
        metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
        wrapper.parse(is, new BodyContentHandler(-1), metadata, new ParseContext());
    }
    List<Metadata> metadataList = wrapper.getMetadata();
    assertEquals(5, metadataList.size());

    // All table assertions read the same content from the first entry, so look it up once.
    String table = metadataList.get(0).get(RecursiveParserWrapper.TIKA_CONTENT);
    //make sure the \t are inserted in a body handler
    assertContains("0\t2.3\t2.4\tlorem", table);
    assertContains("普林斯顿大学", table);
    //make sure the \n is inserted
    assertContains("do eiusmod tempor\n", table);

    assertContains("The quick brown fox", metadataList.get(2).get(RecursiveParserWrapper.TIKA_CONTENT));
    assertContains("The quick brown fox", metadataList.get(4).get(RecursiveParserWrapper.TIKA_CONTENT));
    //confirm .doc was added to blob
    assertEquals("/BYTES_COL_0.doc/image1.png", metadataList.get(1).get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH));
}
Aggregations