Example use of org.apache.tika.parser.RecursiveParserWrapper in the Apache Tika project.
Shown below: class PDFParserTest, method testEmbeddedFilesInChildren.
/**
 * Test tied to TIKA-1228 and TIKA-1268: parses testPDF_childAttachments.pdf and
 * checks both the XML output and the per-document metadata list produced by a
 * {@link RecursiveParserWrapper}.
 */
@Test
public void testEmbeddedFilesInChildren() throws Exception {
    // "regressiveness" exists only in Unit10.doc, not in the container pdf document,
    // so finding it in the XML means the attachment's content was included.
    String containerXml = getXML("/testPDF_childAttachments.pdf").xml;
    assertTrue(containerXml.contains("regressiveness"));

    // Build the recursive wrapper around an auto-detecting parser.
    AutoDetectParser autoDetectParser = new AutoDetectParser();
    BasicContentHandlerFactory handlerFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1);
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(autoDetectParser, handlerFactory);

    // PDF settings: extract inline images, and do not restrict extraction to unique ones.
    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);
    pdfConfig.setExtractUniqueInlineImagesOnly(false);

    // Register the PDF config and the wrapper itself (as the Parser) in the parse context.
    ParseContext parseContext = new ParseContext();
    parseContext.set(org.apache.tika.parser.pdf.PDFParserConfig.class, pdfConfig);
    parseContext.set(org.apache.tika.parser.Parser.class, wrapper);

    try (TikaInputStream stream =
                 TikaInputStream.get(getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"))) {
        wrapper.parse(stream, new BodyContentHandler(-1), new Metadata(), parseContext);
    }

    // Expect five metadata entries; index 0 carries no resource name.
    List<Metadata> metadataList = wrapper.getMetadata();
    assertEquals(5, metadataList.size());

    Metadata first = metadataList.get(0);
    Metadata second = metadataList.get(1);
    Metadata third = metadataList.get(2);
    Metadata fourth = metadataList.get(3);
    Metadata fifth = metadataList.get(4);

    // Resource names.
    assertNull(first.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("image0.jpg", second.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("Press Quality(1).joboptions", fourth.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("Unit10.doc", fifth.get(Metadata.RESOURCE_NAME_KEY));

    // Content types.
    assertEquals(MediaType.image("jpeg").toString(), second.get(Metadata.CONTENT_TYPE));
    assertEquals(MediaType.image("tiff").toString(), third.get(Metadata.CONTENT_TYPE));
    assertEquals("text/plain; charset=ISO-8859-1", fourth.get(Metadata.CONTENT_TYPE));
    assertEquals(TYPE_DOC.toString(), fifth.get(Metadata.CONTENT_TYPE));
}
Aggregations