Use of org.apache.tika.sax.BasicContentHandlerFactory in the Apache Tika project.
Class ParsingExample, method recursiveParserWrapperExample.
/**
 * Parses a sample container document and returns one metadata object per
 * document found in it.
 * <p>
 * For documents that may contain embedded documents, it might be helpful
 * to create a list of metadata objects, one for the container document and
 * one for each embedded document. This allows easy access to both the
 * extracted content and the metadata of each embedded document.
 * Note that many document formats can contain embedded documents,
 * including traditional container formats -- zip, tar and others -- but also
 * common office document formats including: MSWord, MSExcel,
 * MSPowerPoint, RTF, PDF, MSG and several others.
 * <p>
 * The "content" format is determined by the ContentHandlerFactory, and
 * the content is stored in {@link org.apache.tika.parser.RecursiveParserWrapper#TIKA_CONTENT}
 * <p>
 * The drawback to the RecursiveParserWrapper is that it caches metadata and contents
 * in memory. This should not be used on files whose contents are too big to be handled
 * in memory.
 *
 * @return a list of metadata objects, one for the container file and one for each embedded file
 * @throws IOException if the sample document is not found on the classpath or cannot be read
 * @throws SAXException if the parse reports a SAX error
 * @throws TikaException if the parse reports a Tika error
 */
public List<Metadata> recursiveParserWrapperExample() throws IOException, SAXException, TikaException {
    // Single source of truth for the sample file: used both as the resource
    // name recorded in the metadata and as the classpath lookup key.
    final String resourceName = "test_recursive_embedded.docx";

    Parser p = new AutoDetectParser();
    // NOTE(review): -1 presumably means "no write limit" -- confirm against BasicContentHandlerFactory.
    ContentHandlerFactory factory = new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.HTML, -1);
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, factory);

    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, resourceName);
    ParseContext context = new ParseContext();

    try (InputStream stream = ParsingExample.class.getResourceAsStream(resourceName)) {
        // getResourceAsStream returns null (rather than throwing) when the
        // resource is missing; fail with a descriptive IOException instead of
        // handing a null stream to the parser.
        if (stream == null) {
            throw new IOException("Classpath resource not found: " + resourceName);
        }
        // The handler passed here is a plain DefaultHandler; as described in the
        // Javadoc, the extracted content is shaped by the ContentHandlerFactory.
        wrapper.parse(stream, new DefaultHandler(), metadata, context);
    }
    return wrapper.getMetadata();
}
Use of org.apache.tika.sax.BasicContentHandlerFactory in the Apache Tika project.
Class PDFParserTest, method testEmbeddedFilesInChildren.
/**
 * Regression test for TIKA-1228 and TIKA-1268.
 * <p>
 * First checks that text which lives only in an embedded file shows up in the
 * XML produced for the container PDF, then runs the same PDF through a
 * {@link RecursiveParserWrapper} (with inline-image extraction enabled) and
 * verifies the number, resource names and content types of the collected
 * metadata objects.
 */
@Test
public void testEmbeddedFilesInChildren() throws Exception {
    // "regressiveness" exists only in Unit10.doc, not in the container pdf document
    String containerXml = getXML("/testPDF_childAttachments.pdf").xml;
    assertTrue(containerXml.contains("regressiveness"));

    // Recursive wrapper whose factory uses the IGNORE handler type.
    BasicContentHandlerFactory ignoringFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1);
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser(), ignoringFactory);

    // Extract every inline image, including repeated ones.
    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);
    pdfConfig.setExtractUniqueInlineImagesOnly(false);

    // Register the PDF config and the wrapper itself (under Parser.class) in the parse context.
    ParseContext parseContext = new ParseContext();
    parseContext.set(PDFParserConfig.class, pdfConfig);
    parseContext.set(org.apache.tika.parser.Parser.class, wrapper);

    try (TikaInputStream pdfStream =
                 TikaInputStream.get(getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"))) {
        wrapper.parse(pdfStream, new BodyContentHandler(-1), new Metadata(), parseContext);
    }

    List<Metadata> collected = wrapper.getMetadata();
    assertEquals(5, collected.size());

    // Index 0 is expected to carry no resource name; indexes 1-4 are checked below.
    Metadata first = collected.get(0);
    Metadata second = collected.get(1);
    Metadata third = collected.get(2);
    Metadata fourth = collected.get(3);
    Metadata fifth = collected.get(4);

    // Resource names (the entry at index 2 has no name assertion).
    assertNull(first.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("image0.jpg", second.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("Press Quality(1).joboptions", fourth.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("Unit10.doc", fifth.get(Metadata.RESOURCE_NAME_KEY));

    // Content types.
    assertEquals(MediaType.image("jpeg").toString(), second.get(Metadata.CONTENT_TYPE));
    assertEquals(MediaType.image("tiff").toString(), third.get(Metadata.CONTENT_TYPE));
    assertEquals("text/plain; charset=ISO-8859-1", fourth.get(Metadata.CONTENT_TYPE));
    assertEquals(TYPE_DOC.toString(), fifth.get(Metadata.CONTENT_TYPE));
}
Aggregations