Search in sources :

Example 11 with RecursiveParserWrapper

Use of org.apache.tika.parser.RecursiveParserWrapper in the Apache Tika project.

From the class PDFParserTest, method testEmbeddedFilesInChildren.

/**
 * Regression test for TIKA-1228 and TIKA-1268.
 *
 * <p>First checks that the XML produced for {@code testPDF_childAttachments.pdf}
 * contains text that lives only in an attached document. Then parses the same file
 * through a {@link RecursiveParserWrapper}, with inline-image extraction switched on,
 * and checks the number of collected {@link Metadata} objects together with the
 * resource names and content types recorded on them.
 *
 * @throws Exception if reading or parsing the test document fails
 */
@Test
public void testEmbeddedFilesInChildren() throws Exception {
    // "regressiveness" exists only in Unit10.doc, not in the container PDF document
    String containerXml = getXML("/testPDF_childAttachments.pdf").xml;
    assertTrue(containerXml.contains("regressiveness"));

    // NOTE(review): the -1 arguments below presumably mean "no write limit" — confirm.
    AutoDetectParser delegate = new AutoDetectParser();
    BasicContentHandlerFactory handlerFactory =
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1);
    RecursiveParserWrapper wrapper = new RecursiveParserWrapper(delegate, handlerFactory);

    ParseContext parseContext = new ParseContext();
    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);
    pdfConfig.setExtractUniqueInlineImagesOnly(false);
    parseContext.set(PDFParserConfig.class, pdfConfig);
    // The wrapper registers itself as the Parser available through the context.
    parseContext.set(org.apache.tika.parser.Parser.class, wrapper);

    try (TikaInputStream pdfStream = TikaInputStream.get(
            getResourceAsStream("/test-documents/testPDF_childAttachments.pdf"))) {
        wrapper.parse(pdfStream, new BodyContentHandler(-1), new Metadata(), parseContext);
    }

    List<Metadata> collected = wrapper.getMetadata();
    assertEquals(5, collected.size());

    // Entry 0 carries no resource name (presumably the container PDF itself).
    Metadata head = collected.get(0);
    Metadata jpeg = collected.get(1);
    Metadata tiff = collected.get(2);
    Metadata jobOptions = collected.get(3);
    Metadata wordDoc = collected.get(4);

    // Resource names
    assertNull(head.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("image0.jpg", jpeg.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("Press Quality(1).joboptions", jobOptions.get(Metadata.RESOURCE_NAME_KEY));
    assertEquals("Unit10.doc", wordDoc.get(Metadata.RESOURCE_NAME_KEY));

    // Content types
    assertEquals(MediaType.image("jpeg").toString(), jpeg.get(Metadata.CONTENT_TYPE));
    assertEquals(MediaType.image("tiff").toString(), tiff.get(Metadata.CONTENT_TYPE));
    assertEquals("text/plain; charset=ISO-8859-1", jobOptions.get(Metadata.CONTENT_TYPE));
    assertEquals(TYPE_DOC.toString(), wordDoc.get(Metadata.CONTENT_TYPE));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) BasicContentHandlerFactory(org.apache.tika.sax.BasicContentHandlerFactory) Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) RecursiveParserWrapper(org.apache.tika.parser.RecursiveParserWrapper) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

Metadata (org.apache.tika.metadata.Metadata)11 RecursiveParserWrapper (org.apache.tika.parser.RecursiveParserWrapper)11 ParseContext (org.apache.tika.parser.ParseContext)9 BasicContentHandlerFactory (org.apache.tika.sax.BasicContentHandlerFactory)9 Parser (org.apache.tika.parser.Parser)8 InputStream (java.io.InputStream)7 TikaInputStream (org.apache.tika.io.TikaInputStream)7 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)7 DefaultHandler (org.xml.sax.helpers.DefaultHandler)5 TikaTest (org.apache.tika.TikaTest)4 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)4 Test (org.junit.Test)4 OutputStreamWriter (java.io.OutputStreamWriter)2 Writer (java.io.Writer)2 EmptyParser (org.apache.tika.parser.EmptyParser)2 ContentHandler (org.xml.sax.ContentHandler)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 FileInputStream (java.io.FileInputStream)1 IOException (java.io.IOException)1 OutputStream (java.io.OutputStream)1