Search in sources :

Example 11 with ParserContainerExtractor

use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.

the class POIContainerExtractionTest method testPowerpointImages.

/**
 * Runs the container extractor over {@code pictures.ppt} and checks that
 * both a JPEG and a PNG show up among the media types recorded by the
 * tracking handler.
 */
@Test
public void testPowerpointImages() throws Exception {
    ContainerExtractor containerExtractor = new ParserContainerExtractor();
    // NOTE(review): the final "false" presumably disables recursion into
    // embedded documents - confirm against process()
    TrackingHandler tracker = process("pictures.ppt", containerExtractor, false);

    MediaType jpeg = new MediaType("image", "jpeg");
    assertTrue(tracker.mediaTypes.contains(jpeg));

    MediaType png = new MediaType("image", "png");
    assertTrue(tracker.mediaTypes.contains(png));
}
Also used: MediaType (org.apache.tika.mime.MediaType), ContainerExtractor (org.apache.tika.extractor.ContainerExtractor), ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor), Test (org.junit.Test)

Example 12 with ParserContainerExtractor

use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.

the class POIContainerExtractionTest method testEmbeddedImages.

/**
 * Office files that contain embedded images, but no embedded office
 * documents.
 * <p>
 * For each fixture the test checks how many resources were reported and,
 * in order, the filename and media type recorded for each of them.
 */
@Test
public void testEmbeddedImages() throws Exception {
    ContainerExtractor containerExtractor = new ParserContainerExtractor();

    // Excel with 1 image: a single PNG, reported without a filename
    TrackingHandler excelTracker = process("testEXCEL_1img.xls", containerExtractor, false);
    assertEquals(1, excelTracker.filenames.size());
    assertEquals(1, excelTracker.mediaTypes.size());
    assertEquals(null, excelTracker.filenames.get(0));
    assertEquals(TYPE_PNG, excelTracker.mediaTypes.get(0));

    // PowerPoint with 2 images + sound
    // TODO

    // Word with 1 image: a single named PNG
    TrackingHandler oneImageTracker = process("testWORD_1img.doc", containerExtractor, false);
    assertEquals(1, oneImageTracker.filenames.size());
    assertEquals(1, oneImageTracker.mediaTypes.size());
    assertEquals("image1.png", oneImageTracker.filenames.get(0));
    assertEquals(TYPE_PNG, oneImageTracker.mediaTypes.get(0));

    // Word with 3 images: PNG, JPG, PNG, in that order
    TrackingHandler threeImageTracker = process("testWORD_3imgs.doc", containerExtractor, false);
    assertEquals(3, threeImageTracker.filenames.size());
    assertEquals(3, threeImageTracker.mediaTypes.size());
    assertEquals("image1.png", threeImageTracker.filenames.get(0));
    assertEquals("image2.jpg", threeImageTracker.filenames.get(1));
    assertEquals("image3.png", threeImageTracker.filenames.get(2));
    assertEquals(TYPE_PNG, threeImageTracker.mediaTypes.get(0));
    assertEquals(TYPE_JPG, threeImageTracker.mediaTypes.get(1));
    assertEquals(TYPE_PNG, threeImageTracker.mediaTypes.get(2));
}
Also used: ContainerExtractor (org.apache.tika.extractor.ContainerExtractor), ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor), Test (org.junit.Test)

Example 13 with ParserContainerExtractor

use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.

the class POIContainerExtractionTest method testEmbeddedOfficeFiles.

/**
 * Office files which have other office files
 * embedded into them. The embedded office files
 * will sometimes have images in them.
 * <p/>
 * eg xls
 * -> word
 * -> image
 * -> image
 * -> powerpoint
 * -> excel
 * -> image
 * <p/>
 * Each fixture is processed without recursion and, where the test does so,
 * again with recursion; the reported filenames and media types are then
 * checked position by position. Absent filenames are asserted with
 * {@code assertNull} throughout.
 */
@Test
public void testEmbeddedOfficeFiles() throws Exception {
    ContainerExtractor extractor = new ParserContainerExtractor();
    TrackingHandler handler;
    // Excel with a word doc and a powerpoint doc, both of which have images in them
    // Without recursion, should see both documents + the images
    handler = process("testEXCEL_embeded.xls", extractor, false);
    assertEquals(5, handler.filenames.size());
    assertEquals(5, handler.mediaTypes.size());
    // The icons and the image have no filename; the embedded documents do
    assertNull(handler.filenames.get(0));
    assertNull(handler.filenames.get(1));
    assertNull(handler.filenames.get(2));
    assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
    assertEquals("MBD00032A24.doc", handler.filenames.get(4));
    // But we do know their types
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(1));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
    // Embedded office doc
    assertEquals(TYPE_PPT, handler.mediaTypes.get(3));
    // Embedded office doc
    assertEquals(TYPE_DOC, handler.mediaTypes.get(4));
    // With recursion, should get the images embedded in the office files too
    handler = process("testEXCEL_embeded.xls", extractor, true);
    assertEquals(17, handler.filenames.size());
    assertEquals(17, handler.mediaTypes.size());
    assertNull(handler.filenames.get(0));
    assertNull(handler.filenames.get(1));
    assertNull(handler.filenames.get(2));
    assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
    assertEquals("1", handler.filenames.get(4));
    assertNull(handler.filenames.get(5));
    assertEquals("2", handler.filenames.get(6));
    assertEquals("image1.png", handler.filenames.get(7));
    assertEquals("image2.jpg", handler.filenames.get(8));
    assertEquals("image3.png", handler.filenames.get(9));
    assertEquals("image1.png", handler.filenames.get(16));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(1));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
    // Embedded presentation
    assertEquals(TYPE_PPT, handler.mediaTypes.get(3));
    // Embedded XLS
    assertEquals(TYPE_XLS, handler.mediaTypes.get(4));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(5));
    // Embedded office doc
    assertEquals(TYPE_DOC, handler.mediaTypes.get(6));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(7));
    // Embedded image
    assertEquals(TYPE_JPG, handler.mediaTypes.get(8));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(9));
    // Embedded office doc
    assertEquals(TYPE_DOC, handler.mediaTypes.get(15));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(16));
    // Word with .docx, powerpoint and excel
    handler = process("testWORD_embeded.doc", extractor, false);
    assertEquals(9, handler.filenames.size());
    assertEquals(9, handler.mediaTypes.size());
    // Filenames are a bit iffy...
    // Should really be 3*embedded pictures then 3*icons then embedded docs
    assertEquals("image1.emf", handler.filenames.get(0));
    assertEquals("image4.png", handler.filenames.get(1));
    assertEquals("image5.jpg", handler.filenames.get(2));
    assertEquals("image6.png", handler.filenames.get(3));
    assertEquals("image2.emf", handler.filenames.get(4));
    assertEquals("image3.emf", handler.filenames.get(5));
    assertNull(handler.filenames.get(6));
    assertEquals("_1345471035.ppt", handler.filenames.get(7));
    assertEquals("_1345470949.xls", handler.filenames.get(8));
    // But we do know their types
    // Icon of embedded office doc?
    assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
    // Embedded image - logo
    assertEquals(TYPE_PNG, handler.mediaTypes.get(1));
    // Embedded image - safe
    assertEquals(TYPE_JPG, handler.mediaTypes.get(2));
    // Embedded image - try
    assertEquals(TYPE_PNG, handler.mediaTypes.get(3));
    // Icon of embedded office doc?
    assertEquals(TYPE_EMF, handler.mediaTypes.get(4));
    // Icon of embedded office doc?
    assertEquals(TYPE_EMF, handler.mediaTypes.get(5));
    // Embedded office doc
    assertEquals(TYPE_DOCX, handler.mediaTypes.get(6));
    // Embedded office doc
    assertEquals(TYPE_PPT, handler.mediaTypes.get(7));
    // Embedded office doc
    assertEquals(TYPE_XLS, handler.mediaTypes.get(8));
    // With recursion, should get their images too
    handler = process("testWORD_embeded.doc", extractor, true);
    assertEquals(16, handler.filenames.size());
    assertEquals(16, handler.mediaTypes.size());
    // We don't know their filenames, except for doc images + docx
    assertEquals("image1.emf", handler.filenames.get(0));
    assertEquals("image4.png", handler.filenames.get(1));
    assertEquals("image5.jpg", handler.filenames.get(2));
    assertEquals("image6.png", handler.filenames.get(3));
    assertEquals("image2.emf", handler.filenames.get(4));
    assertEquals("image3.emf", handler.filenames.get(5));
    assertNull(handler.filenames.get(6));
    assertEquals("image2.png", handler.filenames.get(7));
    assertEquals("image3.jpeg", handler.filenames.get(8));
    assertEquals("image4.png", handler.filenames.get(9));
    // Entries 11..13 carry no filename
    for (int i = 11; i < 14; i++) {
        assertNull(handler.filenames.get(i));
    }
    // But we do know their types
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
    // Embedded image - logo
    assertEquals(TYPE_PNG, handler.mediaTypes.get(1));
    // Embedded image - safe
    assertEquals(TYPE_JPG, handler.mediaTypes.get(2));
    // Embedded image - try
    assertEquals(TYPE_PNG, handler.mediaTypes.get(3));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(4));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(5));
    // Embedded office doc
    assertEquals(TYPE_DOCX, handler.mediaTypes.get(6));
    //    PNG inside .docx
    assertEquals(TYPE_PNG, handler.mediaTypes.get(7));
    //    JPG inside .docx
    assertEquals(TYPE_JPG, handler.mediaTypes.get(8));
    //    PNG inside .docx
    assertEquals(TYPE_PNG, handler.mediaTypes.get(9));
    // Embedded office doc
    assertEquals(TYPE_PPT, handler.mediaTypes.get(10));
    // Embedded office doc
    assertEquals(TYPE_XLS, handler.mediaTypes.get(14));
    //    PNG inside .xls
    assertEquals(TYPE_PNG, handler.mediaTypes.get(15));
    // PowerPoint with excel and word
    handler = process("testPPT_embeded.ppt", extractor, false);
    assertEquals(7, handler.filenames.size());
    assertEquals(7, handler.mediaTypes.size());
    // We don't get all that helpful filenames
    assertEquals("1", handler.filenames.get(0));
    assertEquals("2", handler.filenames.get(1));
    assertNull(handler.filenames.get(2));
    assertNull(handler.filenames.get(3));
    assertNull(handler.filenames.get(4));
    assertNull(handler.filenames.get(5));
    assertNull(handler.filenames.get(6));
    // But we do know their types
    // Embedded office doc
    assertEquals(TYPE_XLS, handler.mediaTypes.get(0));
    // Embedded office doc
    assertEquals(TYPE_DOC, handler.mediaTypes.get(1));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(2));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(3));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(4));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(5));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(6));
    // Run again on PowerPoint but with recursion
    handler = process("testPPT_embeded.ppt", extractor, true);
    assertEquals(11, handler.filenames.size());
    assertEquals(11, handler.mediaTypes.size());
    assertEquals("1", handler.filenames.get(0));
    assertNull(handler.filenames.get(1));
    assertEquals("2", handler.filenames.get(2));
    assertEquals("image1.png", handler.filenames.get(3));
    assertEquals("image2.jpg", handler.filenames.get(4));
    assertEquals("image3.png", handler.filenames.get(5));
    assertNull(handler.filenames.get(6));
    assertNull(handler.filenames.get(7));
    assertNull(handler.filenames.get(8));
    assertNull(handler.filenames.get(9));
    assertNull(handler.filenames.get(10));
    // Embedded office doc
    assertEquals(TYPE_XLS, handler.mediaTypes.get(0));
    //    PNG inside .xls
    assertEquals(TYPE_PNG, handler.mediaTypes.get(1));
    // Embedded office doc
    assertEquals(TYPE_DOC, handler.mediaTypes.get(2));
    //    PNG inside .doc
    assertEquals(TYPE_PNG, handler.mediaTypes.get(3));
    //    JPG inside .doc
    assertEquals(TYPE_JPG, handler.mediaTypes.get(4));
    //    PNG inside .doc
    assertEquals(TYPE_PNG, handler.mediaTypes.get(5));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(6));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(7));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(8));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(9));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(10));
    // Word, with a non-office file (PDF)
    handler = process("testWORD_embedded_pdf.doc", extractor, true);
    assertEquals(2, handler.filenames.size());
    assertEquals(2, handler.mediaTypes.size());
    assertEquals("image1.emf", handler.filenames.get(0));
    assertEquals("_1402837031.pdf", handler.filenames.get(1));
    // Icon of embedded pdf
    assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
    // The embedded PDF itself
    assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
    // Outlook with a text file and a word document
    handler = process("testMSG_att_doc.msg", extractor, true);
    assertEquals(2, handler.filenames.size());
    assertEquals(2, handler.mediaTypes.size());
    assertEquals("test-unicode.doc", handler.filenames.get(0));
    assertEquals(TYPE_DOC, handler.mediaTypes.get(0));
    assertEquals("pj1.txt", handler.filenames.get(1));
    assertEquals(TYPE_TXT, handler.mediaTypes.get(1));
    // Outlook with a pdf and another outlook message
    handler = process("testMSG_att_msg.msg", extractor, true);
    assertEquals(2, handler.filenames.size());
    assertEquals(2, handler.mediaTypes.size());
    assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
    assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
    assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
    assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
}
Also used: ContainerExtractor (org.apache.tika.extractor.ContainerExtractor), ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor), Test (org.junit.Test)

Example 14 with ParserContainerExtractor

use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.

the class RTFParserTest method testEmbeddedLinkedDocument.

// TIKA-1010: test an RTF file that contains a linked embedded document.
// The same file is extracted twice: once with a handler constructed with
// EMF/WMF as skip types, and once with a default handler.
@Test
public void testEmbeddedLinkedDocument() throws Exception {
    Set<MediaType> ignoredTypes = new HashSet<>();
    ignoredTypes.add(MediaType.parse("image/emf"));
    ignoredTypes.add(MediaType.parse("image/wmf"));

    // First pass: handler built with the EMF/WMF skip set
    TrackingHandler filteringTracker = new TrackingHandler(ignoredTypes);
    try (TikaInputStream stream = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
        ContainerExtractor containerExtractor = new ParserContainerExtractor();
        assertEquals(true, containerExtractor.isSupported(stream));
        // The extractor is also passed as its own recursive extractor
        containerExtractor.extract(stream, containerExtractor, filteringTracker);
    }
    // should gracefully skip link and not throw NPE, IOEx, etc
    assertEquals(0, filteringTracker.filenames.size());

    // Second pass: default handler, fresh stream and extractor
    TrackingHandler plainTracker = new TrackingHandler();
    try (TikaInputStream stream = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
        ContainerExtractor containerExtractor = new ParserContainerExtractor();
        assertEquals(true, containerExtractor.isSupported(stream));
        containerExtractor.extract(stream, containerExtractor, plainTracker);
    }
    // still must not throw; with no skip types two entries are recorded
    // (presumably the EMF/WMF images filtered out above - TODO confirm)
    assertEquals(2, plainTracker.filenames.size());
}
Also used: MediaType (org.apache.tika.mime.MediaType), TikaInputStream (org.apache.tika.io.TikaInputStream), ContainerExtractor (org.apache.tika.extractor.ContainerExtractor), ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor), HashSet (java.util.HashSet), Test (org.junit.Test), TikaTest (org.apache.tika.TikaTest)

Example 15 with ParserContainerExtractor

use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.

the class RTFParserTest method testBinControlWord.

// TIKA-782: extracts the embedded resource from testBinControlWord.rtf and
// spot-checks the bytes that were copied out of it.
@Test
public void testBinControlWord() throws Exception {
    ByteCopyingHandler copyingHandler = new ByteCopyingHandler();
    try (TikaInputStream stream = TikaInputStream.get(getResourceAsStream("/test-documents/testBinControlWord.rtf"))) {
        ContainerExtractor containerExtractor = new ParserContainerExtractor();
        assertEquals(true, containerExtractor.isSupported(stream));
        // The extractor is also passed as its own recursive extractor
        containerExtractor.extract(stream, containerExtractor, copyingHandler);
    }

    // Exactly one embedded resource, ten bytes long
    assertEquals(1, copyingHandler.bytes.size());
    byte[] payload = copyingHandler.bytes.get(0);
    assertEquals(10, payload.length);
    // byte 4 must be 125, the ASCII code for '}'
    assertEquals(125, (int) payload[4]);
    // make sure that at least the last value is correct (-1 is 0xFF as a signed byte)
    assertEquals(-1, (int) payload[9]);
}
Also used: TikaInputStream (org.apache.tika.io.TikaInputStream), ContainerExtractor (org.apache.tika.extractor.ContainerExtractor), ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor), Test (org.junit.Test), TikaTest (org.apache.tika.TikaTest)

Aggregations

ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor): 15, Test (org.junit.Test): 13, ContainerExtractor (org.apache.tika.extractor.ContainerExtractor): 11, TikaInputStream (org.apache.tika.io.TikaInputStream): 8, TikaTest (org.apache.tika.TikaTest): 6, InputStream (java.io.InputStream): 2, TrackingHandler (org.apache.tika.TikaTest.TrackingHandler): 2, Metadata (org.apache.tika.metadata.Metadata): 2, MediaType (org.apache.tika.mime.MediaType): 2, ByteArrayInputStream (java.io.ByteArrayInputStream): 1, IOException (java.io.IOException): 1, HashSet (java.util.HashSet): 1, Tika (org.apache.tika.Tika): 1, MimeTypeException (org.apache.tika.mime.MimeTypeException): 1, TesseractOCRParserTest (org.apache.tika.parser.ocr.TesseractOCRParserTest): 1, Before (org.junit.Before): 1, ConvertedDocument (org.opensextant.xtext.ConvertedDocument): 1