use of org.apache.tika.extractor.ContainerExtractor in project tika by apache.
the class POIContainerExtractionTest method testWithoutEmbedded.
/**
 * Office files with nothing embedded in them: the tracking handler
 * must not record anything, with or without recursion.
 */
@Test
public void testWithoutEmbedded() throws Exception {
    ContainerExtractor containerExtractor = new ParserContainerExtractor();
    String[] plainDocuments = {
        "testEXCEL.xls", "testWORD.doc", "testPPT.ppt", "testVISIO.vsd", "test-outlook.msg"
    };
    // Each file is processed first without recursing, then with recursing
    boolean[] recurseModes = { false, true };
    for (String document : plainDocuments) {
        for (boolean recurse : recurseModes) {
            TrackingHandler tracked = process(document, containerExtractor, recurse);
            // Nothing embedded, so the handler won't have fired
            assertEquals(0, tracked.filenames.size());
            assertEquals(0, tracked.mediaTypes.size());
        }
    }
}
use of org.apache.tika.extractor.ContainerExtractor in project tika by apache.
the class POIContainerExtractionTest method testPowerpointImages.
/**
 * Processing pictures.ppt must report both a JPEG and a PNG media type.
 */
@Test
public void testPowerpointImages() throws Exception {
    ContainerExtractor containerExtractor = new ParserContainerExtractor();
    TrackingHandler tracked = process("pictures.ppt", containerExtractor, false);

    MediaType jpeg = new MediaType("image", "jpeg");
    MediaType png = new MediaType("image", "png");
    assertTrue(tracked.mediaTypes.contains(jpeg));
    assertTrue(tracked.mediaTypes.contains(png));
}
use of org.apache.tika.extractor.ContainerExtractor in project tika by apache.
the class POIContainerExtractionTest method testEmbeddedImages.
/**
 * Office files with embedded images, but no other
 * office files in them. Checks both the reported
 * filename and the media type of every image, in order.
 */
@Test
public void testEmbeddedImages() throws Exception {
    ContainerExtractor extractor = new ParserContainerExtractor();
    TrackingHandler handler;

    // Excel with 1 image
    handler = process("testEXCEL_1img.xls", extractor, false);
    assertEquals(1, handler.filenames.size());
    assertEquals(1, handler.mediaTypes.size());
    // No filename is reported for the image in the Excel file
    assertNull(handler.filenames.get(0));
    assertEquals(TYPE_PNG, handler.mediaTypes.get(0));

    // PowerPoint with 2 images + sound
    // TODO

    // Word with 1 image
    handler = process("testWORD_1img.doc", extractor, false);
    assertEquals(1, handler.filenames.size());
    assertEquals(1, handler.mediaTypes.size());
    assertEquals("image1.png", handler.filenames.get(0));
    assertEquals(TYPE_PNG, handler.mediaTypes.get(0));

    // Word with 3 images
    handler = process("testWORD_3imgs.doc", extractor, false);
    assertEquals(3, handler.filenames.size());
    assertEquals(3, handler.mediaTypes.size());
    assertEquals("image1.png", handler.filenames.get(0));
    assertEquals("image2.jpg", handler.filenames.get(1));
    assertEquals("image3.png", handler.filenames.get(2));
    assertEquals(TYPE_PNG, handler.mediaTypes.get(0));
    assertEquals(TYPE_JPG, handler.mediaTypes.get(1));
    assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
}
use of org.apache.tika.extractor.ContainerExtractor in project tika by apache.
the class POIContainerExtractionTest method testEmbeddedOfficeFiles.
/**
 * Office files which have other office files
 * embedded into them. The embedded office files
 * will sometimes have images in them.
 * <p>
 * eg xls
 * -> word
 * -> image
 * -> image
 * -> powerpoint
 * -> excel
 * -> image
 * <p>
 * Each container is processed without and then with recursion
 * (the PDF and Outlook cases only with recursion), and the
 * filename and media type reported at each position are checked.
 */
@Test
public void testEmbeddedOfficeFiles() throws Exception {
    ContainerExtractor extractor = new ParserContainerExtractor();
    TrackingHandler handler;

    // Excel with a word doc and a powerpoint doc, both of which have images in them
    // Without recursion, should see both documents + the images
    handler = process("testEXCEL_embeded.xls", extractor, false);
    assertEquals(5, handler.filenames.size());
    assertEquals(5, handler.mediaTypes.size());
    // We don't know the filenames of the icons and the image
    assertNull(handler.filenames.get(0));
    assertNull(handler.filenames.get(1));
    assertNull(handler.filenames.get(2));
    assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
    assertEquals("MBD00032A24.doc", handler.filenames.get(4));
    // But we do know their types
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(1));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
    // Embedded office doc
    assertEquals(TYPE_PPT, handler.mediaTypes.get(3));
    // Embedded office doc
    assertEquals(TYPE_DOC, handler.mediaTypes.get(4));

    // With recursion, should get the images embedded in the office files too
    handler = process("testEXCEL_embeded.xls", extractor, true);
    assertEquals(17, handler.filenames.size());
    assertEquals(17, handler.mediaTypes.size());
    assertNull(handler.filenames.get(0));
    assertNull(handler.filenames.get(1));
    assertNull(handler.filenames.get(2));
    assertEquals("MBD0003271D.ppt", handler.filenames.get(3));
    assertEquals("1", handler.filenames.get(4));
    assertNull(handler.filenames.get(5));
    assertEquals("2", handler.filenames.get(6));
    assertEquals("image1.png", handler.filenames.get(7));
    assertEquals("image2.jpg", handler.filenames.get(8));
    assertEquals("image3.png", handler.filenames.get(9));
    // Positions 10-15 are not checked by filename; only the last entry is
    assertEquals("image1.png", handler.filenames.get(16));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(1));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(2));
    // Embedded presentation
    assertEquals(TYPE_PPT, handler.mediaTypes.get(3));
    // Embedded XLS
    assertEquals(TYPE_XLS, handler.mediaTypes.get(4));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(5));
    // Embedded office doc
    assertEquals(TYPE_DOC, handler.mediaTypes.get(6));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(7));
    // Embedded image
    assertEquals(TYPE_JPG, handler.mediaTypes.get(8));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(9));
    // Embedded office doc
    assertEquals(TYPE_DOC, handler.mediaTypes.get(15));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(16));

    // Word with .docx, powerpoint and excel
    handler = process("testWORD_embeded.doc", extractor, false);
    assertEquals(9, handler.filenames.size());
    assertEquals(9, handler.mediaTypes.size());
    // Filenames are a bit iffy...
    // Should really be 3*embedded pictures then 3*icons then embedded docs
    assertEquals("image1.emf", handler.filenames.get(0));
    assertEquals("image4.png", handler.filenames.get(1));
    assertEquals("image5.jpg", handler.filenames.get(2));
    assertEquals("image6.png", handler.filenames.get(3));
    assertEquals("image2.emf", handler.filenames.get(4));
    assertEquals("image3.emf", handler.filenames.get(5));
    assertNull(handler.filenames.get(6));
    assertEquals("_1345471035.ppt", handler.filenames.get(7));
    assertEquals("_1345470949.xls", handler.filenames.get(8));
    // But we do know their types
    // Icon of embedded office doc?
    assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
    // Embedded image - logo
    assertEquals(TYPE_PNG, handler.mediaTypes.get(1));
    // Embedded image - safe
    assertEquals(TYPE_JPG, handler.mediaTypes.get(2));
    // Embedded image - try
    assertEquals(TYPE_PNG, handler.mediaTypes.get(3));
    // Icon of embedded office doc?
    assertEquals(TYPE_EMF, handler.mediaTypes.get(4));
    // Icon of embedded office doc?
    assertEquals(TYPE_EMF, handler.mediaTypes.get(5));
    // Embedded office doc
    assertEquals(TYPE_DOCX, handler.mediaTypes.get(6));
    // Embedded office doc
    assertEquals(TYPE_PPT, handler.mediaTypes.get(7));
    // Embedded office doc
    assertEquals(TYPE_XLS, handler.mediaTypes.get(8));

    // With recursion, should get their images too
    handler = process("testWORD_embeded.doc", extractor, true);
    assertEquals(16, handler.filenames.size());
    assertEquals(16, handler.mediaTypes.size());
    // We don't know their filenames, except for doc images + docx
    assertEquals("image1.emf", handler.filenames.get(0));
    assertEquals("image4.png", handler.filenames.get(1));
    assertEquals("image5.jpg", handler.filenames.get(2));
    assertEquals("image6.png", handler.filenames.get(3));
    assertEquals("image2.emf", handler.filenames.get(4));
    assertEquals("image3.emf", handler.filenames.get(5));
    assertNull(handler.filenames.get(6));
    assertEquals("image2.png", handler.filenames.get(7));
    assertEquals("image3.jpeg", handler.filenames.get(8));
    assertEquals("image4.png", handler.filenames.get(9));
    // Position 10 is not checked by filename; 11-13 carry no filename
    for (int i = 11; i < 14; i++) {
        assertNull(handler.filenames.get(i));
    }
    // But we do know their types
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
    // Embedded image - logo
    assertEquals(TYPE_PNG, handler.mediaTypes.get(1));
    // Embedded image - safe
    assertEquals(TYPE_JPG, handler.mediaTypes.get(2));
    // Embedded image - try
    assertEquals(TYPE_PNG, handler.mediaTypes.get(3));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(4));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(5));
    // Embedded office doc
    assertEquals(TYPE_DOCX, handler.mediaTypes.get(6));
    // PNG inside .docx
    assertEquals(TYPE_PNG, handler.mediaTypes.get(7));
    // JPG inside .docx
    assertEquals(TYPE_JPG, handler.mediaTypes.get(8));
    // PNG inside .docx
    assertEquals(TYPE_PNG, handler.mediaTypes.get(9));
    // Embedded office doc
    assertEquals(TYPE_PPT, handler.mediaTypes.get(10));
    // Embedded office doc
    assertEquals(TYPE_XLS, handler.mediaTypes.get(14));
    // PNG inside .xls
    assertEquals(TYPE_PNG, handler.mediaTypes.get(15));

    // PowerPoint with excel and word
    handler = process("testPPT_embeded.ppt", extractor, false);
    assertEquals(7, handler.filenames.size());
    assertEquals(7, handler.mediaTypes.size());
    // We don't get all that helpful filenames
    assertEquals("1", handler.filenames.get(0));
    assertEquals("2", handler.filenames.get(1));
    assertNull(handler.filenames.get(2));
    assertNull(handler.filenames.get(3));
    assertNull(handler.filenames.get(4));
    assertNull(handler.filenames.get(5));
    assertNull(handler.filenames.get(6));
    // But we do know their types
    // Embedded office doc
    assertEquals(TYPE_XLS, handler.mediaTypes.get(0));
    // Embedded office doc
    assertEquals(TYPE_DOC, handler.mediaTypes.get(1));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(2));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(3));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(4));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(5));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(6));

    // Run again on PowerPoint but with recursion
    handler = process("testPPT_embeded.ppt", extractor, true);
    assertEquals(11, handler.filenames.size());
    assertEquals(11, handler.mediaTypes.size());
    assertEquals("1", handler.filenames.get(0));
    assertNull(handler.filenames.get(1));
    assertEquals("2", handler.filenames.get(2));
    assertEquals("image1.png", handler.filenames.get(3));
    assertEquals("image2.jpg", handler.filenames.get(4));
    assertEquals("image3.png", handler.filenames.get(5));
    assertNull(handler.filenames.get(6));
    assertNull(handler.filenames.get(7));
    assertNull(handler.filenames.get(8));
    assertNull(handler.filenames.get(9));
    assertNull(handler.filenames.get(10));
    // Embedded office doc
    assertEquals(TYPE_XLS, handler.mediaTypes.get(0));
    // PNG inside .xls
    assertEquals(TYPE_PNG, handler.mediaTypes.get(1));
    // Embedded office doc
    assertEquals(TYPE_DOC, handler.mediaTypes.get(2));
    // PNG inside .doc (the parent asserted just above is TYPE_DOC, not .docx)
    assertEquals(TYPE_PNG, handler.mediaTypes.get(3));
    // JPG inside .doc
    assertEquals(TYPE_JPG, handler.mediaTypes.get(4));
    // PNG inside .doc
    assertEquals(TYPE_PNG, handler.mediaTypes.get(5));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(6));
    // Icon of embedded office doc
    assertEquals(TYPE_EMF, handler.mediaTypes.get(7));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(8));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(9));
    // Embedded image
    assertEquals(TYPE_PNG, handler.mediaTypes.get(10));

    // Word, with a non-office file (PDF)
    handler = process("testWORD_embedded_pdf.doc", extractor, true);
    assertEquals(2, handler.filenames.size());
    assertEquals(2, handler.mediaTypes.size());
    assertEquals("image1.emf", handler.filenames.get(0));
    assertEquals("_1402837031.pdf", handler.filenames.get(1));
    // Icon of embedded pdf
    assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
    // The embedded PDF itself
    assertEquals(TYPE_PDF, handler.mediaTypes.get(1));

    // Outlook with a text file and a word document
    handler = process("testMSG_att_doc.msg", extractor, true);
    assertEquals(2, handler.filenames.size());
    assertEquals(2, handler.mediaTypes.size());
    assertEquals("test-unicode.doc", handler.filenames.get(0));
    assertEquals(TYPE_DOC, handler.mediaTypes.get(0));
    assertEquals("pj1.txt", handler.filenames.get(1));
    assertEquals(TYPE_TXT, handler.mediaTypes.get(1));

    // Outlook with a pdf and another outlook message
    handler = process("testMSG_att_msg.msg", extractor, true);
    assertEquals(2, handler.filenames.size());
    assertEquals(2, handler.mediaTypes.size());
    assertEquals("__substg1.0_3701000D.msg", handler.filenames.get(0));
    assertEquals(TYPE_MSG, handler.mediaTypes.get(0));
    assertEquals("smbprn.00009008.KdcPjl.pdf", handler.filenames.get(1));
    assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
}
use of org.apache.tika.extractor.ContainerExtractor in project tika by apache.
the class RTFParserTest method testEmbeddedLinkedDocument.
// TIKA-1010: extraction over an RTF whose embedded document is a link
@Test
public void testEmbeddedLinkedDocument() throws Exception {
    // First pass: hand the tracking handler a set of EMF/WMF image types
    // (presumably the types it should skip; see TrackingHandler)
    Set<MediaType> ignoredTypes = new HashSet<>();
    ignoredTypes.add(MediaType.parse("image/emf"));
    ignoredTypes.add(MediaType.parse("image/wmf"));
    TrackingHandler handler = new TrackingHandler(ignoredTypes);
    try (TikaInputStream stream = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
        ContainerExtractor containerExtractor = new ParserContainerExtractor();
        assertEquals(true, containerExtractor.isSupported(stream));
        // The extractor is also passed as its own recursing extractor
        containerExtractor.extract(stream, containerExtractor, handler);
    }
    // The link should be skipped gracefully (no NPE, IOException, etc.),
    // so nothing has been tracked
    assertEquals(0, handler.filenames.size());

    // Second pass: same document, default tracking handler
    handler = new TrackingHandler();
    try (TikaInputStream stream = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
        ContainerExtractor containerExtractor = new ParserContainerExtractor();
        assertEquals(true, containerExtractor.isSupported(stream));
        containerExtractor.extract(stream, containerExtractor, handler);
    }
    // Still no exception from the link, and this time two entries are tracked
    assertEquals(2, handler.filenames.size());
}
Aggregations