use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.
the class PDFParserTest method testEmbeddedPDFEmbeddingAnotherDocument.
// TIKA-1124
/**
 * Checks a docx that embeds a PDF which in turn embeds another docx: the text of
 * every level must show up in nesting order, and the container extractor must
 * report the embedded resources with the expected names and media types.
 */
@Test
public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
    // Layout of the test document:
    //   docx/
    //     pdf/
    //       docx
    String xml = getXML("testPDFEmbeddingAndEmbedded.docx").xml;
    int outerPos = xml.indexOf("Outer_haystack");
    int pdfPos = xml.indexOf("pdf_haystack");
    int needlePos = xml.indexOf("Needle");

    // All three markers have to be present in the extracted XML ...
    assertTrue(outerPos > -1);
    assertTrue(pdfPos > -1);
    assertTrue(needlePos > -1);
    // ... and appear outermost-first.
    assertTrue(outerPos < pdfPos && pdfPos < needlePos);

    TrackingHandler handler = new TrackingHandler();
    ContainerExtractor extractor = new ParserContainerExtractor();
    // The extractor is also handed in as the second argument
    // (presumably so that nested containers are recursed into -- confirm against the API).
    try (TikaInputStream stream = TikaInputStream.get(
            getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"))) {
        extractor.extract(stream, extractor, handler);
    }

    // Exactly three embedded resources are expected.
    assertEquals(3, handler.filenames.size());
    assertEquals(3, handler.mediaTypes.size());

    // First resource: the EMF image.
    assertEquals("image1.emf", handler.filenames.get(0));
    assertEquals(TYPE_EMF, handler.mediaTypes.get(0));

    // Second resource: the PDF, reported without a filename.
    assertNull(handler.filenames.get(1));
    assertEquals(TYPE_PDF, handler.mediaTypes.get(1));

    // Third resource: the docx.
    assertEquals("Test.docx", handler.filenames.get(2));
    assertEquals(TYPE_DOCX, handler.mediaTypes.get(2));
}
use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.
the class OOXMLContainerExtractionTest method setUp.
/**
 * Creates the extractor used by the tests from the parser and detector
 * of a freshly constructed {@code Tika} instance.
 */
@Before
public void setUp() {
    final Tika facade = new Tika();
    extractor = new ParserContainerExtractor(
            facade.getParser(),
            facade.getDetector());
}
use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.
the class RFC822ParserTest method testGetAttachmentsAsEmbeddedResources.
/**
 * TIKA-1222: when requested, each attachment of the mail has to be
 * delivered to the handler as an embedded resource.
 */
@Test
public void testGetAttachmentsAsEmbeddedResources() throws Exception {
    ContainerExtractor extractor = new ParserContainerExtractor();
    TrackingHandler handler = new TrackingHandler();
    try (TikaInputStream stream =
            TikaInputStream.get(getStream("test-documents/testRFC822-multipart"))) {
        assertEquals(true, extractor.isSupported(stream));
        extractor.extract(stream, extractor, handler);
    }

    // All 3 parts must have been reported
    assertEquals(3, handler.filenames.size());
    assertEquals(3, handler.mediaTypes.size());

    // None of the parts carries a filename
    for (int part = 0; part < 3; part++) {
        assertEquals(null, handler.filenames.get(part));
    }

    // The media types are known, in this order
    MediaType[] expectedTypes = {
        MediaType.TEXT_PLAIN,
        MediaType.TEXT_HTML,
        MediaType.image("gif")
    };
    for (int part = 0; part < expectedTypes.length; part++) {
        assertEquals(expectedTypes[part], handler.mediaTypes.get(part));
    }
}
use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.
the class SQLite3ParserTest method testParserContainerExtractor.
/**
 * Runs a {@code ParserContainerExtractor} over the SQLite test file and checks the
 * leading bytes of every embedded resource that was handed to the byte-copying handler.
 */
@Test
public void testParserContainerExtractor() throws Exception {
    //The test file is described as containing 6 embedded documents:
    //2x tables -- UTF-8 csv representations of the tables
    //2x word files, one doc and one docx
    //2x png files, the same image embedded in each of the doc and docx
    //NOTE(review): only 4 resources are asserted below -- presumably the table
    //representations never reach the handler; confirm and reconcile this comment.
    ParserContainerExtractor ex = new ParserContainerExtractor();
    ByteCopyingHandler byteCopier = new ByteCopyingHandler();
    try (TikaInputStream is = TikaInputStream.get(getResourceAsStream(TEST_FILE1))) {
        ex.extract(is, ex, byteCopier);
    }
    assertEquals(4, byteCopier.bytes.size());

    //Resource 0 is checked against the raw OLE header bytes
    byte[] oleBytes = new byte[] { (byte) -48, (byte) -49, (byte) 17, (byte) -32, (byte) -95, (byte) -79, (byte) 26, (byte) -31, (byte) 0, (byte) 0 };
    byte[] firstResource = byteCopier.bytes.get(0);
    for (int i = 0; i < oleBytes.length; i++) {
        assertEquals(oleBytes[i], firstResource[i]);
    }

    //The remaining resources are checked via a text view of at most their first
    //1000 bytes; index 0 is deliberately left unset because it is compared as bytes above
    String[] strings = new String[4];
    for (int i = 1; i < byteCopier.bytes.size(); i++) {
        byte[] byteArr = byteCopier.bytes.get(i);
        strings[i] = new String(byteArr, 0, Math.min(byteArr.length, 1000), UTF_8);
    }
    assertContains("PNG", strings[1]);
    assertContains("PK", strings[2]);
    assertContains("PNG", strings[3]);
}
use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.
the class POIContainerExtractionTest method testWithoutEmbedded.
/**
 * Office files that contain no embedded resources must never
 * trigger the handler, whether or not recursion is requested.
 */
@Test
public void testWithoutEmbedded() throws Exception {
    ContainerExtractor extractor = new ParserContainerExtractor();
    String[] plainFiles = {
        "testEXCEL.xls", "testWORD.doc", "testPPT.ppt", "testVISIO.vsd", "test-outlook.msg"
    };
    // First without recursing, then with recursing
    boolean[] recurseModes = { false, true };

    for (String name : plainFiles) {
        for (boolean recurse : recurseModes) {
            TrackingHandler handler = process(name, extractor, recurse);
            // The handler won't have fired in either mode
            assertEquals(0, handler.filenames.size());
            assertEquals(0, handler.mediaTypes.size());
        }
    }
}
Aggregations