Example use of org.apache.tika.extractor.ContainerExtractor in the Apache Tika project.
From the class FictionBookParserTest, method testEmbedded.
/**
 * Checks that the FictionBook sample document is accepted by the
 * container extractor and that extracting it reports two embedded
 * resources to the tracking handler.
 */
@Test
public void testEmbedded() throws Exception {
    // Both streams are declared as resources: the TikaInputStream wrapper can
    // hold resources of its own, so it must be closed as well, not only the
    // raw classpath stream it wraps.
    try (InputStream input = FictionBookParserTest.class.getResourceAsStream("/test-documents/test.fb2");
         TikaInputStream stream = TikaInputStream.get(input)) {
        ContainerExtractor extractor = new ParserContainerExtractor();
        assertEquals(true, extractor.isSupported(stream));
        // Process it, passing null as the second (extractor) argument
        TrackingHandler handler = new TrackingHandler();
        extractor.extract(stream, null, handler);
        assertEquals(2, handler.filenames.size());
    }
}
Example use of org.apache.tika.extractor.ContainerExtractor in the Apache Tika project.
From the class POIContainerExtractionTest, method testEmbeddedOfficeFilesXML.
/**
 * Runs the container extractor over EmbeddedDocument.docx (with the
 * recursion flag of {@code process} set to false) and checks that two
 * embedded files are reported, one of them being the embedded Excel
 * worksheet binary.
 */
@Test
public void testEmbeddedOfficeFilesXML() throws Exception {
    ContainerExtractor containerExtractor = new ParserContainerExtractor();
    TrackingHandler tracked = process("EmbeddedDocument.docx", containerExtractor, false);

    String expectedWorksheet = "Microsoft_Office_Excel_97-2003_Worksheet1.bin";
    assertTrue(tracked.filenames.contains(expectedWorksheet));
    assertEquals(2, tracked.filenames.size());
}
Example use of org.apache.tika.extractor.ContainerExtractor in the Apache Tika project.
From the class TNEFParserTest, method testBodyAndAttachments.
/**
 * Verifies that the RTF message body and every attachment are
 * reported, in order, with the expected file names and media types.
 */
@Test
public void testBodyAndAttachments() throws Exception {
    ContainerExtractor extractor = new ParserContainerExtractor();

    // Extract with recursion enabled: the result holds the RTF message
    // body followed by the attachments.
    TrackingHandler handler = process(file, extractor, true);
    assertEquals(6, handler.filenames.size());
    assertEquals(6, handler.mediaTypes.size());

    // Every entry has a known name and type; the two arrays are parallel.
    String[] expectedNames = {
        "message.rtf",
        "quick.doc",
        "quick.html",
        "quick.pdf",
        "quick.txt",
        "quick.xml"
    };
    MediaType[] expectedTypes = {
        MediaType.application("rtf"),
        MediaType.application("msword"),
        MediaType.text("html"),
        MediaType.application("pdf"),
        MediaType.text("plain"),
        MediaType.application("xml")
    };
    for (int i = 0; i < expectedNames.length; i++) {
        assertEquals(expectedNames[i], handler.filenames.get(i));
        assertEquals(expectedTypes[i], handler.mediaTypes.get(i));
    }
}
Example use of org.apache.tika.extractor.ContainerExtractor in the Apache Tika project.
From the class PDFParserTest, method testEmbeddedPDFEmbeddingAnotherDocument.
/**
 * TIKA-1124: a docx embedding a PDF which itself embeds another docx.
 * Checks that the text of all three levels appears in the XML output in
 * nesting order, and that the container extractor reports the embedded
 * resources with the expected names and media types.
 */
@Test
public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
    /*
     * Layout of the test document:
     *   docx/
     *     pdf/
     *       docx
     */
    String xml = getXML("testPDFEmbeddingAndEmbedded.docx").xml;
    int outerPos = xml.indexOf("Outer_haystack");
    int pdfPos = xml.indexOf("pdf_haystack");
    int needlePos = xml.indexOf("Needle");

    // Each marker must be present...
    assertTrue(outerPos >= 0);
    assertTrue(pdfPos >= 0);
    assertTrue(needlePos >= 0);
    // ...and they must appear outermost-first.
    assertTrue(outerPos < pdfPos && pdfPos < needlePos);

    TrackingHandler handler = new TrackingHandler();
    ContainerExtractor extractor = new ParserContainerExtractor();
    try (TikaInputStream docStream = TikaInputStream.get(
            getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"))) {
        // The extractor is also passed as its own second argument.
        extractor.extract(docStream, extractor, handler);
    }

    assertEquals(3, handler.filenames.size());
    assertEquals(3, handler.mediaTypes.size());

    // The second entry (the PDF) is reported without a file name.
    assertEquals("image1.emf", handler.filenames.get(0));
    assertNull(handler.filenames.get(1));
    assertEquals("Test.docx", handler.filenames.get(2));

    assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
    assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
    assertEquals(TYPE_DOCX, handler.mediaTypes.get(2));
}
Example use of org.apache.tika.extractor.ContainerExtractor in the Apache Tika project.
From the class RFC822ParserTest, method testGetAttachmentsAsEmbeddedResources.
/**
 * TIKA-1222: when requested, the attachments of a multipart mail
 * must be surfaced as embedded resources.
 */
@Test
public void testGetAttachmentsAsEmbeddedResources() throws Exception {
    TrackingHandler handler = new TrackingHandler();
    ContainerExtractor extractor = new ParserContainerExtractor();
    try (TikaInputStream mailStream = TikaInputStream.get(getStream("test-documents/testRFC822-multipart"))) {
        assertEquals(true, extractor.isSupported(mailStream));
        extractor.extract(mailStream, extractor, handler);
    }

    // All three parts must have been found
    final int expectedParts = 3;
    assertEquals(expectedParts, handler.filenames.size());
    assertEquals(expectedParts, handler.mediaTypes.size());

    // None of the parts carries a file name
    for (int part = 0; part < expectedParts; part++) {
        assertEquals(null, handler.filenames.get(part));
    }

    // The media types, however, are known
    assertEquals(MediaType.TEXT_PLAIN, handler.mediaTypes.get(0));
    assertEquals(MediaType.TEXT_HTML, handler.mediaTypes.get(1));
    assertEquals(MediaType.image("gif"), handler.mediaTypes.get(2));
}
Aggregations