Search in sources :

Example 6 with ParserContainerExtractor

use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.

the class PDFParserTest method testEmbeddedPDFEmbeddingAnotherDocument.

/**
 * TIKA-1124: a docx that embeds a pdf which in turn embeds another docx.
 * <p>
 * First checks that the XML output contains the marker of every nesting level
 * and that the markers appear outermost-first. Then runs a
 * {@link ParserContainerExtractor} over the same file and checks the filenames
 * and media types recorded by the {@code TrackingHandler}.
 */
@Test
public void testEmbeddedPDFEmbeddingAnotherDocument() throws Exception {
    /* layout of the test document:
         docx/
            pdf/
               docx
       */
    final String xml = getXML("testPDFEmbeddingAndEmbedded.docx").xml;
    final int outerIdx = xml.indexOf("Outer_haystack");
    final int pdfIdx = xml.indexOf("pdf_haystack");
    final int needleIdx = xml.indexOf("Needle");

    // each marker must be present ...
    assertTrue(outerIdx >= 0);
    assertTrue(pdfIdx >= 0);
    assertTrue(needleIdx >= 0);
    // ... and they must be ordered outer < pdf < needle
    assertTrue(pdfIdx < needleIdx && outerIdx < pdfIdx);

    final TrackingHandler handler = new TrackingHandler();
    final ContainerExtractor containerExtractor = new ParserContainerExtractor();
    try (TikaInputStream stream = TikaInputStream.get(
            getResourceAsStream("/test-documents/testPDFEmbeddingAndEmbedded.docx"))) {
        containerExtractor.extract(stream, containerExtractor, handler);
    }

    // three embedded resources are expected
    assertEquals(3, handler.filenames.size());
    assertEquals(3, handler.mediaTypes.size());

    // names: the image, the pdf (which has no name) and the inner docx
    assertEquals("image1.emf", handler.filenames.get(0));
    assertNull(handler.filenames.get(1));
    assertEquals("Test.docx", handler.filenames.get(2));

    // types, in the same order
    assertEquals(TYPE_EMF, handler.mediaTypes.get(0));
    assertEquals(TYPE_PDF, handler.mediaTypes.get(1));
    assertEquals(TYPE_DOCX, handler.mediaTypes.get(2));
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) ContainerExtractor(org.apache.tika.extractor.ContainerExtractor) ParserContainerExtractor(org.apache.tika.extractor.ParserContainerExtractor) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 7 with ParserContainerExtractor

use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.

the class OOXMLContainerExtractionTest method setUp.

/**
 * Assigns {@code extractor} (declared outside this method) a
 * {@link ParserContainerExtractor} built from the parser and detector
 * of a newly created {@link Tika} instance.
 */
@Before
public void setUp() {
    final Tika freshTika = new Tika();
    extractor = new ParserContainerExtractor(
            freshTika.getParser(),
            freshTika.getDetector());
}
Also used : Tika(org.apache.tika.Tika) ParserContainerExtractor(org.apache.tika.extractor.ParserContainerExtractor) Before(org.junit.Before)

Example 8 with ParserContainerExtractor

use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.

the class RFC822ParserTest method testGetAttachmentsAsEmbeddedResources.

/**
 * TIKA-1222: when requested, the attachments of a mail message must be
 * reported as embedded resources.
 */
@Test
public void testGetAttachmentsAsEmbeddedResources() throws Exception {
    final TrackingHandler handler = new TrackingHandler();
    final ContainerExtractor containerExtractor = new ParserContainerExtractor();
    try (TikaInputStream mail = TikaInputStream.get(
            getStream("test-documents/testRFC822-multipart"))) {
        assertEquals(true, containerExtractor.isSupported(mail));
        containerExtractor.extract(mail, containerExtractor, handler);
    }

    // All 3 parts must have been reported
    assertEquals(3, handler.filenames.size());
    assertEquals(3, handler.mediaTypes.size());

    // None of the parts carries a filename
    for (int part = 0; part < 3; part++) {
        assertEquals(null, handler.filenames.get(part));
    }

    // Each part does carry a media type
    assertEquals(MediaType.TEXT_PLAIN, handler.mediaTypes.get(0));
    assertEquals(MediaType.TEXT_HTML, handler.mediaTypes.get(1));
    assertEquals(MediaType.image("gif"), handler.mediaTypes.get(2));
}
Also used : TikaInputStream(org.apache.tika.io.TikaInputStream) ContainerExtractor(org.apache.tika.extractor.ContainerExtractor) ParserContainerExtractor(org.apache.tika.extractor.ParserContainerExtractor) TesseractOCRParserTest(org.apache.tika.parser.ocr.TesseractOCRParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 9 with ParserContainerExtractor

use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.

the class SQLite3ParserTest method testParserContainerExtractor.

/**
 * Runs a {@link ParserContainerExtractor} over {@code TEST_FILE1} with a
 * {@code ByteCopyingHandler} and checks the raw bytes that were collected.
 * <p>
 * Four byte arrays are expected, identified by their leading bytes:
 * an OLE2 file (the .doc), a PNG, a "PK" (zip-based, the .docx) file and a
 * second PNG. The PNGs are the same image embedded in each word file.
 * <p>
 * NOTE(review): the original comment listed 6 embedded documents, including
 * two UTF-8 csv table representations, yet only 4 items are asserted.
 * Presumably the table representations are not handed to the handler on this
 * code path; confirm.
 */
@Test
public void testParserContainerExtractor() throws Exception {
    ParserContainerExtractor ex = new ParserContainerExtractor();
    ByteCopyingHandler byteCopier = new ByteCopyingHandler();
    // extract(...) takes no Metadata argument, so the Metadata object that
    // used to be populated here was dead code and has been dropped.
    try (TikaInputStream is = TikaInputStream.get(getResourceAsStream(TEST_FILE1))) {
        ex.extract(is, ex, byteCopier);
    }
    assertEquals(4, byteCopier.bytes.size());

    // Decode (at most) the first 1000 bytes of items 1..3 so their magic
    // markers can be searched as text. Item 0 is checked byte-by-byte
    // below, so strings[0] is intentionally left null.
    String[] strings = new String[4];
    for (int i = 1; i < byteCopier.bytes.size(); i++) {
        byte[] byteArr = byteCopier.bytes.get(i);
        String s = new String(byteArr, 0, Math.min(byteArr.length, 1000), UTF_8);
        strings[i] = s;
    }

    // OLE2 header: D0 CF 11 E0 A1 B1 1A E1, followed by two zero bytes
    byte[] oleBytes = new byte[] { (byte) -48, (byte) -49, (byte) 17, (byte) -32, (byte) -95, (byte) -79, (byte) 26, (byte) -31, (byte) 0, (byte) 0 };
    //test OLE
    for (int i = 0; i < oleBytes.length; i++) {
        assertEquals(oleBytes[i], byteCopier.bytes.get(0)[i]);
    }
    assertContains("PNG", strings[1]);
    assertContains("PK", strings[2]);
    assertContains("PNG", strings[3]);
}
Also used : Metadata(org.apache.tika.metadata.Metadata) TikaInputStream(org.apache.tika.io.TikaInputStream) ParserContainerExtractor(org.apache.tika.extractor.ParserContainerExtractor) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 10 with ParserContainerExtractor

use of org.apache.tika.extractor.ParserContainerExtractor in project tika by apache.

the class POIContainerExtractionTest method testWithoutEmbedded.

/**
 * Office files that have nothing embedded in them must never trigger
 * the handler, neither without nor with recursing.
 */
@Test
public void testWithoutEmbedded() throws Exception {
    final ContainerExtractor containerExtractor = new ParserContainerExtractor();
    final String[] plainFiles = {
        "testEXCEL.xls", "testWORD.doc", "testPPT.ppt", "testVISIO.vsd", "test-outlook.msg"
    };
    // first pass: no recursing; second pass: recursing
    final boolean[] recurseModes = { false, true };

    for (String name : plainFiles) {
        for (boolean recurse : recurseModes) {
            TrackingHandler seen = process(name, containerExtractor, recurse);
            // nothing embedded, so the handler must not have fired
            assertEquals(0, seen.filenames.size());
            assertEquals(0, seen.mediaTypes.size());
        }
    }
}
Also used : ContainerExtractor(org.apache.tika.extractor.ContainerExtractor) ParserContainerExtractor(org.apache.tika.extractor.ParserContainerExtractor) Test(org.junit.Test)

Aggregations

ParserContainerExtractor (org.apache.tika.extractor.ParserContainerExtractor): 15; Test (org.junit.Test): 13; ContainerExtractor (org.apache.tika.extractor.ContainerExtractor): 11; TikaInputStream (org.apache.tika.io.TikaInputStream): 8; TikaTest (org.apache.tika.TikaTest): 6; InputStream (java.io.InputStream): 2; TrackingHandler (org.apache.tika.TikaTest.TrackingHandler): 2; Metadata (org.apache.tika.metadata.Metadata): 2; MediaType (org.apache.tika.mime.MediaType): 2; ByteArrayInputStream (java.io.ByteArrayInputStream): 1; IOException (java.io.IOException): 1; HashSet (java.util.HashSet): 1; Tika (org.apache.tika.Tika): 1; MimeTypeException (org.apache.tika.mime.MimeTypeException): 1; TesseractOCRParserTest (org.apache.tika.parser.ocr.TesseractOCRParserTest): 1; Before (org.junit.Before): 1; ConvertedDocument (org.opensextant.xtext.ConvertedDocument): 1