Search in sources :

Example 6 with POITextExtractor

use of org.apache.poi.POITextExtractor in project poi by apache.

the class TestExtractorFactory method testEmbeded.

/**
 * Test embedded docs text extraction. For now, only
 *  does poifs embedded, but will do ooxml ones
 *  at some point.
 *
 * Each scenario opens one container document, asks the factory for the
 * extractors of its embedded documents, and checks how many of each
 * extractor type came back. The casts at the call sites are deliberate:
 * they fail the test if the factory returns an unexpected extractor type.
 */
@Test
public void testEmbeded() throws Exception {
    // Argument order after the extractor: total, ppt, xls, word, wordX, msg

    // No embeddings
    assertEmbeddedCounts((POIOLE2TextExtractor) ExtractorFactory.createExtractor(xls),
            0, 0, 0, 0, 0, 0);
    // Excel
    assertEmbeddedCounts((POIOLE2TextExtractor) ExtractorFactory.createExtractor(xlsEmb),
            6, 2, 2, 2, 0, 0);
    // Word
    assertEmbeddedCounts((POIOLE2TextExtractor) ExtractorFactory.createExtractor(docEmb),
            4, 1, 2, 1, 0, 0);
    // Word which contains an OOXML file
    assertEmbeddedCounts((POIOLE2TextExtractor) ExtractorFactory.createExtractor(docEmbOOXML),
            3, 1, 1, 0, 1, 0);
    // Outlook
    assertEmbeddedCounts((OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmb),
            1, 0, 0, 1, 0, 0);
    // Outlook with another outlook file in it
    assertEmbeddedCounts((OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmbMsg),
            1, 0, 0, 0, 0, 1);
// TODO - PowerPoint
// TODO - Publisher
// TODO - Visio
}

/**
 * Fetches the embedded-document extractors of {@code ext}, verifies that each
 * one yields more than 20 characters of text, and checks the number of
 * extractors of each type. {@code ext} is always closed before returning,
 * including when an assertion fails.
 *
 * @param ext           extractor of the container document; closed by this method
 * @param expectedTotal expected number of embedded extractors
 * @param expectedPpt   expected number of {@code PowerPointExtractor}s
 * @param expectedXls   expected number of {@code ExcelExtractor}s
 * @param expectedWord  expected number of {@code WordExtractor}s
 * @param expectedWordX expected number of {@code XWPFWordExtractor}s
 * @param expectedMsg   expected number of {@code OutlookTextExtactor}s
 * @throws Exception if the embedded extractors cannot be created or read
 */
private static void assertEmbeddedCounts(POIOLE2TextExtractor ext, int expectedTotal,
        int expectedPpt, int expectedXls, int expectedWord, int expectedWordX,
        int expectedMsg) throws Exception {
    try {
        POITextExtractor[] embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
        assertEquals(expectedTotal, embeds.length);

        int numPpt = 0;
        int numXls = 0;
        int numWord = 0;
        int numWordX = 0;
        int numMsg = 0;
        for (POITextExtractor embed : embeds) {
            assertTrue(embed.getText().length() > 20);
            // The order of the checks is kept from the original test.
            if (embed instanceof PowerPointExtractor) {
                numPpt++;
            } else if (embed instanceof ExcelExtractor) {
                numXls++;
            } else if (embed instanceof WordExtractor) {
                numWord++;
            } else if (embed instanceof OutlookTextExtactor) {
                numMsg++;
            } else if (embed instanceof XWPFWordExtractor) {
                numWordX++;
            }
        }

        assertEquals(expectedPpt, numPpt);
        assertEquals(expectedXls, numXls);
        assertEquals(expectedWord, numWord);
        assertEquals(expectedWordX, numWordX);
        assertEquals(expectedMsg, numMsg);
    } finally {
        ext.close();
    }
}
Also used : OutlookTextExtactor(org.apache.poi.hsmf.extractor.OutlookTextExtactor) POITextExtractor(org.apache.poi.POITextExtractor) PowerPointExtractor(org.apache.poi.hslf.extractor.PowerPointExtractor) XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) ExcelExtractor(org.apache.poi.hssf.extractor.ExcelExtractor) XSSFEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor) EventBasedExcelExtractor(org.apache.poi.hssf.extractor.EventBasedExcelExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) POIOLE2TextExtractor(org.apache.poi.POIOLE2TextExtractor) WordExtractor(org.apache.poi.hwpf.extractor.WordExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) Test(org.junit.Test)

Example 7 with POITextExtractor

use of org.apache.poi.POITextExtractor in project poi by apache.

the class TestXSLFPowerPointExtractor method test45541.

/**
 * Checks where the word "testdoc" shows up when extracting text from two
 * sample presentations: one with a header in the notes element and one with
 * a footer in the master slide.
 *
 * NOTE(review): the boolean arguments of the two- and three-argument
 * {@code getText} overloads are presumably (slideText, notesText[, masterText])
 * - confirm against XSLFPowerPointExtractor.
 */
@Test
public void test45541() throws Exception {
    // extract text from a powerpoint that has a header in the notes-element
    POITextExtractor extr = ExtractorFactory.createExtractor(slTests.getFile("45541_Header.pptx"));
    try {
        String text = extr.getText();
        assertNotNull(text);
        assertFalse("Had: " + text, text.contains("testdoc"));
        text = ((XSLFPowerPointExtractor) extr).getText(false, true);
        // Null-check before the content assertion so a null result is reported as such.
        assertNotNull(text);
        assertContains(text, "testdoc");
    } finally {
        // Release the document even when an assertion above fails.
        extr.close();
    }
    // extract text from a powerpoint that has a footer in the master-slide
    extr = ExtractorFactory.createExtractor(slTests.getFile("45541_Footer.pptx"));
    try {
        String text = extr.getText();
        assertNotContained(text, "testdoc");
        text = ((XSLFPowerPointExtractor) extr).getText(false, true);
        assertNotContained(text, "testdoc");
        text = ((XSLFPowerPointExtractor) extr).getText(false, false, true);
        assertNotContained(text, "testdoc");
    } finally {
        extr.close();
    }
}
Also used : POITextExtractor(org.apache.poi.POITextExtractor) Test(org.junit.Test)

Example 8 with POITextExtractor

use of org.apache.poi.POITextExtractor in project poi by apache.

the class TestZipPackage method testZipEntityExpansionSharedStringTableEvents.

/**
 * Opens the "poc-shared-strings.xlsx" sample with event-based extractors
 * preferred on this thread. An {@code IllegalStateException} from
 * {@code getText()} is tolerated; an {@code XmlException} must satisfy
 * {@code assertEntityLimitReached}. The thread's previous extractor
 * preference is restored at the end in every case.
 */
@Test
public void testZipEntityExpansionSharedStringTableEvents() throws Exception {
    final boolean previousPreference = ExtractorFactory.getThreadPrefersEventExtractors();
    ExtractorFactory.setThreadPrefersEventExtractors(true);
    try {
        final POITextExtractor textExtractor = ExtractorFactory.createExtractor(
                HSSFTestDataSamples.getSampleFile("poc-shared-strings.xlsx"));
        try {
            assertNotNull(textExtractor);
            try {
                textExtractor.getText();
            } catch (IllegalStateException expected) {
                // expected due to shared strings expansion
            }
        } finally {
            textExtractor.close();
        }
    } catch (XmlException entityLimit) {
        assertEntityLimitReached(entityLimit);
    } finally {
        // Put the thread-level setting back so other tests are unaffected.
        ExtractorFactory.setThreadPrefersEventExtractors(previousPreference);
    }
}
Also used : POITextExtractor(org.apache.poi.POITextExtractor) XmlException(org.apache.xmlbeans.XmlException) Test(org.junit.Test)

Example 9 with POITextExtractor

use of org.apache.poi.POITextExtractor in project poi by apache.

the class TestZipPackage method testZipEntityExpansionExceedsMemory.

/**
 * Exercises the "poc-xmlbomb.xlsx" sample in two ways. Opening it as a
 * workbook must throw a {@code POIXMLException} that satisfies
 * {@code assertEntityLimitReached}. Opening it through a text extractor may
 * either throw such a {@code POIXMLException} or throw an
 * {@code IllegalStateException} from {@code getText()}, which is tolerated.
 */
@Test
public void testZipEntityExpansionExceedsMemory() throws Exception {
    // Part 1: the workbook factory must refuse the file.
    try {
        WorkbookFactory.create(XSSFTestDataSamples.openSamplePackage("poc-xmlbomb.xlsx")).close();
        fail("Should catch exception due to entity expansion limitations");
    } catch (POIXMLException limitReached) {
        assertEntityLimitReached(limitReached);
    }

    // Part 2: the extractor factory path.
    try {
        final POITextExtractor textExtractor = ExtractorFactory.createExtractor(
                HSSFTestDataSamples.getSampleFile("poc-xmlbomb.xlsx"));
        try {
            assertNotNull(textExtractor);
            try {
                textExtractor.getText();
            } catch (IllegalStateException expected) {
                // expected due to shared strings expansion
            }
        } finally {
            textExtractor.close();
        }
    } catch (POIXMLException limitReached) {
        assertEntityLimitReached(limitReached);
    }
}
Also used : POITextExtractor(org.apache.poi.POITextExtractor) POIXMLException(org.apache.poi.POIXMLException) Workbook(org.apache.poi.ss.usermodel.Workbook) Test(org.junit.Test)

Example 10 with POITextExtractor

use of org.apache.poi.POITextExtractor in project poi by apache.

the class TestWordExtractor method testExtractorFromWord6Extractor.

/**
 * Builds a {@code Word6Extractor} over the "TestMickey.doc" sample and checks
 * that its metadata text extractor reports the expected TEMPLATE, SUBJECT,
 * MANAGER and COMPANY entries.
 *
 * Every resource is released in its own {@code finally} block, so a failure
 * while constructing or closing one of them does not leak the others.
 */
@Test
public void testExtractorFromWord6Extractor() throws Exception {
    InputStream is = POIDataSamples.getHPSFInstance().openResourceAsStream("TestMickey.doc");
    POIFSFileSystem fs;
    try {
        fs = new POIFSFileSystem(is);
    } finally {
        // The stream is no longer used after the filesystem is built;
        // close it even if the constructor throws.
        is.close();
    }
    try {
        Word6Extractor wExt = new Word6Extractor(fs);
        try {
            POITextExtractor ext = wExt.getMetadataTextExtractor();
            try {
                // Now overall
                String text = ext.getText();
                assertContains(text, "TEMPLATE = Normal");
                assertContains(text, "SUBJECT = sample subject");
                assertContains(text, "MANAGER = sample manager");
                assertContains(text, "COMPANY = sample company");
            } finally {
                ext.close();
            }
        } finally {
            wExt.close();
        }
    } finally {
        // Runs even if the Word6Extractor constructor or wExt.close() throws.
        fs.close();
    }
}
Also used : POITextExtractor(org.apache.poi.POITextExtractor) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) OPOIFSFileSystem(org.apache.poi.poifs.filesystem.OPOIFSFileSystem) POIFSFileSystem(org.apache.poi.poifs.filesystem.POIFSFileSystem) NPOIFSFileSystem(org.apache.poi.poifs.filesystem.NPOIFSFileSystem) Test(org.junit.Test)

Aggregations

POITextExtractor (org.apache.poi.POITextExtractor)18 Test (org.junit.Test)11 ExcelExtractor (org.apache.poi.hssf.extractor.ExcelExtractor)9 EventBasedExcelExtractor (org.apache.poi.hssf.extractor.EventBasedExcelExtractor)6 XSSFEventBasedExcelExtractor (org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor)5 XSSFExcelExtractor (org.apache.poi.xssf.extractor.XSSFExcelExtractor)5 FileInputStream (java.io.FileInputStream)4 InputStream (java.io.InputStream)4 OutlookTextExtactor (org.apache.poi.hsmf.extractor.OutlookTextExtactor)4 WordExtractor (org.apache.poi.hwpf.extractor.WordExtractor)4 XWPFWordExtractor (org.apache.poi.xwpf.extractor.XWPFWordExtractor)4 IOException (java.io.IOException)3 PowerPointExtractor (org.apache.poi.hslf.extractor.PowerPointExtractor)3 XSLFPowerPointExtractor (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor)3 Method (java.lang.reflect.Method)2 ArrayList (java.util.ArrayList)2 Matcher (java.util.regex.Matcher)2 Pattern (java.util.regex.Pattern)2 POIOLE2TextExtractor (org.apache.poi.POIOLE2TextExtractor)2 POIXMLException (org.apache.poi.POIXMLException)2