Search in sources :

Example 1 with POITextExtractor

use of org.apache.poi.POITextExtractor in project poi by apache.

The class OLE2ExtractorFactory, method createExtractor.

/**
 * Builds a text extractor for the document held in the given OLE2 directory.
 * Excel workbooks are handled directly; everything else is delegated, via
 * reflection, to the Scratchpad class, so the Scratchpad jar is generally
 * needed. Embedded OOXML resources are not checked here, use
 * {@link org.apache.poi.extractor.ExtractorFactory} for that.
 *
 * @param poifsDir the directory to look for a supported document in
 * @return the extractor for the document, never <code>null</code>
 * @throws IOException declared for the extractor constructors used here
 * @throws OldExcelFormatException if the directory holds an old format
 *  (1-95) Excel workbook
 * @throws IllegalArgumentException if no supported document is found, or
 *  if the Scratchpad extractor could not be created
 */
public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException {
    // Excel is handled here; the first matching workbook entry name wins
    for (String entryName : WORKBOOK_DIR_ENTRY_NAMES) {
        if (!poifsDir.hasEntry(entryName)) {
            continue;
        }
        if (getPreferEventExtractor()) {
            return new EventBasedExcelExtractor(poifsDir);
        } else {
            return new ExcelExtractor(poifsDir);
        }
    }

    // Old style workbooks are recognised, but not supported by this factory
    if (poifsDir.hasEntry(OLD_WORKBOOK_DIR_ENTRY_NAME)) {
        throw new OldExcelFormatException("Old Excel Spreadsheet format (1-95) "
                + "found. Please call OldExcelExtractor directly for basic text extraction");
    }

    // Anything else has to come from Scratchpad, looked up reflectively
    Class<?> scratchpad = getScratchpadClass();
    POITextExtractor fromScratchpad;
    try {
        Method factoryMethod = scratchpad.getDeclaredMethod("createExtractor", DirectoryNode.class);
        fromScratchpad = (POITextExtractor) factoryMethod.invoke(null, poifsDir);
    } catch (IllegalArgumentException iae) {
        // Passed on untouched rather than wrapped a second time
        throw iae;
    } catch (Exception e) {
        throw new IllegalArgumentException("Error creating Scratchpad Extractor", e);
    }

    if (fromScratchpad == null) {
        throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
    }
    return fromScratchpad;
}
Also used : POITextExtractor(org.apache.poi.POITextExtractor) ExcelExtractor(org.apache.poi.hssf.extractor.ExcelExtractor) EventBasedExcelExtractor(org.apache.poi.hssf.extractor.EventBasedExcelExtractor) Method(java.lang.reflect.Method) OldExcelFormatException(org.apache.poi.hssf.OldExcelFormatException) IOException(java.io.IOException) OldExcelFormatException(org.apache.poi.hssf.OldExcelFormatException) EventBasedExcelExtractor(org.apache.poi.hssf.extractor.EventBasedExcelExtractor)

Example 2 with POITextExtractor

use of org.apache.poi.POITextExtractor in project poi by apache.

The class ExtractorFactory, method getEmbededDocsTextExtractors.

/**
 * Opens one text extractor per document embedded in the file that the
 * given extractor was created from.
 * <p>
 * Embedded documents are looked up for Excel ("MBD..." entries under the
 * root), Word ("_..." entries under "ObjectPool") and Outlook (message
 * attachments). For any other kind of extractor nothing is searched.
 *
 * @param ext the extractor of the containing document
 * @return one open {@link POITextExtractor} per embedded document that
 *  could be handled, or an empty array if there are none
 * @throws IllegalStateException if the extractor has no root directory
 * @throws IOException if creating an embedded extractor fails; XML and
 *  OpenXML4J failures on attachment streams are wrapped into this too
 */
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, OpenXML4JException, XmlException {
    DirectoryEntry root = ext.getRoot();
    if (root == null) {
        throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
    }

    // Embedded documents stored as POIFS directories
    ArrayList<Entry> embeddedDirs = new ArrayList<Entry>();
    // Embedded documents only available as raw bytes
    ArrayList<InputStream> rawStreams = new ArrayList<InputStream>();

    if (ext instanceof ExcelExtractor) {
        // Stored as MBD... directly below the root
        collectEntriesWithPrefix(root, "MBD", embeddedDirs);
    } else if (ext instanceof WordExtractor) {
        // Stored as ObjectPool -> _... below the root
        try {
            DirectoryEntry objectPool = (DirectoryEntry) root.getEntry("ObjectPool");
            collectEntriesWithPrefix(objectPool, "_", embeddedDirs);
        } catch (FileNotFoundException e) {
            // Only logged, the lookup failure is deliberately not propagated
            logger.log(POILogger.INFO, "Ignoring FileNotFoundException while extracting Word document", e.getLocalizedMessage());
        }
    // TODO PowerPoint (PowerPointExtractor) - tricky, as the embedded
    //  documents are not stored directly in the poifs
    } else if (ext instanceof OutlookTextExtactor) {
        // Held in the attachment blocks of the message
        MAPIMessage message = ((OutlookTextExtactor) ext).getMAPIMessage();
        for (AttachmentChunks chunks : message.getAttachmentFiles()) {
            if (chunks.getAttachData() != null) {
                byte[] bytes = chunks.getAttachData().getValue();
                rawStreams.add(new ByteArrayInputStream(bytes));
            } else if (chunks.getAttachmentDirectory() != null) {
                embeddedDirs.add(chunks.getAttachmentDirectory().getDirectory());
            }
        }
    }

    // Nothing embedded at all
    if (embeddedDirs.isEmpty() && rawStreams.isEmpty()) {
        return new POITextExtractor[0];
    }

    ArrayList<POITextExtractor> extractors = new ArrayList<POITextExtractor>();
    for (int i = 0; i < embeddedDirs.size(); i++) {
        DirectoryNode node = (DirectoryNode) embeddedDirs.get(i);
        extractors.add(createExtractor(node));
    }
    for (int i = 0; i < rawStreams.size(); i++) {
        InputStream stream = rawStreams.get(i);
        try {
            extractors.add(createExtractor(stream));
        } catch (IllegalArgumentException e) {
            // Not a format that is supported as yet, skip this one
            logger.log(POILogger.INFO, "Format not supported yet", e.getLocalizedMessage());
        } catch (XmlException e) {
            throw new IOException(e.getMessage(), e);
        } catch (OpenXML4JException e) {
            throw new IOException(e.getMessage(), e);
        }
    }
    return extractors.toArray(new POITextExtractor[extractors.size()]);
}

/**
 * Appends every entry of the directory whose name starts with the given
 * prefix to the target list, in iteration order.
 */
private static void collectEntriesWithPrefix(DirectoryEntry directory, String prefix, ArrayList<Entry> target) {
    for (Iterator<Entry> entries = directory.getEntries(); entries.hasNext(); ) {
        Entry candidate = entries.next();
        if (candidate.getName().startsWith(prefix)) {
            target.add(candidate);
        }
    }
}
Also used : PushbackInputStream(java.io.PushbackInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) InputStream(java.io.InputStream) ArrayList(java.util.ArrayList) FileNotFoundException(java.io.FileNotFoundException) DirectoryNode(org.apache.poi.poifs.filesystem.DirectoryNode) IOException(java.io.IOException) DirectoryEntry(org.apache.poi.poifs.filesystem.DirectoryEntry) WordExtractor(org.apache.poi.hwpf.extractor.WordExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) MAPIMessage(org.apache.poi.hsmf.MAPIMessage) Entry(org.apache.poi.poifs.filesystem.Entry) DirectoryEntry(org.apache.poi.poifs.filesystem.DirectoryEntry) OutlookTextExtactor(org.apache.poi.hsmf.extractor.OutlookTextExtactor) OpenXML4JException(org.apache.poi.openxml4j.exceptions.OpenXML4JException) ByteArrayInputStream(java.io.ByteArrayInputStream) POITextExtractor(org.apache.poi.POITextExtractor) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) ExcelExtractor(org.apache.poi.hssf.extractor.ExcelExtractor) XSSFEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor) XSSFBEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFBEventBasedExcelExtractor) XmlException(org.apache.xmlbeans.XmlException) Iterator(java.util.Iterator) AttachmentChunks(org.apache.poi.hsmf.datatypes.AttachmentChunks)

Example 3 with POITextExtractor

use of org.apache.poi.POITextExtractor in project poi by apache.

The class CommandLineTextExtractor, method main.

/**
 * Prints the metadata text and the main text of every file named on the
 * command line to standard out, framed by divider lines and followed by
 * the character counts of both. Without any arguments, a usage message is
 * written to standard error and the JVM exits with status 1.
 *
 * @param args the names of the files to extract text from
 * @throws Exception if an extractor cannot be created, read or closed
 */
public static void main(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Use:");
        System.err.println("   CommandLineTextExtractor <filename> [filename] [filename]");
        System.exit(1);
    }

    for (int i = 0; i < args.length; i++) {
        File input = new File(args[i]);
        System.out.println(DIVIDER);
        System.out.println(input);

        POITextExtractor textExtractor = ExtractorFactory.createExtractor(input);
        try {
            final String innerDivider = "   " + DIVIDER;

            // Metadata goes first, between two indented dividers
            POITextExtractor metaExtractor = textExtractor.getMetadataTextExtractor();
            System.out.println(innerDivider);
            String metaText = metaExtractor.getText();
            System.out.println(metaText);
            System.out.println(innerDivider);

            // Then the text of the document itself
            String bodyText = textExtractor.getText();
            System.out.println(bodyText);
            System.out.println(DIVIDER);

            System.out.println("Had " + metaText.length() + " characters of metadata and " + bodyText.length() + " characters of text");
        } finally {
            textExtractor.close();
        }
    }
}
Also used : POITextExtractor(org.apache.poi.POITextExtractor) File(java.io.File)

Example 4 with POITextExtractor

use of org.apache.poi.POITextExtractor in project poi by apache.

The class TestExtractorFactory, method testInputStream.

/**
 * Checks that <code>ExtractorFactory.createExtractor</code>, when given an
 * input stream, returns the expected extractor type for each sample file
 * and that the extracted text has at least the expected length. A plain
 * text file must be rejected with an {@link IllegalArgumentException}.
 */
@Test
public void testInputStream() throws Exception {
    // Excel
    assertExtractorFromStream(new FileInputStream(xls), ExcelExtractor.class, 200);
    assertExtractorFromStream(new FileInputStream(xlsx), XSSFExcelExtractor.class, 200);
    // TODO Support OOXML-Strict, see bug #57699
    //        assertExtractorFromStream(new FileInputStream(xlsxStrict), XSSFExcelExtractor.class, 200);

    // Word
    assertExtractorFromStream(new FileInputStream(doc), WordExtractor.class, 120);
    assertExtractorFromStream(new FileInputStream(doc6), Word6Extractor.class, 20);
    assertExtractorFromStream(new FileInputStream(doc95), Word6Extractor.class, 120);
    assertExtractorFromStream(new FileInputStream(docx), XWPFWordExtractor.class, 120);

    // PowerPoint
    assertExtractorFromStream(new FileInputStream(ppt), PowerPointExtractor.class, 120);
    assertExtractorFromStream(new FileInputStream(pptx), XSLFPowerPointExtractor.class, 120);

    // Visio
    assertExtractorFromStream(new FileInputStream(vsd), VisioTextExtractor.class, 50);
    // Visio - vsdx
    assertExtractorFromStream(new FileInputStream(vsdx), XDGFVisioExtractor.class, 20);

    // Publisher
    assertExtractorFromStream(new FileInputStream(pub), PublisherTextExtractor.class, 50);

    // Outlook msg
    assertExtractorFromStream(new FileInputStream(msg), OutlookTextExtactor.class, 50);

    // Text
    try {
        FileInputStream stream = new FileInputStream(txt);
        try {
            ExtractorFactory.createExtractor(stream);
            fail("A plain text file should not yield an extractor");
        } finally {
            IOUtils.closeQuietly(stream);
        }
    } catch (IllegalArgumentException e) {
    // Good
    }
}

/**
 * Creates an extractor from the stream and asserts that it is an instance
 * of the expected type and yields more than the given number of
 * characters of text. Both the extractor and the stream are always
 * closed, even when an assertion fails, so that no file handle is leaked.
 *
 * @param stream the already opened sample file, closed by this method
 * @param expectedType the type the created extractor must be an instance of
 * @param minTextLength the length the extracted text has to exceed
 */
private static void assertExtractorFromStream(FileInputStream stream, Class<?> expectedType, int minTextLength) throws Exception {
    try {
        POITextExtractor extractor = ExtractorFactory.createExtractor(stream);
        try {
            // The actual class name is the failure message, to ease diagnosis
            assertTrue(extractor.getClass().getName(), expectedType.isInstance(extractor));
            assertTrue(extractor.getText().length() > minTextLength);
        } finally {
            extractor.close();
        }
    } finally {
        IOUtils.closeQuietly(stream);
    }
}
Also used : XDGFVisioExtractor(org.apache.poi.xdgf.extractor.XDGFVisioExtractor) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) Word6Extractor(org.apache.poi.hwpf.extractor.Word6Extractor) PowerPointExtractor(org.apache.poi.hslf.extractor.PowerPointExtractor) XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) PublisherTextExtractor(org.apache.poi.hpbf.extractor.PublisherTextExtractor) FileInputStream(java.io.FileInputStream) WordExtractor(org.apache.poi.hwpf.extractor.WordExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) OutlookTextExtactor(org.apache.poi.hsmf.extractor.OutlookTextExtactor) XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) POITextExtractor(org.apache.poi.POITextExtractor) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) ExcelExtractor(org.apache.poi.hssf.extractor.ExcelExtractor) XSSFEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor) EventBasedExcelExtractor(org.apache.poi.hssf.extractor.EventBasedExcelExtractor) VisioTextExtractor(org.apache.poi.hdgf.extractor.VisioTextExtractor) Test(org.junit.Test)

Example 5 with POITextExtractor

use of org.apache.poi.POITextExtractor in project poi by apache.

The class TestExtractorFactory, method testEmbeded.

/**
 * Exercises the extraction of text from embedded documents, for Excel,
 * Word and Outlook containers. For each container the number of embedded
 * extractors and their types are checked.
 */
@Test
public void testEmbeded() throws Exception {
    POIOLE2TextExtractor parent;
    POITextExtractor[] embedded;
    int[] tally;

    // A file with nothing embedded in it
    parent = (POIOLE2TextExtractor) ExtractorFactory.createExtractor(xls);
    embedded = ExtractorFactory.getEmbededDocsTextExtractors(parent);
    assertEquals(0, embedded.length);
    parent.close();

    // Excel
    parent = (POIOLE2TextExtractor) ExtractorFactory.createExtractor(xlsEmb);
    embedded = ExtractorFactory.getEmbededDocsTextExtractors(parent);
    assertEquals(6, embedded.length);
    tally = tallyEmbeddedTypes(embedded);
    assertEquals(2, tally[TALLY_PPT]);
    assertEquals(2, tally[TALLY_XLS]);
    assertEquals(2, tally[TALLY_WORD]);
    assertEquals(0, tally[TALLY_MSG]);
    parent.close();

    // Word
    parent = (POIOLE2TextExtractor) ExtractorFactory.createExtractor(docEmb);
    embedded = ExtractorFactory.getEmbededDocsTextExtractors(parent);
    assertEquals(4, embedded.length);
    tally = tallyEmbeddedTypes(embedded);
    assertEquals(1, tally[TALLY_PPT]);
    assertEquals(2, tally[TALLY_XLS]);
    assertEquals(1, tally[TALLY_WORD]);
    assertEquals(0, tally[TALLY_MSG]);
    parent.close();

    // Word which contains an OOXML file
    parent = (POIOLE2TextExtractor) ExtractorFactory.createExtractor(docEmbOOXML);
    embedded = ExtractorFactory.getEmbededDocsTextExtractors(parent);
    assertEquals(3, embedded.length);
    tally = tallyEmbeddedTypes(embedded);
    assertEquals(1, tally[TALLY_PPT]);
    assertEquals(1, tally[TALLY_XLS]);
    assertEquals(0, tally[TALLY_WORD]);
    assertEquals(1, tally[TALLY_WORDX]);
    assertEquals(0, tally[TALLY_MSG]);
    parent.close();

    // Outlook
    parent = (OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmb);
    embedded = ExtractorFactory.getEmbededDocsTextExtractors(parent);
    assertEquals(1, embedded.length);
    tally = tallyEmbeddedTypes(embedded);
    assertEquals(0, tally[TALLY_PPT]);
    assertEquals(0, tally[TALLY_XLS]);
    assertEquals(1, tally[TALLY_WORD]);
    assertEquals(0, tally[TALLY_MSG]);
    parent.close();

    // Outlook with another outlook file in it
    parent = (OutlookTextExtactor) ExtractorFactory.createExtractor(msgEmbMsg);
    embedded = ExtractorFactory.getEmbededDocsTextExtractors(parent);
    assertEquals(1, embedded.length);
    tally = tallyEmbeddedTypes(embedded);
    assertEquals(0, tally[TALLY_PPT]);
    assertEquals(0, tally[TALLY_XLS]);
    assertEquals(0, tally[TALLY_WORD]);
    assertEquals(1, tally[TALLY_MSG]);
    parent.close();

    // TODO - PowerPoint
    // TODO - Publisher
    // TODO - Visio
}

// Slots of the array returned by tallyEmbeddedTypes
private static final int TALLY_PPT = 0;
private static final int TALLY_XLS = 1;
private static final int TALLY_WORD = 2;
private static final int TALLY_MSG = 3;
private static final int TALLY_WORDX = 4;

/**
 * Asserts that every extractor yields more than 20 characters of text,
 * and counts the extractors by type. An extractor is counted for the
 * first matching type only, tested in the order PowerPoint, Excel, Word,
 * Outlook, XWPF Word; extractors of any other type are not counted.
 *
 * @param extractors the extractors of the embedded documents
 * @return the counts, indexed by the <code>TALLY_*</code> constants
 */
private static int[] tallyEmbeddedTypes(POITextExtractor[] extractors) throws Exception {
    int[] tally = new int[5];
    for (int i = 0; i < extractors.length; i++) {
        POITextExtractor current = extractors[i];
        assertTrue(current.getText().length() > 20);
        if (current instanceof PowerPointExtractor) {
            tally[TALLY_PPT]++;
        } else if (current instanceof ExcelExtractor) {
            tally[TALLY_XLS]++;
        } else if (current instanceof WordExtractor) {
            tally[TALLY_WORD]++;
        } else if (current instanceof OutlookTextExtactor) {
            tally[TALLY_MSG]++;
        } else if (current instanceof XWPFWordExtractor) {
            tally[TALLY_WORDX]++;
        }
    }
    return tally;
}
Also used : OutlookTextExtactor(org.apache.poi.hsmf.extractor.OutlookTextExtactor) POITextExtractor(org.apache.poi.POITextExtractor) PowerPointExtractor(org.apache.poi.hslf.extractor.PowerPointExtractor) XSLFPowerPointExtractor(org.apache.poi.xslf.extractor.XSLFPowerPointExtractor) XSSFExcelExtractor(org.apache.poi.xssf.extractor.XSSFExcelExtractor) ExcelExtractor(org.apache.poi.hssf.extractor.ExcelExtractor) XSSFEventBasedExcelExtractor(org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor) EventBasedExcelExtractor(org.apache.poi.hssf.extractor.EventBasedExcelExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) POIOLE2TextExtractor(org.apache.poi.POIOLE2TextExtractor) WordExtractor(org.apache.poi.hwpf.extractor.WordExtractor) XWPFWordExtractor(org.apache.poi.xwpf.extractor.XWPFWordExtractor) Test(org.junit.Test)

Aggregations

POITextExtractor (org.apache.poi.POITextExtractor)18 Test (org.junit.Test)11 ExcelExtractor (org.apache.poi.hssf.extractor.ExcelExtractor)9 EventBasedExcelExtractor (org.apache.poi.hssf.extractor.EventBasedExcelExtractor)6 XSSFEventBasedExcelExtractor (org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor)5 XSSFExcelExtractor (org.apache.poi.xssf.extractor.XSSFExcelExtractor)5 FileInputStream (java.io.FileInputStream)4 InputStream (java.io.InputStream)4 OutlookTextExtactor (org.apache.poi.hsmf.extractor.OutlookTextExtactor)4 WordExtractor (org.apache.poi.hwpf.extractor.WordExtractor)4 XWPFWordExtractor (org.apache.poi.xwpf.extractor.XWPFWordExtractor)4 IOException (java.io.IOException)3 PowerPointExtractor (org.apache.poi.hslf.extractor.PowerPointExtractor)3 XSLFPowerPointExtractor (org.apache.poi.xslf.extractor.XSLFPowerPointExtractor)3 Method (java.lang.reflect.Method)2 ArrayList (java.util.ArrayList)2 Matcher (java.util.regex.Matcher)2 Pattern (java.util.regex.Pattern)2 POIOLE2TextExtractor (org.apache.poi.POIOLE2TextExtractor)2 POIXMLException (org.apache.poi.POIXMLException)2