Search in sources :

Example 16 with DirectoryNode

use of org.apache.poi.poifs.filesystem.DirectoryNode in project poi by apache.

the class WordToTextConverter method processOle2.

/**
 * Appends the text content of an embedded OLE2 object to {@code block}.
 *
 * <p>Entries that are not a {@link DirectoryNode} are rejected. A directory
 * holding a {@code WordDocument} entry is converted directly through
 * {@link WordToTextConverter#getText}; every other directory is handed to
 * {@code org.apache.poi.extractor.ExtractorFactory}, which is looked up and
 * invoked reflectively. The extracted text is wrapped in
 * {@code UNICODECHAR_ZERO_WIDTH_SPACE} on both sides before being appended.
 *
 * @param wordDocument the owning document (not read by this method)
 * @param block element that receives the created text node
 * @param entry the OLE2 entry to process
 * @return {@code true} if text was appended to {@code block},
 *         {@code false} if the entry was skipped or extraction failed
 */
@Override
protected boolean processOle2(HWPFDocument wordDocument, Element block, Entry entry) throws Exception {
    if (!(entry instanceof DirectoryNode)) {
        return false;
    }
    final DirectoryNode oleDirectory = (DirectoryNode) entry;

    // Embedded Word objects are handled without the reflective lookup below,
    // so they stay supported even if ExtractorFactory is not on the classpath.
    if (oleDirectory.hasEntry("WordDocument")) {
        final String wordText = WordToTextConverter.getText(oleDirectory);
        final String wrapped = UNICODECHAR_ZERO_WIDTH_SPACE + wordText + UNICODECHAR_ZERO_WIDTH_SPACE;
        block.appendChild(textDocumentFacade.createText(wrapped));
        return true;
    }

    // Any other object type: resolve ExtractorFactory reflectively.
    final Object textExtractor;
    try {
        final Class<?> factoryClass = Class.forName("org.apache.poi.extractor.ExtractorFactory");
        final Method factoryMethod = factoryClass.getMethod("createExtractor", DirectoryNode.class);
        textExtractor = factoryMethod.invoke(null, oleDirectory);
    } catch (Exception exc) {
        // The factory is missing, or it could not build an extractor for this entry.
        logger.log(POILogger.WARN, "There is an OLE object entry '", entry.getName(), "', but there is no text extractor for this object type ", "or text extractor factory is not available: ", "" + exc);
        return false;
    }

    try {
        final Method textGetter = textExtractor.getClass().getMethod("getText");
        final String extracted = (String) textGetter.invoke(textExtractor);
        final String wrapped = UNICODECHAR_ZERO_WIDTH_SPACE + extracted + UNICODECHAR_ZERO_WIDTH_SPACE;
        block.appendChild(textDocumentFacade.createText(wrapped));
        return true;
    } catch (Exception exc) {
        logger.log(POILogger.ERROR, "Unable to extract text from OLE entry '", entry.getName(), "': ", exc, exc);
        return false;
    }
}
Also used : DirectoryNode(org.apache.poi.poifs.filesystem.DirectoryNode) Method(java.lang.reflect.Method) ParserConfigurationException(javax.xml.parsers.ParserConfigurationException)

Example 17 with DirectoryNode

use of org.apache.poi.poifs.filesystem.DirectoryNode in project poi by apache.

the class TestWordExtractor method testDifferentPOIFS.

/**
 * Tests that we can work with both {@link OPOIFSFileSystem}
 *  and {@link NPOIFSFileSystem}.
 *
 * <p>The same document is opened through both filesystem implementations and
 * its text is extracted twice per filesystem: once straight from the root
 * {@link DirectoryNode}, and once through an intermediate {@link HWPFDocument}.
 * The input stream and the file-backed filesystem are released in
 * {@code finally} blocks so a failing assertion does not leak them.
 */
@Test
public void testDifferentPOIFS() throws Exception {
    // Open the two filesystems
    File file = docTests.getFile("test2.doc");
    OPOIFSFileSystem opoifs;
    InputStream is = new FileInputStream(file);
    try {
        opoifs = new OPOIFSFileSystem(is);
    } finally {
        // Close the stream even if the filesystem could not be parsed
        is.close();
    }
    NPOIFSFileSystem npoifs = new NPOIFSFileSystem(file);
    try {
        DirectoryNode[] files = { opoifs.getRoot(), npoifs.getRoot() };
        // Open directly
        for (DirectoryNode dir : files) {
            // NOTE(review): this extractor is deliberately left open, as in the
            // original test - presumably closing it would also close the
            // filesystem the second loop still needs; confirm before changing.
            @SuppressWarnings("resource") WordExtractor extractor = new WordExtractor(dir);
            assertEqualsTrim(p_text1_block, extractor.getText());
        }
        // Open via a HWPFDocument
        for (DirectoryNode dir : files) {
            HWPFDocument doc = new HWPFDocument(dir);
            WordExtractor extractor = new WordExtractor(doc);
            try {
                assertEqualsTrim(p_text1_block, extractor.getText());
            } finally {
                extractor.close();
            }
        }
    } finally {
        npoifs.close();
    }
}
Also used : HWPFDocument(org.apache.poi.hwpf.HWPFDocument) NPOIFSFileSystem(org.apache.poi.poifs.filesystem.NPOIFSFileSystem) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) DirectoryNode(org.apache.poi.poifs.filesystem.DirectoryNode) OPOIFSFileSystem(org.apache.poi.poifs.filesystem.OPOIFSFileSystem) File(java.io.File) Test(org.junit.Test)

Example 18 with DirectoryNode

use of org.apache.poi.poifs.filesystem.DirectoryNode in project poi by apache.

the class TestWordExtractor method testExtractFromEmbeded.

/**
 * Test that we can get data from two different embedded word documents.
 *
 * <p>Both documents are read from directory entries of
 * {@code excel_with_embeded.xls}. Every resource opened here is released in a
 * {@code finally} block so a failing assertion does not leak the extractors,
 * the documents or the filesystem.
 */
@Test
public void testExtractFromEmbeded() throws IOException {
    POIFSFileSystem fs = null;
    HWPFDocument docA = null, docB = null;
    WordExtractor extractorA = null, extractorB = null;
    try {
        InputStream is = POIDataSamples.getSpreadSheetInstance().openResourceAsStream("excel_with_embeded.xls");
        try {
            fs = new POIFSFileSystem(is);
        } finally {
            // Close the stream even if the filesystem could not be parsed
            is.close();
        }
        DirectoryNode dirA = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B7");
        DirectoryNode dirB = (DirectoryNode) fs.getRoot().getEntry("MBD0000A3B2");
        // Should have WordDocument and 1Table
        assertNotNull(dirA.getEntry("1Table"));
        assertNotNull(dirA.getEntry("WordDocument"));
        assertNotNull(dirB.getEntry("1Table"));
        assertNotNull(dirB.getEntry("WordDocument"));
        // Check each in turn
        docA = new HWPFDocument(dirA);
        extractorA = new WordExtractor(docA);
        assertNotNull(extractorA.getText());
        assertTrue(extractorA.getText().length() > 20);
        assertEqualsTrim("I am a sample document\r\nNot much on me\r\nI am document 1\r\n", extractorA.getText());
        assertEquals("Sample Doc 1", extractorA.getSummaryInformation().getTitle());
        assertEquals("Sample Test", extractorA.getSummaryInformation().getSubject());
        docB = new HWPFDocument(dirB);
        extractorB = new WordExtractor(docB);
        assertNotNull(extractorB.getText());
        assertTrue(extractorB.getText().length() > 20);
        assertEqualsTrim("I am another sample document\r\nNot much on me\r\nI am document 2\r\n", extractorB.getText());
        assertEquals("Sample Doc 2", extractorB.getSummaryInformation().getTitle());
        assertEquals("Another Sample Test", extractorB.getSummaryInformation().getSubject());
    } finally {
        // Same close order as before; null guards cover early failures
        if (extractorA != null) {
            extractorA.close();
        }
        if (docA != null) {
            docA.close();
        }
        if (extractorB != null) {
            extractorB.close();
        }
        if (docB != null) {
            docB.close();
        }
        if (fs != null) {
            fs.close();
        }
    }
}
Also used : HWPFDocument(org.apache.poi.hwpf.HWPFDocument) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) OPOIFSFileSystem(org.apache.poi.poifs.filesystem.OPOIFSFileSystem) POIFSFileSystem(org.apache.poi.poifs.filesystem.POIFSFileSystem) NPOIFSFileSystem(org.apache.poi.poifs.filesystem.NPOIFSFileSystem) DirectoryNode(org.apache.poi.poifs.filesystem.DirectoryNode) Test(org.junit.Test)

Example 19 with DirectoryNode

use of org.apache.poi.poifs.filesystem.DirectoryNode in project poi by apache.

the class TestExcelExtractor method testWithEmbededInOwn.

/**
 * Extracts text from two Excel workbooks embedded inside another Excel
 * file, and then from the containing file itself.
 */
@Test
public void testWithEmbededInOwn() throws Exception {
    POIDataSamples samples = POIDataSamples.getSpreadSheetInstance();
    POIFSFileSystem fileSystem = null;
    HSSFWorkbook firstWorkbook = null;
    HSSFWorkbook secondWorkbook = null;
    ExcelExtractor firstExtractor = null;
    ExcelExtractor secondExtractor = null;
    ExcelExtractor outerExtractor = null;
    try {
        fileSystem = new POIFSFileSystem(samples.getFile("excel_with_embeded.xls"));
        DirectoryNode firstDir = (DirectoryNode) fileSystem.getRoot().getEntry("MBD0000A3B5");
        DirectoryNode secondDir = (DirectoryNode) fileSystem.getRoot().getEntry("MBD0000A3B4");

        firstWorkbook = new HSSFWorkbook(firstDir, fileSystem, true);
        secondWorkbook = new HSSFWorkbook(secondDir, fileSystem, true);
        firstExtractor = new ExcelExtractor(firstWorkbook);
        secondExtractor = new ExcelExtractor(secondWorkbook);

        // First embedded workbook
        assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n", firstExtractor.getText());
        assertEquals("Sample Excel", firstExtractor.getSummaryInformation().getTitle());

        // Second embedded workbook
        assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n", secondExtractor.getText());
        assertEquals("Sample Excel 2", secondExtractor.getSummaryInformation().getTitle());

        // The containing workbook
        outerExtractor = new ExcelExtractor(fileSystem);
        assertEquals("Sheet1\nI have lots of embeded files in me\nSheet2\nSheet3\n", outerExtractor.getText());
        assertEquals("Excel With Embeded", outerExtractor.getSummaryInformation().getTitle());
    } finally {
        // Release in reverse order of creation; anything never opened is still null
        if (outerExtractor != null) {
            outerExtractor.close();
        }
        if (secondExtractor != null) {
            secondExtractor.close();
        }
        if (firstExtractor != null) {
            firstExtractor.close();
        }
        if (secondWorkbook != null) {
            secondWorkbook.close();
        }
        if (firstWorkbook != null) {
            firstWorkbook.close();
        }
        if (fileSystem != null) {
            fileSystem.close();
        }
    }
}
Also used : POIFSFileSystem(org.apache.poi.poifs.filesystem.POIFSFileSystem) POIDataSamples(org.apache.poi.POIDataSamples) DirectoryNode(org.apache.poi.poifs.filesystem.DirectoryNode) HSSFWorkbook(org.apache.poi.hssf.usermodel.HSSFWorkbook) Test(org.junit.Test)

Example 20 with DirectoryNode

use of org.apache.poi.poifs.filesystem.DirectoryNode in project poi by apache.

the class TestExcelExtractor method testWithEmbeded.

/**
 * Extracts text from two Excel workbooks stored in the {@code ObjectPool}
 * directory of a Word document.
 */
@Test
public void testWithEmbeded() throws Exception {
    POIFSFileSystem fileSystem = null;
    HSSFWorkbook firstWorkbook = null;
    HSSFWorkbook secondWorkbook = null;
    ExcelExtractor firstExtractor = null;
    ExcelExtractor secondExtractor = null;
    try {
        fileSystem = new POIFSFileSystem(POIDataSamples.getDocumentInstance().getFile("word_with_embeded.doc"));
        DirectoryNode pool = (DirectoryNode) fileSystem.getRoot().getEntry("ObjectPool");
        DirectoryNode firstDir = (DirectoryNode) pool.getEntry("_1269427460");
        DirectoryNode secondDir = (DirectoryNode) pool.getEntry("_1269427461");

        firstWorkbook = new HSSFWorkbook(firstDir, fileSystem, true);
        firstExtractor = new ExcelExtractor(firstWorkbook);
        secondWorkbook = new HSSFWorkbook(secondDir, fileSystem, true);
        secondExtractor = new ExcelExtractor(secondWorkbook);

        // First embedded workbook
        assertEquals("Sheet1\nTest excel file\nThis is the first file\nSheet2\nSheet3\n", firstExtractor.getText());
        assertEquals("Sample Excel", firstExtractor.getSummaryInformation().getTitle());

        // Second embedded workbook
        assertEquals("Sheet1\nAnother excel file\nThis is the second file\nSheet2\nSheet3\n", secondExtractor.getText());
        assertEquals("Sample Excel 2", secondExtractor.getSummaryInformation().getTitle());
    } finally {
        // Release in reverse order of creation; anything never opened is still null
        if (secondExtractor != null) {
            secondExtractor.close();
        }
        if (secondWorkbook != null) {
            secondWorkbook.close();
        }
        if (firstExtractor != null) {
            firstExtractor.close();
        }
        if (firstWorkbook != null) {
            firstWorkbook.close();
        }
        if (fileSystem != null) {
            fileSystem.close();
        }
    }
}
Also used : POIFSFileSystem(org.apache.poi.poifs.filesystem.POIFSFileSystem) DirectoryNode(org.apache.poi.poifs.filesystem.DirectoryNode) HSSFWorkbook(org.apache.poi.hssf.usermodel.HSSFWorkbook) Test(org.junit.Test)

Aggregations

DirectoryNode (org.apache.poi.poifs.filesystem.DirectoryNode)47 Test (org.junit.Test)16 InputStream (java.io.InputStream)15 POIFSFileSystem (org.apache.poi.poifs.filesystem.POIFSFileSystem)13 NPOIFSFileSystem (org.apache.poi.poifs.filesystem.NPOIFSFileSystem)12 Entry (org.apache.poi.poifs.filesystem.Entry)9 ByteArrayInputStream (java.io.ByteArrayInputStream)8 ByteArrayOutputStream (java.io.ByteArrayOutputStream)8 IOException (java.io.IOException)8 OPOIFSFileSystem (org.apache.poi.poifs.filesystem.OPOIFSFileSystem)6 FileInputStream (java.io.FileInputStream)5 FileNotFoundException (java.io.FileNotFoundException)5 DocumentInputStream (org.apache.poi.poifs.filesystem.DocumentInputStream)5 HSSFWorkbook (org.apache.poi.hssf.usermodel.HSSFWorkbook)4 HWPFDocument (org.apache.poi.hwpf.HWPFDocument)4 File (java.io.File)3 ArrayList (java.util.ArrayList)3 AttachmentChunks (org.apache.poi.hsmf.datatypes.AttachmentChunks)3 DirectoryEntry (org.apache.poi.poifs.filesystem.DirectoryEntry)3 DocumentEntry (org.apache.poi.poifs.filesystem.DocumentEntry)3