Search in sources :

Example 11 with XWPFHeaderFooterPolicy

use of org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy in project poi by apache.

the class XWPFWordExtractor method getText.

/**
 * Builds the plain-text form of the document: header text first, then the
 * text of every body element (each followed by a newline), then footer text.
 *
 * @return the accumulated text
 */
public String getText() {
    final StringBuffer out = new StringBuffer();
    final XWPFHeaderFooterPolicy policy = document.getHeaderFooterPolicy();

    // Headers lead the output
    extractHeaders(out, policy);

    // Body elements follow, one per line, in the order the document returns them
    for (IBodyElement element : document.getBodyElements()) {
        appendBodyElementText(out, element);
        out.append('\n');
    }

    // Footers close the output
    extractFooters(out, policy);

    return out.toString();
}
Also used : IBodyElement(org.apache.poi.xwpf.usermodel.IBodyElement) XWPFHeaderFooterPolicy(org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy)

Example 12 with XWPFHeaderFooterPolicy

use of org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy in project tika by apache.

the class XWPFWordExtractorDecorator method extractParagraph.

/**
 * Writes a single paragraph to the XHTML handler.
 * <p>
 * In order, this method:
 * <ol>
 *   <li>emits section headers when the paragraph carries its own section
 *       properties (the matching footers are emitted at the very end);</li>
 *   <li>opens the paragraph element, using the tag and optional CSS class
 *       derived from the paragraph's style name ({@code p} by default);</li>
 *   <li>writes the paragraph number via {@code writeParagraphNumber};</li>
 *   <li>emits an empty {@code <div class="embedded" id="...">} placeholder for
 *       each embedded object found in the runs;</li>
 *   <li>emits an empty {@code <a name="...">} for each bookmark start;</li>
 *   <li>processes every run, opening and closing {@code <a href>} elements
 *       around hyperlink runs;</li>
 *   <li>appends comment text and footnote text;</li>
 *   <li>recurses into text-box paragraphs when shape-based content is enabled
 *       in the config;</li>
 *   <li>closes the paragraph element.</li>
 * </ol>
 *
 * @param paragraph   the paragraph to extract
 * @param listManager passed through to the numbering and header/footer helpers
 * @param xhtml       the handler that receives the generated XHTML events
 * @throws SAXException declared by the XHTML handler calls
 * @throws XmlException declared for re-parsing text-box paragraphs
 * @throws IOException  declared by the helpers this method delegates to
 */
private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager, XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException {
    // If this paragraph is actually a whole new section, then
    //  it could have its own headers and footers
    // Check and handle if so
    XWPFHeaderFooterPolicy headerFooterPolicy = null;
    if (paragraph.getCTP().getPPr() != null) {
        CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
        if (ctSectPr != null) {
            headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
            extractHeaders(xhtml, headerFooterPolicy, listManager);
        }
    }
    // Is this a paragraph, or a heading?
    String tag = "p";
    String styleClass = null;
    //TIKA-2144 check that styles is not null
    if (paragraph.getStyleID() != null && styles != null) {
        XWPFStyle style = styles.getStyle(paragraph.getStyleID());
        if (style != null && style.getName() != null) {
            TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(style.getName(), paragraph.getPartType() == BodyType.TABLECELL);
            tag = tas.getTag();
            styleClass = tas.getStyleClass();
        }
    }
    if (styleClass == null) {
        xhtml.startElement(tag);
    } else {
        xhtml.startElement(tag, "class", styleClass);
    }
    writeParagraphNumber(paragraph, listManager, xhtml);
    // Output a placeholder for every embedded object in the runs
    // TODO: replace w/ XPath/XQuery:
    for (XWPFRun run : paragraph.getRuns()) {
        XmlCursor c = run.getCTR().newCursor();
        // try/finally so the cursor is released even if the handler throws
        try {
            c.selectPath("./*");
            while (c.toNextSelection()) {
                XmlObject o = c.getObject();
                if (o instanceof CTObject) {
                    XmlCursor c2 = o.newCursor();
                    try {
                        c2.selectPath("./*");
                        while (c2.toNextSelection()) {
                            XmlObject o2 = c2.getObject();
                            XmlObject embedAtt = o2.selectAttribute(new QName("Type"));
                            // Constant-first equals: a null node value must not throw
                            if (embedAtt != null && "Embed".equals(embedAtt.getDomNode().getNodeValue())) {
                                // Type is "Embed"
                                XmlObject relIDAtt = o2.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
                                if (relIDAtt != null) {
                                    String relID = relIDAtt.getDomNode().getNodeValue();
                                    AttributesImpl attributes = new AttributesImpl();
                                    attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                                    attributes.addAttribute("", "id", "id", "CDATA", relID);
                                    xhtml.startElement("div", attributes);
                                    xhtml.endElement("div");
                                }
                            }
                        }
                    } finally {
                        c2.dispose();
                    }
                }
            }
        } finally {
            c.dispose();
        }
    }
    // Attach the bookmarks of this paragraph: they are all written here,
    //  at the start of the paragraph, as empty named anchors
    for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); i++) {
        CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i);
        xhtml.startElement("a", "name", bookmark.getName());
        xhtml.endElement("a");
    }
    TmpFormatting fmtg = new TmpFormatting(false, false);
    //hyperlinks may or may not have hyperlink ids
    String lastHyperlinkId = null;
    boolean inHyperlink = false;
    // Do the iruns
    for (IRunElement run : paragraph.getIRuns()) {
        if (run instanceof XWPFHyperlinkRun) {
            XWPFHyperlinkRun hyperlinkRun = (XWPFHyperlinkRun) run;
            // A run with no id, or with a different id, starts a new hyperlink
            if (hyperlinkRun.getHyperlinkId() == null || !hyperlinkRun.getHyperlinkId().equals(lastHyperlinkId)) {
                if (inHyperlink) {
                    //close out the old one
                    xhtml.endElement("a");
                    inHyperlink = false;
                }
                lastHyperlinkId = hyperlinkRun.getHyperlinkId();
                fmtg = closeStyleTags(xhtml, fmtg);
                XWPFHyperlink link = hyperlinkRun.getHyperlink(document);
                if (link != null && link.getURL() != null) {
                    xhtml.startElement("a", "href", link.getURL());
                    inHyperlink = true;
                } else if (hyperlinkRun.getAnchor() != null && hyperlinkRun.getAnchor().length() > 0) {
                    xhtml.startElement("a", "href", "#" + hyperlinkRun.getAnchor());
                    inHyperlink = true;
                }
            }
        } else if (inHyperlink) {
            //if this isn't a hyperlink, but the last one was
            // Keep the returned formatting state, as every other call site does
            fmtg = closeStyleTags(xhtml, fmtg);
            xhtml.endElement("a");
            lastHyperlinkId = null;
            inHyperlink = false;
        }
        if (run instanceof XWPFSDT) {
            fmtg = closeStyleTags(xhtml, fmtg);
            processSDTRun((XWPFSDT) run, xhtml);
            //for now, we're ignoring formatting in sdt
            //if you hit an sdt reset to false
            fmtg.setBold(false);
            fmtg.setItalic(false);
        } else {
            fmtg = processRun((XWPFRun) run, paragraph, xhtml, fmtg);
        }
    }
    closeStyleTags(xhtml, fmtg);
    if (inHyperlink) {
        xhtml.endElement("a");
    }
    // Now do any comments for the paragraph
    XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null);
    String commentText = comments.getCommentText();
    if (commentText != null && commentText.length() > 0) {
        xhtml.characters(commentText);
    }
    String footnoteText = paragraph.getFootnoteText();
    if (footnoteText != null && footnoteText.length() > 0) {
        xhtml.characters(footnoteText + "\n");
    }
    // Also extract any paragraphs embedded in text boxes:
    if (config.getIncludeShapeBasedContent()) {
        for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
            // Each text-box paragraph is re-parsed and extracted recursively,
            //  so its output nests inside the current paragraph element
            extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml);
        }
    }
    // Finish this paragraph
    xhtml.endElement(tag);
    // Footers of a section-ending paragraph go after the paragraph itself
    if (headerFooterPolicy != null) {
        extractFooters(xhtml, headerFooterPolicy, listManager);
    }
}
Also used : XWPFHyperlink(org.apache.poi.xwpf.usermodel.XWPFHyperlink) XWPFParagraph(org.apache.poi.xwpf.usermodel.XWPFParagraph) XWPFCommentsDecorator(org.apache.poi.xwpf.model.XWPFCommentsDecorator) XWPFStyle(org.apache.poi.xwpf.usermodel.XWPFStyle) QName(javax.xml.namespace.QName) XWPFHyperlinkRun(org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun) CTBookmark(org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark) XWPFHeaderFooterPolicy(org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy) CTSectPr(org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr) XWPFSDT(org.apache.poi.xwpf.usermodel.XWPFSDT) XmlCursor(org.apache.xmlbeans.XmlCursor) CTObject(org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject) AttributesImpl(org.xml.sax.helpers.AttributesImpl) XWPFRun(org.apache.poi.xwpf.usermodel.XWPFRun) TagAndStyle(org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle) XmlObject(org.apache.xmlbeans.XmlObject) IRunElement(org.apache.poi.xwpf.usermodel.IRunElement)

Example 13 with XWPFHeaderFooterPolicy

use of org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy in project poi by apache.

the class TestXWPFHeader method testImageInHeader.

/**
 * Opens the "headerPic.docx" sample and checks that its default header
 * exposes exactly one relation.
 */
@Test
public void testImageInHeader() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("headerPic.docx");
    XWPFHeader defaultHeader = doc.getHeaderFooterPolicy().getDefaultHeader();

    // The relation list must exist and hold a single entry
    assertNotNull(defaultHeader.getRelations());
    assertEquals(1, defaultHeader.getRelations().size());
}
Also used : XWPFHeaderFooterPolicy(org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy) Test(org.junit.Test)

Example 14 with XWPFHeaderFooterPolicy

use of org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy in project poi by apache.

the class TestXWPFHeader method testSetHeader.

/**
 * Creates default and first-page headers and footers on a document that has
 * none, checks their content in memory, then writes the document out, reads
 * it back, and checks the content again.
 */
@Test
public void testSetHeader() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("SampleDoc.docx");

    // The sample starts out with no headers or footers
    XWPFHeaderFooterPolicy hfPolicy = doc.getHeaderFooterPolicy();
    assertNull(hfPolicy.getDefaultHeader());
    assertNull(hfPolicy.getFirstPageHeader());
    assertNull(hfPolicy.getDefaultFooter());
    assertNull(hfPolicy.getFirstPageFooter());

    // One paragraph of text destined for the default header
    String headerText = "Paragraph in header";
    CTP headerCtp = CTP.Factory.newInstance();
    headerCtp.addNewR().addNewT().setStringValue(headerText);

    // Two paragraphs of text destined for the default footer
    String footerText1 = "First paragraph for the footer";
    String footerText2 = "Second paragraph for the footer";
    CTP footerCtp1 = CTP.Factory.newInstance();
    footerCtp1.addNewR().addNewT().setStringValue(footerText1);
    CTP footerCtp2 = CTP.Factory.newInstance();
    footerCtp2.addNewR().addNewT().setStringValue(footerText2);

    XWPFParagraph[] headerParas = { new XWPFParagraph(headerCtp, doc) };
    XWPFParagraph[] footerParas = {
        new XWPFParagraph(footerCtp1, doc),
        new XWPFParagraph(footerCtp2, doc)
    };

    // Default header/footer get content, first-page ones are created empty
    XWPFHeader defaultHeader = hfPolicy.createHeader(XWPFHeaderFooterPolicy.DEFAULT, headerParas);
    XWPFHeader firstHeader = hfPolicy.createHeader(XWPFHeaderFooterPolicy.FIRST);
    XWPFFooter defaultFooter = hfPolicy.createFooter(XWPFHeaderFooterPolicy.DEFAULT, footerParas);
    XWPFFooter firstFooter = hfPolicy.createFooter(XWPFHeaderFooterPolicy.FIRST);

    // All four must now be reachable through the policy
    assertNotNull(hfPolicy.getDefaultHeader());
    assertNotNull(hfPolicy.getFirstPageHeader());
    assertNotNull(hfPolicy.getDefaultFooter());
    assertNotNull(hfPolicy.getFirstPageFooter());

    // The returned footer objects carry the expected number of paragraphs
    assertEquals(2, defaultFooter.getParagraphs().size());
    assertEquals(0, firstFooter.getParagraphs().size());

    // Likewise for the returned header objects
    assertEquals(1, defaultHeader.getParagraphs().size());
    assertEquals(headerText, defaultHeader.getParagraphs().get(0).getText());
    assertEquals(0, firstHeader.getParagraphs().size());

    // Fetch the default footer again through the policy and verify both
    // paragraphs hold the expected text
    defaultFooter = hfPolicy.getDefaultFooter();
    assertEquals(2, defaultFooter.getParagraphs().size());
    assertEquals(footerText1, defaultFooter.getParagraphs().get(0).getText());
    assertEquals(footerText2, defaultFooter.getParagraphs().get(1).getText());

    // Put two paragraphs of text into the so-far empty first-page header
    String firstText1 = "New Text!";
    String firstText2 = "More Text!";
    firstHeader.createParagraph().insertNewRun(0).setText(firstText1);
    firstHeader.createParagraph().insertNewRun(0).setText(firstText2);

    assertEquals(headerText, defaultHeader.getParagraphs().get(0).getText());
    assertEquals(firstText1, firstHeader.getParagraphs().get(0).getText());
    assertEquals(firstText2, firstHeader.getParagraphs().get(1).getText());

    // Round-trip the document and make sure everything survived
    XWPFDocument roundTripped = XWPFTestDataSamples.writeOutAndReadBack(doc);
    hfPolicy = roundTripped.getHeaderFooterPolicy();
    assertNotNull(hfPolicy.getDefaultHeader());
    assertNotNull(hfPolicy.getFirstPageHeader());
    assertNull(hfPolicy.getEvenPageHeader());
    assertNotNull(hfPolicy.getDefaultFooter());
    assertNotNull(hfPolicy.getFirstPageFooter());
    assertNull(hfPolicy.getEvenPageFooter());

    // Header text after the round trip
    defaultHeader = hfPolicy.getDefaultHeader();
    firstHeader = hfPolicy.getFirstPageHeader();
    assertEquals(headerText, defaultHeader.getParagraphs().get(0).getText());
    assertEquals(firstText1, firstHeader.getParagraphs().get(0).getText());
    assertEquals(firstText2, firstHeader.getParagraphs().get(1).getText());

    // Footer text after the round trip
    defaultFooter = hfPolicy.getDefaultFooter();
    firstFooter = hfPolicy.getFirstPageFooter();
    assertEquals(2, defaultFooter.getParagraphs().size());
    assertEquals(footerText1, defaultFooter.getParagraphs().get(0).getText());
    assertEquals(footerText2, defaultFooter.getParagraphs().get(1).getText());
    assertEquals(1, firstFooter.getParagraphs().size());
}
Also used : CTR(org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR) CTText(org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText) XWPFHeaderFooterPolicy(org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy) CTP(org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP) Test(org.junit.Test)

Example 15 with XWPFHeaderFooterPolicy

use of org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy in project poi by apache.

the class TestXWPFHeader method testSimpleHeader.

/**
 * Opens the "headerFooter.docx" sample and checks that it has both a
 * default header and a default footer.
 */
@Test
public void testSimpleHeader() throws IOException {
    XWPFDocument doc = XWPFTestDataSamples.openSampleDocument("headerFooter.docx");
    XWPFHeaderFooterPolicy hfPolicy = doc.getHeaderFooterPolicy();

    XWPFHeader defaultHeader = hfPolicy.getDefaultHeader();
    XWPFFooter defaultFooter = hfPolicy.getDefaultFooter();

    // Both parts must be present in this sample
    assertNotNull(defaultHeader);
    assertNotNull(defaultFooter);
}
Also used : XWPFHeaderFooterPolicy(org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy) Test(org.junit.Test)

Aggregations

XWPFHeaderFooterPolicy (org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy): 17, Test (org.junit.Test): 5, XWPFParagraph (org.apache.poi.xwpf.usermodel.XWPFParagraph): 3, XWPFRun (org.apache.poi.xwpf.usermodel.XWPFRun): 3, File (java.io.File): 2, FileOutputStream (java.io.FileOutputStream): 2, XWPFCommentsDecorator (org.apache.poi.xwpf.model.XWPFCommentsDecorator): 2, IRunElement (org.apache.poi.xwpf.usermodel.IRunElement): 2, XWPFDocument (org.apache.poi.xwpf.usermodel.XWPFDocument): 2, XWPFHyperlink (org.apache.poi.xwpf.usermodel.XWPFHyperlink): 2, XWPFHyperlinkRun (org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun): 2, XmlCursor (org.apache.xmlbeans.XmlCursor): 2, XmlObject (org.apache.xmlbeans.XmlObject): 2, CTP (org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP): 2, CTSectPr (org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr): 2, CTText (org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText): 2, ByteArrayInputStream (java.io.ByteArrayInputStream): 1, IOException (java.io.IOException): 1, OutputStream (java.io.OutputStream): 1, QName (javax.xml.namespace.QName): 1