Search in sources :

Example 71 with XmlObject

use of org.apache.xmlbeans.XmlObject in project poi by apache.

the class XWPFParagraph method searchText.

// TODO Add methods to allow adding a HyperlinkRun or a FieldRun
/**
 * Searches the text elements of this paragraph's runs for the first
 * occurrence of {@code searched}, beginning at the position described by
 * {@code startPos}. A match may span several text elements and several runs.
 * <p>
 * Side effect: every {@code CTProofErr} child met while scanning a run is
 * removed from the underlying XML. Any other non-text child except the run
 * properties ({@code CTRPr}) discards a partially matched prefix.
 *
 * @param searched the string to look for; {@code null} or empty yields {@code null}
 * @param startPos the run / text / character indices at which scanning starts
 * @return a {@link TextSegement} holding the begin and end (run, text, char)
 *         indices of the match, or {@code null} if no match was found
 */
public TextSegement searchText(String searched, PositionInParagraph startPos) {
    int startRun = startPos.getRun(), startText = startPos.getText(), startChar = startPos.getChar();
    // Nothing can match an empty pattern; also avoids searched.charAt(0) throwing below.
    if (searched == null || searched.isEmpty()) {
        return null;
    }
    // The begin* positions describe where the current candidate match started.
    // They must outlive a single run iteration, otherwise a match that starts
    // in one run and ends in a later one would report begin text/char as 0.
    int beginRunPos = 0, beginTextPos = 0, beginCharPos = 0, candCharPos = 0;
    boolean newList = false;
    CTR[] rArray = paragraph.getRArray();
    for (int runPos = startRun; runPos < rArray.length; runPos++) {
        int textPos = 0, charPos = 0;
        CTR ctRun = rArray[runPos];
        XmlCursor c = ctRun.newCursor();
        try {
            c.selectPath("./*");
            while (c.toNextSelection()) {
                XmlObject o = c.getObject();
                if (o instanceof CTText) {
                    // NOTE(review): startText is applied to every run and startChar to
                    // every text of the start run, not only to the start text itself.
                    // Kept as in the original; confirm whether that is intended.
                    if (textPos >= startText) {
                        String candidate = ((CTText) o).getStringValue();
                        if (runPos == startRun) {
                            charPos = startChar;
                        } else {
                            charPos = 0;
                        }
                        for (; charPos < candidate.length(); charPos++) {
                            // No candidate in progress and the first pattern char matches:
                            // remember this position as the start of a new candidate.
                            if ((candidate.charAt(charPos) == searched.charAt(0)) && (candCharPos == 0)) {
                                beginTextPos = textPos;
                                beginCharPos = charPos;
                                beginRunPos = runPos;
                                newList = true;
                            }
                            if (candidate.charAt(charPos) == searched.charAt(candCharPos)) {
                                if (candCharPos + 1 < searched.length()) {
                                    candCharPos++;
                                } else if (newList) {
                                    // Last pattern character matched: report the segment.
                                    TextSegement segement = new TextSegement();
                                    segement.setBeginRun(beginRunPos);
                                    segement.setBeginText(beginTextPos);
                                    segement.setBeginChar(beginCharPos);
                                    segement.setEndRun(runPos);
                                    segement.setEndText(textPos);
                                    segement.setEndChar(charPos);
                                    return segement;
                                }
                            } else {
                                // NOTE(review): the mismatching character is not re-tested
                                // against the first pattern character, so overlapping
                                // prefixes (e.g. "ab" in "aab") are missed.
                                candCharPos = 0;
                            }
                        }
                    }
                    textPos++;
                } else if (o instanceof CTProofErr) {
                    c.removeXml();
                } else if (o instanceof CTRPr) {
                    // run properties carry no text and do not interrupt a match
                } else {
                    // any other element breaks a candidate match
                    candCharPos = 0;
                }
            }
        } finally {
            c.dispose();
        }
    }
    return null;
}
Also used : XmlObject(org.apache.xmlbeans.XmlObject) XmlCursor(org.apache.xmlbeans.XmlCursor)

Example 72 with XmlObject

use of org.apache.xmlbeans.XmlObject in project poi by apache.

the class XWPFParagraph method buildRunsInOrderFromXml.

/**
 * Identifies (in document order) the children of the given paragraph or
 * sub-paragraph element that correspond to character text runs, and
 * builds the appropriate run objects for them.
 * <p>
 * Plain, hyperlink, simple-field and tracked-change runs are appended to
 * both {@code runs} and {@code iruns}; structured document tags are
 * appended to {@code iruns} only. Smart tag runs are descended into
 * recursively.
 *
 * @param object the XML element whose direct children are inspected
 */
@SuppressWarnings("deprecation")
private void buildRunsInOrderFromXml(XmlObject object) {
    XmlCursor c = object.newCursor();
    // Dispose the cursor even if building one of the runs throws.
    try {
        c.selectPath("child::*");
        while (c.toNextSelection()) {
            XmlObject o = c.getObject();
            if (o instanceof CTR) {
                XWPFRun r = new XWPFRun((CTR) o, this);
                runs.add(r);
                iruns.add(r);
            }
            if (o instanceof CTHyperlink) {
                CTHyperlink link = (CTHyperlink) o;
                for (CTR r : link.getRArray()) {
                    XWPFHyperlinkRun hr = new XWPFHyperlinkRun(link, r, this);
                    runs.add(hr);
                    iruns.add(hr);
                }
            }
            if (o instanceof CTSimpleField) {
                CTSimpleField field = (CTSimpleField) o;
                for (CTR r : field.getRArray()) {
                    XWPFFieldRun fr = new XWPFFieldRun(field, r, this);
                    runs.add(fr);
                    iruns.add(fr);
                }
            }
            if (o instanceof CTSdtBlock) {
                XWPFSDT cc = new XWPFSDT((CTSdtBlock) o, part);
                iruns.add(cc);
            }
            if (o instanceof CTSdtRun) {
                XWPFSDT cc = new XWPFSDT((CTSdtRun) o, part);
                iruns.add(cc);
            }
            if (o instanceof CTRunTrackChange) {
                for (CTR r : ((CTRunTrackChange) o).getRArray()) {
                    XWPFRun cr = new XWPFRun(r, this);
                    runs.add(cr);
                    iruns.add(cr);
                }
            }
            if (o instanceof CTSmartTagRun) {
                // Smart Tags can be nested many times.
                // This implementation does not preserve the tagging information
                buildRunsInOrderFromXml(o);
            }
        }
    } finally {
        c.dispose();
    }
}
Also used : XmlCursor(org.apache.xmlbeans.XmlCursor) XmlObject(org.apache.xmlbeans.XmlObject)

Example 73 with XmlObject

use of org.apache.xmlbeans.XmlObject in project poi by apache.

the class XWPFHeader method onDocumentRead.

/**
 * Reads the header part: parses its XML into {@code headerFooter} and
 * registers every top-level paragraph, table and block-level structured
 * document tag in the corresponding lists.
 *
 * @throws IOException if the superclass read, opening, parsing or closing
 *         the package part's input stream fails
 * @throws POIXMLException if the part's content is not valid XML
 */
@Override
protected void onDocumentRead() throws IOException {
    super.onDocumentRead();
    InputStream is = null;
    try {
        is = getPackagePart().getInputStream();
        HdrDocument hdrDocument = HdrDocument.Factory.parse(is, DEFAULT_XML_OPTIONS);
        headerFooter = hdrDocument.getHdr();
        // parse the document with cursor and add
        // the XmlObject to its lists
        XmlCursor cursor = headerFooter.newCursor();
        // Dispose the cursor even if constructing a body element throws.
        try {
            cursor.selectPath("./*");
            while (cursor.toNextSelection()) {
                XmlObject o = cursor.getObject();
                if (o instanceof CTP) {
                    XWPFParagraph p = new XWPFParagraph((CTP) o, this);
                    paragraphs.add(p);
                    bodyElements.add(p);
                }
                if (o instanceof CTTbl) {
                    XWPFTable t = new XWPFTable((CTTbl) o, this);
                    tables.add(t);
                    bodyElements.add(t);
                }
                if (o instanceof CTSdtBlock) {
                    XWPFSDT c = new XWPFSDT((CTSdtBlock) o, this);
                    bodyElements.add(c);
                }
            }
        } finally {
            cursor.dispose();
        }
    } catch (XmlException e) {
        throw new POIXMLException(e);
    } finally {
        if (is != null) {
            is.close();
        }
    }
}
Also used : HdrDocument(org.openxmlformats.schemas.wordprocessingml.x2006.main.HdrDocument) InputStream(java.io.InputStream) XmlException(org.apache.xmlbeans.XmlException) XmlObject(org.apache.xmlbeans.XmlObject) CTTbl(org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl) POIXMLException(org.apache.poi.POIXMLException) CTP(org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP) XmlCursor(org.apache.xmlbeans.XmlCursor) CTSdtBlock(org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSdtBlock)

Example 74 with XmlObject

use of org.apache.xmlbeans.XmlObject in project tika by apache.

the class XWPFWordExtractorDecorator method extractParagraph.

/**
 * Writes one Word paragraph to the XHTML output.
 * <p>
 * In order: headers of a section introduced by this paragraph (if any),
 * the opening tag (with a style class when the paragraph style maps to one),
 * the list number, placeholder {@code div}s for embedded objects, anchors for
 * bookmarks, the runs (with hyperlinks and inline formatting), comment text,
 * footnote text, paragraphs nested in text boxes (when shape-based content is
 * enabled), the closing tag, and finally the section's footers.
 *
 * @param paragraph   the paragraph to extract
 * @param listManager passed through to the list-number and header/footer helpers
 * @param xhtml       the handler that receives the generated XHTML events
 * @throws SAXException if the handler rejects an event
 * @throws XmlException if a text-box paragraph cannot be re-parsed
 * @throws IOException  declared by the helpers this method calls
 */
private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager, XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException {
    // If this paragraph is actually a whole new section, then
    //  it could have its own headers and footers
    // Check and handle if so
    XWPFHeaderFooterPolicy headerFooterPolicy = null;
    if (paragraph.getCTP().getPPr() != null) {
        CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
        if (ctSectPr != null) {
            headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
            extractHeaders(xhtml, headerFooterPolicy, listManager);
        }
    }
    // Is this a paragraph, or a heading?
    String tag = "p";
    String styleClass = null;
    //TIKA-2144 check that styles is not null
    if (paragraph.getStyleID() != null && styles != null) {
        XWPFStyle style = styles.getStyle(paragraph.getStyleID());
        if (style != null && style.getName() != null) {
            TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(style.getName(), paragraph.getPartType() == BodyType.TABLECELL);
            tag = tas.getTag();
            styleClass = tas.getStyleClass();
        }
    }
    if (styleClass == null) {
        xhtml.startElement(tag);
    } else {
        xhtml.startElement(tag, "class", styleClass);
    }
    writeParagraphNumber(paragraph, listManager, xhtml);
    // Emit a placeholder div for every embedded object referenced from a run.
    // TODO: replace w/ XPath/XQuery:
    for (XWPFRun run : paragraph.getRuns()) {
        XmlCursor c = run.getCTR().newCursor();
        // Cursors are disposed in finally blocks because the SAX calls below may throw.
        try {
            c.selectPath("./*");
            while (c.toNextSelection()) {
                XmlObject o = c.getObject();
                if (o instanceof CTObject) {
                    XmlCursor c2 = o.newCursor();
                    try {
                        c2.selectPath("./*");
                        while (c2.toNextSelection()) {
                            XmlObject o2 = c2.getObject();
                            XmlObject embedAtt = o2.selectAttribute(new QName("Type"));
                            // Constant-first comparison: safe even if the node value is null.
                            if (embedAtt != null && "Embed".equals(embedAtt.getDomNode().getNodeValue())) {
                                // Type is "Embed"
                                XmlObject relIDAtt = o2.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
                                if (relIDAtt != null) {
                                    String relID = relIDAtt.getDomNode().getNodeValue();
                                    AttributesImpl attributes = new AttributesImpl();
                                    attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                                    attributes.addAttribute("", "id", "id", "CDATA", relID);
                                    xhtml.startElement("div", attributes);
                                    xhtml.endElement("div");
                                }
                            }
                        }
                    } finally {
                        c2.dispose();
                    }
                }
            }
        } finally {
            c.dispose();
        }
    }
    // Emit an empty named anchor for every bookmark start in the paragraph.
    // All anchors are written here, before the run content.
    for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); i++) {
        CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i);
        xhtml.startElement("a", "name", bookmark.getName());
        xhtml.endElement("a");
    }
    TmpFormatting fmtg = new TmpFormatting(false, false);
    //hyperlinks may or may not have hyperlink ids
    String lastHyperlinkId = null;
    boolean inHyperlink = false;
    // Do the iruns
    for (IRunElement run : paragraph.getIRuns()) {
        if (run instanceof XWPFHyperlinkRun) {
            XWPFHyperlinkRun hyperlinkRun = (XWPFHyperlinkRun) run;
            // A run without an id, or with a different id than the previous
            // hyperlink run, starts a new link.
            if (hyperlinkRun.getHyperlinkId() == null || !hyperlinkRun.getHyperlinkId().equals(lastHyperlinkId)) {
                if (inHyperlink) {
                    //close out the old one
                    xhtml.endElement("a");
                    inHyperlink = false;
                }
                lastHyperlinkId = hyperlinkRun.getHyperlinkId();
                fmtg = closeStyleTags(xhtml, fmtg);
                XWPFHyperlink link = hyperlinkRun.getHyperlink(document);
                if (link != null && link.getURL() != null) {
                    xhtml.startElement("a", "href", link.getURL());
                    inHyperlink = true;
                } else if (hyperlinkRun.getAnchor() != null && hyperlinkRun.getAnchor().length() > 0) {
                    xhtml.startElement("a", "href", "#" + hyperlinkRun.getAnchor());
                    inHyperlink = true;
                }
            }
        } else if (inHyperlink) {
            //if this isn't a hyperlink, but the last one was
            // Keep the returned formatting state, as the other call sites inside this loop do.
            fmtg = closeStyleTags(xhtml, fmtg);
            xhtml.endElement("a");
            lastHyperlinkId = null;
            inHyperlink = false;
        }
        if (run instanceof XWPFSDT) {
            fmtg = closeStyleTags(xhtml, fmtg);
            processSDTRun((XWPFSDT) run, xhtml);
            //for now, we're ignoring formatting in sdt
            //if you hit an sdt reset to false
            fmtg.setBold(false);
            fmtg.setItalic(false);
        } else {
            fmtg = processRun((XWPFRun) run, paragraph, xhtml, fmtg);
        }
    }
    closeStyleTags(xhtml, fmtg);
    if (inHyperlink) {
        xhtml.endElement("a");
    }
    // Now do any comments for the paragraph
    XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null);
    String commentText = comments.getCommentText();
    if (commentText != null && commentText.length() > 0) {
        xhtml.characters(commentText);
    }
    String footnameText = paragraph.getFootnoteText();
    if (footnameText != null && footnameText.length() > 0) {
        xhtml.characters(footnameText + "\n");
    }
    // Also extract any paragraphs embedded in text boxes:
    if (config.getIncludeShapeBasedContent()) {
        for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
            extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml);
        }
    }
    // Finish this paragraph
    xhtml.endElement(tag);
    if (headerFooterPolicy != null) {
        extractFooters(xhtml, headerFooterPolicy, listManager);
    }
}
Also used : XWPFHyperlink(org.apache.poi.xwpf.usermodel.XWPFHyperlink) XWPFParagraph(org.apache.poi.xwpf.usermodel.XWPFParagraph) XWPFCommentsDecorator(org.apache.poi.xwpf.model.XWPFCommentsDecorator) XWPFStyle(org.apache.poi.xwpf.usermodel.XWPFStyle) QName(javax.xml.namespace.QName) XWPFHyperlinkRun(org.apache.poi.xwpf.usermodel.XWPFHyperlinkRun) CTBookmark(org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark) XWPFHeaderFooterPolicy(org.apache.poi.xwpf.model.XWPFHeaderFooterPolicy) CTSectPr(org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr) XWPFSDT(org.apache.poi.xwpf.usermodel.XWPFSDT) XmlCursor(org.apache.xmlbeans.XmlCursor) CTObject(org.openxmlformats.schemas.wordprocessingml.x2006.main.CTObject) AttributesImpl(org.xml.sax.helpers.AttributesImpl) XWPFRun(org.apache.poi.xwpf.usermodel.XWPFRun) TagAndStyle(org.apache.tika.parser.microsoft.WordExtractor.TagAndStyle) XmlObject(org.apache.xmlbeans.XmlObject) IRunElement(org.apache.poi.xwpf.usermodel.IRunElement)

Example 75 with XmlObject

use of org.apache.xmlbeans.XmlObject in project tika by apache.

the class XSLFPowerPointExtractorDecorator method extractContent.

/**
 * Walks the given shapes and writes their content to the XHTML handler.
 * <ul>
 * <li>text shapes: one {@code p} per text paragraph, each run's raw text,
 *     wrapped in an {@code a} when the run has a hyperlink address that does
 *     not contain {@code #_ftn}; placeholder shapes are skipped on request</li>
 * <li>group shapes: handled recursively</li>
 * <li>tables: delegated to {@code extractTable}</li>
 * <li>graphic frames and pictures: an empty {@code div class="embedded"}
 *     whose id is the relationship id, prefixed with {@code slideDesc}
 *     when that is non-null</li>
 * </ul>
 *
 * @param shapes           the shapes to process
 * @param skipPlaceholders when true, text shapes with a placeholder type and
 *                         all pictures are ignored
 * @param xhtml            receiver of the generated XHTML events
 * @param slideDesc        optional prefix for embedded-resource ids, may be null
 * @throws SAXException if the handler rejects an event
 */
private void extractContent(List<? extends XSLFShape> shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml, String slideDesc) throws SAXException {
    for (XSLFShape sh : shapes) {
        if (sh instanceof XSLFTextShape) {
            XSLFTextShape textShape = (XSLFTextShape) sh;
            Placeholder placeholder = textShape.getTextType();
            if (placeholder != null && skipPlaceholders) {
                continue;
            }
            for (XSLFTextParagraph textParagraph : textShape.getTextParagraphs()) {
                xhtml.startElement("p");
                for (XSLFTextRun textRun : textParagraph.getTextRuns()) {
                    //TODO: add check for targetmode=external into POI
                    //then check to confirm that the urls are actually
                    //external and not footnote refs via the current hack
                    Hyperlink hyperlink = textRun.getHyperlink();
                    String address = (hyperlink == null) ? null : hyperlink.getAddress();
                    boolean linked = address != null && !address.contains("#_ftn");
                    if (linked) {
                        xhtml.startElement("a", "href", address);
                    }
                    xhtml.characters(textRun.getRawText());
                    if (linked) {
                        xhtml.endElement("a");
                    }
                }
                xhtml.endElement("p");
            }
            continue;
        }
        if (sh instanceof XSLFGroupShape) {
            // recurse into groups of shapes
            extractContent(((XSLFGroupShape) sh).getShapes(), skipPlaceholders, xhtml, slideDesc);
            continue;
        }
        if (sh instanceof XSLFTable) {
            //unlike tables in Word, ppt/x can't have recursive tables...I don't think
            extractTable((XSLFTable) sh, xhtml);
            continue;
        }
        if (sh instanceof XSLFGraphicFrame) {
            XmlObject[] oleObjects = ((XSLFGraphicFrame) sh).getXmlObject().selectPath("declare namespace p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj");
            if (oleObjects == null) {
                continue;
            }
            for (XmlObject oleObject : oleObjects) {
                XmlObject idAttribute = oleObject.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
                if (idAttribute == null) {
                    continue;
                }
                String rawId = idAttribute.getDomNode().getNodeValue();
                String embeddedId = (slideDesc != null) ? slideDesc + rawId : rawId;
                AttributesImpl divAttributes = new AttributesImpl();
                divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
                divAttributes.addAttribute("", "id", "id", "CDATA", embeddedId);
                xhtml.startElement("div", divAttributes);
                xhtml.endElement("div");
            }
            continue;
        }
        if (sh instanceof XSLFPictureShape) {
            if (skipPlaceholders || !(sh.getXmlObject() instanceof CTPicture)) {
                continue;
            }
            CTPicture picture = (CTPicture) sh.getXmlObject();
            if (picture.getBlipFill() == null || picture.getBlipFill().getBlip() == null) {
                continue;
            }
            String rawId = picture.getBlipFill().getBlip().getEmbed();
            if (rawId == null) {
                continue;
            }
            String embeddedId = (slideDesc != null) ? slideDesc + rawId : rawId;
            AttributesImpl divAttributes = new AttributesImpl();
            divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
            divAttributes.addAttribute("", "id", "id", "CDATA", embeddedId);
            xhtml.startElement("div", divAttributes);
            xhtml.endElement("div");
        }
    }
}
Also used : Placeholder(org.apache.poi.sl.usermodel.Placeholder) QName(javax.xml.namespace.QName) AttributesImpl(org.xml.sax.helpers.AttributesImpl) CTPicture(org.openxmlformats.schemas.presentationml.x2006.main.CTPicture) XmlObject(org.apache.xmlbeans.XmlObject) Hyperlink(org.apache.poi.common.usermodel.Hyperlink)

Aggregations

XmlObject (org.apache.xmlbeans.XmlObject)102 XmlCursor (org.apache.xmlbeans.XmlCursor)49 XmlException (org.apache.xmlbeans.XmlException)17 Test (org.junit.Test)14 CTTbl (org.openxmlformats.schemas.wordprocessingml.x2006.main.CTTbl)13 CTAxDataSource (org.openxmlformats.schemas.drawingml.x2006.chart.CTAxDataSource)12 CTNumDataSource (org.openxmlformats.schemas.drawingml.x2006.chart.CTNumDataSource)12 DefaultExchange (org.apache.camel.impl.DefaultExchange)10 ArrayList (java.util.ArrayList)9 DefaultCamelContext (org.apache.camel.impl.DefaultCamelContext)9 CTP (org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP)9 CTPicture (org.openxmlformats.schemas.presentationml.x2006.main.CTPicture)7 IOException (java.io.IOException)6 QName (javax.xml.namespace.QName)6 POIXMLException (org.apache.poi.POIXMLException)6 CTShapeProperties (org.openxmlformats.schemas.drawingml.x2006.main.CTShapeProperties)6 CTGroupShape (org.openxmlformats.schemas.presentationml.x2006.main.CTGroupShape)6 CTGraphicalObjectFrame (org.openxmlformats.schemas.presentationml.x2006.main.CTGraphicalObjectFrame)5 Node (org.w3c.dom.Node)5 CTConnector (org.openxmlformats.schemas.presentationml.x2006.main.CTConnector)4