use of org.apache.xmlbeans.XmlObject in project poi by apache.
the class XWPFParagraph method searchText.
// TODO Add methods to allow adding a HyperlinkRun or a FieldRun
/**
 * Searches the text of this paragraph for the first occurrence of
 * {@code searched}, beginning at the position given by {@code startPos}.
 * <p>
 * The runs of the paragraph are scanned in order. Inside every run the
 * direct child elements are visited; only {@code CTText} children are
 * compared against the searched string, so a match may span several text
 * elements and several runs. {@code CTProofErr} children are removed from
 * the XML while scanning, {@code CTRPr} children are ignored, and any other
 * child element discards the partial match collected so far.
 *
 * @param searched the text to look for; must not be empty because its first
 *                 character is read unconditionally
 * @param startPos run, text and character indices at which the search starts
 * @return a segment holding the begin and end (run, text, char) indices of
 *         the first match, the end indices pointing at the last matched
 *         character; {@code null} if no match is found
 */
public TextSegement searchText(String searched, PositionInParagraph startPos) {
    int startRun = startPos.getRun(), startText = startPos.getText(), startChar = startPos.getChar();
    // The begin markers must survive the transition from one run to the next,
    // otherwise a match that starts in run N and ends in run N+1 would report
    // its begin text/char as 0.
    int beginRunPos = 0, beginTextPos = 0, beginCharPos = 0;
    int candCharPos = 0;
    boolean newList = false;
    CTR[] rArray = paragraph.getRArray();
    for (int runPos = startRun; runPos < rArray.length; runPos++) {
        // The text/char offsets of startPos only make sense inside the start run.
        boolean inStartRun = runPos == startRun;
        int textPos = 0;
        CTR ctRun = rArray[runPos];
        XmlCursor c = ctRun.newCursor();
        try {
            c.selectPath("./*");
            while (c.toNextSelection()) {
                XmlObject o = c.getObject();
                if (o instanceof CTText) {
                    if (!inStartRun || textPos >= startText) {
                        String candidate = ((CTText) o).getStringValue();
                        // Only the very first text element that is searched is
                        // entered at startChar; every later one starts at 0.
                        int charPos = (inStartRun && textPos == startText) ? startChar : 0;
                        for (; charPos < candidate.length(); charPos++) {
                            char current = candidate.charAt(charPos);
                            if ((current == searched.charAt(0)) && (candCharPos == 0)) {
                                // possible start of a match: remember where it begins
                                beginTextPos = textPos;
                                beginCharPos = charPos;
                                beginRunPos = runPos;
                                newList = true;
                            }
                            if (current == searched.charAt(candCharPos)) {
                                if (candCharPos + 1 < searched.length()) {
                                    candCharPos++;
                                } else if (newList) {
                                    // last character matched: report the segment
                                    TextSegement segement = new TextSegement();
                                    segement.setBeginRun(beginRunPos);
                                    segement.setBeginText(beginTextPos);
                                    segement.setBeginChar(beginCharPos);
                                    segement.setEndRun(runPos);
                                    segement.setEndText(textPos);
                                    segement.setEndChar(charPos);
                                    return segement;
                                }
                            } else {
                                // NOTE(review): the scan resumes with the next character and
                                // never revisits consumed ones, so overlapping candidates
                                // (e.g. "ab" inside "aab") are not found. Left as is because
                                // a complete fix needs backtracking across runs.
                                candCharPos = 0;
                            }
                        }
                    }
                    textPos++;
                } else if (o instanceof CTProofErr) {
                    c.removeXml();
                } else if (o instanceof CTRPr) {
                    //do nothing
                } else {
                    // any other element interrupts the text flow
                    candCharPos = 0;
                }
            }
        } finally {
            c.dispose();
        }
    }
    return null;
}
use of org.apache.xmlbeans.XmlObject in project poi by apache.
the class XWPFParagraph method buildRunsInOrderFromXml.
/**
 * Identifies (in order) the parts of the paragraph /
 * sub-paragraph that correspond to character text
 * runs, and builds the appropriate runs for these.
 * <p>
 * Plain runs, hyperlink runs, simple-field runs and tracked-change runs are
 * added to both {@code runs} and {@code iruns}; structured document tags
 * are added to {@code iruns} only. Smart tag runs are descended into
 * recursively.
 *
 * @param object the XML element whose direct children are inspected
 */
@SuppressWarnings("deprecation")
private void buildRunsInOrderFromXml(XmlObject object) {
    XmlCursor c = object.newCursor();
    // dispose the cursor even when one of the constructors below throws
    try {
        c.selectPath("child::*");
        while (c.toNextSelection()) {
            XmlObject o = c.getObject();
            if (o instanceof CTR) {
                XWPFRun r = new XWPFRun((CTR) o, this);
                runs.add(r);
                iruns.add(r);
            }
            if (o instanceof CTHyperlink) {
                CTHyperlink link = (CTHyperlink) o;
                for (CTR r : link.getRArray()) {
                    XWPFHyperlinkRun hr = new XWPFHyperlinkRun(link, r, this);
                    runs.add(hr);
                    iruns.add(hr);
                }
            }
            if (o instanceof CTSimpleField) {
                CTSimpleField field = (CTSimpleField) o;
                for (CTR r : field.getRArray()) {
                    XWPFFieldRun fr = new XWPFFieldRun(field, r, this);
                    runs.add(fr);
                    iruns.add(fr);
                }
            }
            if (o instanceof CTSdtBlock) {
                XWPFSDT cc = new XWPFSDT((CTSdtBlock) o, part);
                iruns.add(cc);
            }
            if (o instanceof CTSdtRun) {
                XWPFSDT cc = new XWPFSDT((CTSdtRun) o, part);
                iruns.add(cc);
            }
            if (o instanceof CTRunTrackChange) {
                for (CTR r : ((CTRunTrackChange) o).getRArray()) {
                    XWPFRun cr = new XWPFRun(r, this);
                    runs.add(cr);
                    iruns.add(cr);
                }
            }
            if (o instanceof CTSmartTagRun) {
                // Smart Tags can be nested many times.
                // This implementation does not preserve the tagging information
                buildRunsInOrderFromXml(o);
            }
        }
    } finally {
        c.dispose();
    }
}
use of org.apache.xmlbeans.XmlObject in project poi by apache.
the class XWPFHeader method onDocumentRead.
/**
 * Reads the header part: parses the package part's stream into a
 * {@code HdrDocument}, stores its header element in {@code headerFooter}
 * and registers every direct child paragraph, table and block-level SDT in
 * the corresponding lists ({@code paragraphs}, {@code tables},
 * {@code bodyElements}).
 *
 * @throws IOException if reading or closing the part's input stream fails
 * @throws POIXMLException if the part's content cannot be parsed as XML
 */
@Override
protected void onDocumentRead() throws IOException {
    super.onDocumentRead();
    InputStream is = null;
    try {
        is = getPackagePart().getInputStream();
        HdrDocument hdrDocument = HdrDocument.Factory.parse(is, DEFAULT_XML_OPTIONS);
        headerFooter = hdrDocument.getHdr();
        // parse the document with cursor and add
        // the XmlObject to its lists
        XmlCursor cursor = headerFooter.newCursor();
        // release the cursor even if building a body element fails
        try {
            cursor.selectPath("./*");
            while (cursor.toNextSelection()) {
                XmlObject o = cursor.getObject();
                if (o instanceof CTP) {
                    XWPFParagraph p = new XWPFParagraph((CTP) o, this);
                    paragraphs.add(p);
                    bodyElements.add(p);
                }
                if (o instanceof CTTbl) {
                    XWPFTable t = new XWPFTable((CTTbl) o, this);
                    tables.add(t);
                    bodyElements.add(t);
                }
                if (o instanceof CTSdtBlock) {
                    XWPFSDT c = new XWPFSDT((CTSdtBlock) o, this);
                    bodyElements.add(c);
                }
            }
        } finally {
            cursor.dispose();
        }
    } catch (XmlException e) {
        throw new POIXMLException(e);
    } finally {
        if (is != null) {
            is.close();
        }
    }
}
use of org.apache.xmlbeans.XmlObject in project tika by apache.
the class XWPFWordExtractorDecorator method extractParagraph.
/**
 * Writes a single Word paragraph to the XHTML handler.
 * <p>
 * In order, this method:
 * <ol>
 * <li>emits the headers of a new section if the paragraph carries section
 * properties,</li>
 * <li>opens the paragraph element ({@code p}, or the tag/class derived from
 * the paragraph style),</li>
 * <li>writes the list number, placeholder {@code div}s for embedded objects
 * and anchors for bookmarks,</li>
 * <li>processes all run elements, wrapping hyperlink runs in {@code a}
 * elements,</li>
 * <li>appends comment and footnote text,</li>
 * <li>recurses into paragraphs found in text boxes when shape-based content
 * is enabled in the configuration,</li>
 * <li>closes the paragraph element and emits the section footers.</li>
 * </ol>
 *
 * @param paragraph   the paragraph to extract
 * @param listManager passed on to the numbering, header and footer helpers
 * @param xhtml       the handler receiving the output
 * @throws SAXException if the handler fails
 * @throws XmlException if a text-box paragraph cannot be re-parsed
 * @throws IOException  declared for the helpers this method delegates to
 */
private void extractParagraph(XWPFParagraph paragraph, XWPFListManager listManager, XHTMLContentHandler xhtml) throws SAXException, XmlException, IOException {
    // If this paragraph is actually a whole new section, then
    // it could have its own headers and footers
    // Check and handle if so
    XWPFHeaderFooterPolicy headerFooterPolicy = null;
    if (paragraph.getCTP().getPPr() != null) {
        CTSectPr ctSectPr = paragraph.getCTP().getPPr().getSectPr();
        if (ctSectPr != null) {
            headerFooterPolicy = new XWPFHeaderFooterPolicy(document, ctSectPr);
            extractHeaders(xhtml, headerFooterPolicy, listManager);
        }
    }
    // Is this a paragraph, or a heading?
    String tag = "p";
    String styleClass = null;
    //TIKA-2144 check that styles is not null
    if (paragraph.getStyleID() != null && styles != null) {
        XWPFStyle style = styles.getStyle(paragraph.getStyleID());
        if (style != null && style.getName() != null) {
            TagAndStyle tas = WordExtractor.buildParagraphTagAndStyle(style.getName(), paragraph.getPartType() == BodyType.TABLECELL);
            tag = tas.getTag();
            styleClass = tas.getStyleClass();
        }
    }
    if (styleClass == null) {
        xhtml.startElement(tag);
    } else {
        xhtml.startElement(tag, "class", styleClass);
    }
    writeParagraphNumber(paragraph, listManager, xhtml);
    // Output placeholders for embedded objects referenced from the runs
    // TODO: replace w/ XPath/XQuery:
    for (XWPFRun run : paragraph.getRuns()) {
        XmlCursor c = run.getCTR().newCursor();
        // the handler calls below may throw; make sure both cursors are released
        try {
            c.selectPath("./*");
            while (c.toNextSelection()) {
                XmlObject o = c.getObject();
                if (o instanceof CTObject) {
                    XmlCursor c2 = o.newCursor();
                    try {
                        c2.selectPath("./*");
                        while (c2.toNextSelection()) {
                            XmlObject o2 = c2.getObject();
                            XmlObject embedAtt = o2.selectAttribute(new QName("Type"));
                            // constant-first equals: tolerate a missing node value
                            if (embedAtt != null && "Embed".equals(embedAtt.getDomNode().getNodeValue())) {
                                // Type is "Embed"
                                XmlObject relIDAtt = o2.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
                                if (relIDAtt != null) {
                                    String relID = relIDAtt.getDomNode().getNodeValue();
                                    AttributesImpl attributes = new AttributesImpl();
                                    attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                                    attributes.addAttribute("", "id", "id", "CDATA", relID);
                                    xhtml.startElement("div", attributes);
                                    xhtml.endElement("div");
                                }
                            }
                        }
                    } finally {
                        c2.dispose();
                    }
                }
            }
        } finally {
            c.dispose();
        }
    }
    // Attach the bookmarks of the paragraph as named anchors
    // (they are not positioned inside the text, we just put them in the correct paragraph)
    for (int i = 0; i < paragraph.getCTP().sizeOfBookmarkStartArray(); i++) {
        CTBookmark bookmark = paragraph.getCTP().getBookmarkStartArray(i);
        xhtml.startElement("a", "name", bookmark.getName());
        xhtml.endElement("a");
    }
    TmpFormatting fmtg = new TmpFormatting(false, false);
    //hyperlinks may or may not have hyperlink ids
    String lastHyperlinkId = null;
    boolean inHyperlink = false;
    // Do the iruns
    for (IRunElement run : paragraph.getIRuns()) {
        if (run instanceof XWPFHyperlinkRun) {
            XWPFHyperlinkRun hyperlinkRun = (XWPFHyperlinkRun) run;
            // a run without id, or with an id different from the previous one, starts a new link
            if (hyperlinkRun.getHyperlinkId() == null || !hyperlinkRun.getHyperlinkId().equals(lastHyperlinkId)) {
                if (inHyperlink) {
                    //close out the old one
                    xhtml.endElement("a");
                    inHyperlink = false;
                }
                lastHyperlinkId = hyperlinkRun.getHyperlinkId();
                fmtg = closeStyleTags(xhtml, fmtg);
                XWPFHyperlink link = hyperlinkRun.getHyperlink(document);
                if (link != null && link.getURL() != null) {
                    xhtml.startElement("a", "href", link.getURL());
                    inHyperlink = true;
                } else if (hyperlinkRun.getAnchor() != null && hyperlinkRun.getAnchor().length() > 0) {
                    xhtml.startElement("a", "href", "#" + hyperlinkRun.getAnchor());
                    inHyperlink = true;
                }
            }
        } else if (inHyperlink) {
            //if this isn't a hyperlink, but the last one was
            closeStyleTags(xhtml, fmtg);
            xhtml.endElement("a");
            lastHyperlinkId = null;
            inHyperlink = false;
        }
        if (run instanceof XWPFSDT) {
            fmtg = closeStyleTags(xhtml, fmtg);
            processSDTRun((XWPFSDT) run, xhtml);
            //for now, we're ignoring formatting in sdt
            //if you hit an sdt reset to false
            fmtg.setBold(false);
            fmtg.setItalic(false);
        } else {
            fmtg = processRun((XWPFRun) run, paragraph, xhtml, fmtg);
        }
    }
    closeStyleTags(xhtml, fmtg);
    if (inHyperlink) {
        xhtml.endElement("a");
    }
    // Now do any comments for the paragraph
    XWPFCommentsDecorator comments = new XWPFCommentsDecorator(paragraph, null);
    String commentText = comments.getCommentText();
    if (commentText != null && commentText.length() > 0) {
        xhtml.characters(commentText);
    }
    String footnameText = paragraph.getFootnoteText();
    if (footnameText != null && footnameText.length() > 0) {
        xhtml.characters(footnameText + "\n");
    }
    // Also extract any paragraphs embedded in text boxes:
    if (config.getIncludeShapeBasedContent()) {
        for (XmlObject embeddedParagraph : paragraph.getCTP().selectPath("declare namespace w='http://schemas.openxmlformats.org/wordprocessingml/2006/main' declare namespace wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape' .//*/wps:txbx/w:txbxContent/w:p")) {
            extractParagraph(new XWPFParagraph(CTP.Factory.parse(embeddedParagraph.xmlText()), paragraph.getBody()), listManager, xhtml);
        }
    }
    // Finish this paragraph
    xhtml.endElement(tag);
    if (headerFooterPolicy != null) {
        extractFooters(xhtml, headerFooterPolicy, listManager);
    }
}
use of org.apache.xmlbeans.XmlObject in project tika by apache.
the class XSLFPowerPointExtractorDecorator method extractContent.
/**
 * Walks the given shapes and writes their content to the XHTML handler.
 * <ul>
 * <li>Text shapes: one {@code p} per text paragraph; a run with a hyperlink
 * whose address does not contain {@code #_ftn} is wrapped in an {@code a}
 * element. Text shapes with a placeholder type are skipped when
 * {@code skipPlaceholders} is set.</li>
 * <li>Group shapes: handled recursively.</li>
 * <li>Tables: delegated to {@code extractTable}.</li>
 * <li>Graphic frames: an empty {@code div class="embedded"} for every OLE
 * object that carries a relationship id.</li>
 * <li>Pictures: an empty {@code div class="embedded"} for the embedded blip,
 * unless {@code skipPlaceholders} is set.</li>
 * </ul>
 *
 * @param shapes           the shapes to process
 * @param skipPlaceholders whether placeholder text shapes and pictures are skipped
 * @param xhtml            the handler receiving the output
 * @param slideDesc        prefix for the ids of embedded resources, may be {@code null}
 * @throws SAXException if the handler fails
 */
private void extractContent(List<? extends XSLFShape> shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml, String slideDesc) throws SAXException {
    for (XSLFShape sh : shapes) {
        if (sh instanceof XSLFTextShape) {
            XSLFTextShape textShape = (XSLFTextShape) sh;
            Placeholder placeholder = textShape.getTextType();
            if (skipPlaceholders && placeholder != null) {
                continue;
            }
            for (XSLFTextParagraph textParagraph : textShape.getTextParagraphs()) {
                xhtml.startElement("p");
                for (XSLFTextRun textRun : textParagraph.getTextRuns()) {
                    //TODO: add check for targetmode=external into POI
                    //then check to confirm that the urls are actually
                    //external and not footnote refs via the current hack
                    Hyperlink link = textRun.getHyperlink();
                    boolean linkOpened = false;
                    if (link != null && link.getAddress() != null && !link.getAddress().contains("#_ftn")) {
                        xhtml.startElement("a", "href", link.getAddress());
                        linkOpened = true;
                    }
                    xhtml.characters(textRun.getRawText());
                    if (linkOpened) {
                        xhtml.endElement("a");
                    }
                }
                xhtml.endElement("p");
            }
        } else if (sh instanceof XSLFGroupShape) {
            // recurse into groups of shapes
            extractContent(((XSLFGroupShape) sh).getShapes(), skipPlaceholders, xhtml, slideDesc);
        } else if (sh instanceof XSLFTable) {
            //unlike tables in Word, ppt/x can't have recursive tables...I don't think
            extractTable((XSLFTable) sh, xhtml);
        } else if (sh instanceof XSLFGraphicFrame) {
            XmlObject[] oleObjects = ((XSLFGraphicFrame) sh).getXmlObject().selectPath("declare namespace p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj");
            if (oleObjects == null) {
                continue;
            }
            for (XmlObject oleObject : oleObjects) {
                XmlObject idAttribute = oleObject.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
                if (idAttribute == null) {
                    continue;
                }
                String rawId = idAttribute.getDomNode().getNodeValue();
                String embeddedId = (slideDesc != null) ? slideDesc + rawId : rawId;
                AttributesImpl divAttributes = new AttributesImpl();
                divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
                divAttributes.addAttribute("", "id", "id", "CDATA", embeddedId);
                xhtml.startElement("div", divAttributes);
                xhtml.endElement("div");
            }
        } else if (sh instanceof XSLFPictureShape) {
            if (skipPlaceholders || !(sh.getXmlObject() instanceof CTPicture)) {
                continue;
            }
            CTPicture picture = (CTPicture) sh.getXmlObject();
            if (picture.getBlipFill() == null || picture.getBlipFill().getBlip() == null) {
                continue;
            }
            String rawId = picture.getBlipFill().getBlip().getEmbed();
            if (rawId == null) {
                continue;
            }
            String embeddedId = (slideDesc != null) ? slideDesc + rawId : rawId;
            AttributesImpl divAttributes = new AttributesImpl();
            divAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
            divAttributes.addAttribute("", "id", "id", "CDATA", embeddedId);
            xhtml.startElement("div", divAttributes);
            xhtml.endElement("div");
        }
    }
}
Aggregations