Use of org.apache.poi.common.usermodel.Hyperlink in project tika by apache.
In the class HSLFExtractor, method textRunsToText:
/**
 * Writes the given groups of text paragraphs to the XHTML handler.
 *
 * <p>Consecutive bulleted paragraphs are wrapped in a single {@code <ul>}; each
 * paragraph becomes an {@code <li>} (bulleted and non-empty) or a {@code <p>}.
 * Text runs carrying a hyperlink are wrapped in {@code <a href="...">}, and a
 * vertical tab (U+000B) inside a run is rendered as {@code <br>}.
 *
 * @param xhtml          handler that receives the generated elements and text
 * @param paragraphsList groups of paragraphs to write; {@code null} is a no-op
 * @throws SAXException if the handler rejects an event
 */
private void textRunsToText(XHTMLContentHandler xhtml, List<List<HSLFTextParagraph>> paragraphsList) throws SAXException {
    if (paragraphsList == null) {
        return;
    }
    for (List<HSLFTextParagraph> run : paragraphsList) {
        // Leaving in wisdom from TIKA-712 for easy revert.
        // Avoid boiler-plate text on the master slide (0
        // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
        //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
        boolean isBullet = false;
        for (HSLFTextParagraph htp : run) {
            boolean nextBullet = htp.isBullet();
            // TODO: identify bullet/list type
            // Open or close the <ul> only when the bullet state flips between paragraphs.
            if (isBullet != nextBullet) {
                isBullet = nextBullet;
                if (isBullet) {
                    xhtml.startElement("ul");
                } else {
                    xhtml.endElement("ul");
                }
            }
            List<HSLFTextRun> textRuns = htp.getTextRuns();
            // Defensive: a paragraph without any text run is treated as having an
            // empty first line instead of failing on get(0).
            String firstLine = textRuns.isEmpty() ? "" : removePBreak(textRuns.get(0).getRawText());
            // An empty bulleted paragraph is written as <p> rather than an empty <li>.
            boolean showBullet = (isBullet && (textRuns.size() > 1 || !"".equals(firstLine)));
            String paraTag = showBullet ? "li" : "p";
            xhtml.startElement(paraTag);
            for (HSLFTextRun htr : textRuns) {
                boolean runIsHyperLink = false;
                Hyperlink link = htr.getHyperlink();
                if (link != null) {
                    String address = link.getAddress();
                    // Addresses starting with "_ftn" are not emitted as links
                    // (presumably internal footnote references -- confirm).
                    if (address != null && !address.startsWith("_ftn")) {
                        xhtml.startElement("a", "href", address);
                        runIsHyperLink = true;
                    }
                }
                String line = htr.getRawText();
                if (line != null) {
                    boolean isfirst = true;
                    for (String fragment : line.split("\\u000b")) {
                        if (!isfirst) {
                            xhtml.startElement("br");
                            xhtml.endElement("br");
                        }
                        isfirst = false;
                        xhtml.characters(removePBreak(fragment));
                    }
                    // String.split drops trailing empty strings, so a vertical tab at
                    // the very end of the run needs its <br> emitted explicitly.
                    // (Previously this tested endsWith(""), which is always true and
                    // appended a <br> after every run.)
                    if (line.endsWith("\u000b")) {
                        xhtml.startElement("br");
                        xhtml.endElement("br");
                    }
                }
                if (runIsHyperLink) {
                    xhtml.endElement("a");
                }
            }
            xhtml.endElement(paraTag);
        }
        // Close a list that is still open when the group ends.
        if (isBullet) {
            xhtml.endElement("ul");
        }
    }
}
Use of org.apache.poi.common.usermodel.Hyperlink in project OpenRefine by OpenRefine.
In the class ExcelImporter, method extractCell:
/**
 * Converts a POI spreadsheet cell into a {@code Cell}, attaching a {@code Recon}
 * when the cell's hyperlink is an http(s) URL containing {@code freebase.com/view}.
 *
 * <p>The text following that marker (cut at the first {@code ?} and then the first
 * {@code #}, when either appears past position 0) is used as the key into
 * {@code reconMap}. An existing entry has its {@code judgmentBatchSize} incremented
 * and is reused; otherwise a new matched {@code Recon} is created and stored.
 *
 * @param cell     the POI cell to convert
 * @param reconMap recon records keyed by extracted id; may be updated by this call
 * @return the converted cell, or {@code null} when the cell yields no value
 */
protected static Cell extractCell(org.apache.poi.ss.usermodel.Cell cell, Map<String, Recon> reconMap) {
    final Serializable cellValue = extractCell(cell);
    if (cellValue == null) {
        return null;
    }

    Recon cellRecon = null;

    // Resolve the hyperlink target, if any, and check that it is a web URL.
    final Hyperlink link = cell.getHyperlink();
    final String target = (link == null) ? null : link.getAddress();
    final boolean isWebLink = target != null
            && (target.startsWith("http://") || target.startsWith("https://"));

    final String marker = "freebase.com/view";
    final int markerAt = isWebLink ? target.indexOf(marker) : -1;
    if (markerAt > 0) {
        // Everything after the marker, minus any query string and fragment.
        String reconKey = target.substring(markerAt + marker.length());
        final int queryAt = reconKey.indexOf('?');
        if (queryAt > 0) {
            reconKey = reconKey.substring(0, queryAt);
        }
        final int fragmentAt = reconKey.indexOf('#');
        if (fragmentAt > 0) {
            reconKey = reconKey.substring(0, fragmentAt);
        }

        if (!reconMap.containsKey(reconKey)) {
            // First time this id is seen: build an auto-matched record for it.
            final Recon created = new Recon(0, null, null);
            final ReconCandidate candidate =
                    new ReconCandidate(reconKey, cellValue.toString(), new String[0], 100);
            created.service = "import";
            created.match = candidate;
            created.matchRank = 0;
            created.judgment = Judgment.Matched;
            created.judgmentAction = "auto";
            created.judgmentBatchSize = 1;
            created.addCandidate(candidate);
            reconMap.put(reconKey, created);
            cellRecon = created;
        } else {
            // Same id seen before: share the record and count this cell in its batch.
            cellRecon = reconMap.get(reconKey);
            cellRecon.judgmentBatchSize++;
        }
    }

    return new Cell(cellValue, cellRecon);
}
Use of org.apache.poi.common.usermodel.Hyperlink in project tika by apache.
In the class XSLFPowerPointExtractorDecorator, method extractContent:
/**
 * Writes the content of the given shapes to the XHTML handler.
 *
 * <ul>
 *   <li>Text shapes: one {@code <p>} per paragraph, with hyperlinked runs wrapped
 *       in {@code <a href="...">}.</li>
 *   <li>Group shapes: processed recursively.</li>
 *   <li>Tables: delegated to {@code extractTable}.</li>
 *   <li>Graphic frames containing {@code p:oleObj} elements, and picture shapes
 *       with an embedded blip: an empty {@code <div class="embedded" id="...">}
 *       per relationship id.</li>
 * </ul>
 *
 * @param shapes           shapes to process
 * @param skipPlaceholders when {@code true}, text shapes that report a placeholder
 *                         type and all picture shapes are skipped
 * @param xhtml            handler that receives the generated elements and text
 * @param slideDesc        optional prefix prepended to relationship ids; may be {@code null}
 * @throws SAXException if the handler rejects an event
 */
private void extractContent(List<? extends XSLFShape> shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml, String slideDesc) throws SAXException {
    for (XSLFShape shape : shapes) {
        if (shape instanceof XSLFTextShape) {
            XSLFTextShape textShape = (XSLFTextShape) shape;
            Placeholder placeholder = textShape.getTextType();
            if (placeholder != null && skipPlaceholders) {
                continue;
            }
            for (XSLFTextParagraph paragraph : textShape.getTextParagraphs()) {
                xhtml.startElement("p");
                for (XSLFTextRun textRun : paragraph.getTextRuns()) {
                    //TODO: add check for targetmode=external into POI
                    //then check to confirm that the urls are actually
                    //external and not footnote refs via the current hack
                    Hyperlink link = textRun.getHyperlink();
                    String href = (link == null) ? null : link.getAddress();
                    boolean linked = href != null && !href.contains("#_ftn");
                    if (linked) {
                        xhtml.startElement("a", "href", href);
                    }
                    xhtml.characters(textRun.getRawText());
                    if (linked) {
                        xhtml.endElement("a");
                    }
                }
                xhtml.endElement("p");
            }
        } else if (shape instanceof XSLFGroupShape) {
            // recurse into groups of shapes
            extractContent(((XSLFGroupShape) shape).getShapes(), skipPlaceholders, xhtml, slideDesc);
        } else if (shape instanceof XSLFTable) {
            //unlike tables in Word, ppt/x can't have recursive tables...I don't think
            // NOTE: this check must stay ahead of the XSLFGraphicFrame branch below.
            extractTable((XSLFTable) shape, xhtml);
        } else if (shape instanceof XSLFGraphicFrame) {
            XSLFGraphicFrame frame = (XSLFGraphicFrame) shape;
            XmlObject[] oleObjects = frame.getXmlObject().selectPath("declare namespace p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj");
            if (oleObjects != null) {
                for (XmlObject oleObject : oleObjects) {
                    XmlObject relIdAttr = oleObject.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
                    if (relIdAttr == null) {
                        continue;
                    }
                    writeEmbeddedMarker(xhtml, slideDesc, relIdAttr.getDomNode().getNodeValue());
                }
            }
        } else if (shape instanceof XSLFPictureShape) {
            if (skipPlaceholders || !(shape.getXmlObject() instanceof CTPicture)) {
                continue;
            }
            CTPicture picture = (CTPicture) shape.getXmlObject();
            if (picture.getBlipFill() != null && picture.getBlipFill().getBlip() != null) {
                String embedId = picture.getBlipFill().getBlip().getEmbed();
                if (embedId != null) {
                    writeEmbeddedMarker(xhtml, slideDesc, embedId);
                }
            }
        }
    }
}

/**
 * Emits an empty {@code <div class="embedded" id="...">} for an embedded resource.
 * The id is {@code relID}, prefixed with {@code slideDesc} when that is non-null.
 */
private void writeEmbeddedMarker(XHTMLContentHandler xhtml, String slideDesc, String relID) throws SAXException {
    String markerId = (slideDesc != null) ? slideDesc + relID : relID;
    AttributesImpl attributes = new AttributesImpl();
    attributes.addAttribute("", "class", "class", "CDATA", "embedded");
    attributes.addAttribute("", "id", "id", "CDATA", markerId);
    xhtml.startElement("div", attributes);
    xhtml.endElement("div");
}
Aggregations