Search in sources :

Example 1 with Hyperlink

use of org.apache.poi.common.usermodel.Hyperlink in project tika by apache.

the class HSLFExtractor method textRunsToText.

/**
 * Writes the text of each group of slide paragraphs to the XHTML handler.
 * <p>
 * Consecutive bulleted paragraphs are wrapped in a {@code ul} element. Each paragraph is
 * emitted as {@code li} when it is a bullet with visible content, otherwise as {@code p}.
 * A run whose hyperlink address is non-null and does not start with {@code _ftn} is wrapped
 * in an {@code a} element. Vertical-tab characters (U+000B) inside a run are emitted as
 * {@code br} elements.
 *
 * @param xhtml          handler that receives the generated elements and text
 * @param paragraphsList groups of paragraphs to emit; {@code null} results in no output
 * @throws SAXException if the handler fails while emitting an event
 */
private void textRunsToText(XHTMLContentHandler xhtml, List<List<HSLFTextParagraph>> paragraphsList) throws SAXException {
    if (paragraphsList == null) {
        return;
    }
    for (List<HSLFTextParagraph> run : paragraphsList) {
        // Leaving in wisdom from TIKA-712 for easy revert.
        // Avoid boiler-plate text on the master slide (0
        // = TextHeaderAtom.TITLE_TYPE, 1 = TextHeaderAtom.BODY_TYPE):
        //if (!isMaster || (run.getRunType() != 0 && run.getRunType() != 1)) {
        boolean isBullet = false;
        for (HSLFTextParagraph htp : run) {
            boolean nextBullet = htp.isBullet();
            // TODO: identify bullet/list type
            // Open or close the surrounding list only when the bullet state flips.
            if (isBullet != nextBullet) {
                isBullet = nextBullet;
                if (isBullet) {
                    xhtml.startElement("ul");
                } else {
                    xhtml.endElement("ul");
                }
            }
            List<HSLFTextRun> textRuns = htp.getTextRuns();
            // A paragraph without any runs has no first line; treat it as empty
            // instead of letting get(0) throw IndexOutOfBoundsException.
            String firstLine = textRuns.isEmpty() ? "" : removePBreak(textRuns.get(0).getRawText());
            // An empty single-run bullet paragraph is rendered as a plain paragraph.
            boolean showBullet = (isBullet && (textRuns.size() > 1 || !"".equals(firstLine)));
            String paraTag = showBullet ? "li" : "p";
            xhtml.startElement(paraTag);
            for (HSLFTextRun htr : textRuns) {
                boolean runIsHyperLink = false;
                Hyperlink link = htr.getHyperlink();
                if (link != null) {
                    String address = link.getAddress();
                    // Addresses starting with "_ftn" are skipped (not emitted as links).
                    if (address != null && !address.startsWith("_ftn")) {
                        xhtml.startElement("a", "href", address);
                        runIsHyperLink = true;
                    }
                }
                String line = htr.getRawText();
                if (line != null) {
                    boolean isfirst = true;
                    // Each vertical tab between fragments becomes a <br>.
                    for (String fragment : line.split("\\u000b")) {
                        if (!isfirst) {
                            xhtml.startElement("br");
                            xhtml.endElement("br");
                        }
                        isfirst = false;
                        xhtml.characters(removePBreak(fragment));
                    }
                    // String.split drops trailing empty fragments, so a trailing
                    // vertical tab needs its own <br>. (Testing endsWith("") would
                    // always be true and add a <br> after every run.)
                    if (line.endsWith("\u000b")) {
                        xhtml.startElement("br");
                        xhtml.endElement("br");
                    }
                }
                if (runIsHyperLink) {
                    xhtml.endElement("a");
                }
            }
            xhtml.endElement(paraTag);
        }
        // Close a list that is still open at the end of the group.
        if (isBullet) {
            xhtml.endElement("ul");
        }
    }
}
Also used : HSLFTextRun(org.apache.poi.hslf.usermodel.HSLFTextRun) HSLFTextParagraph(org.apache.poi.hslf.usermodel.HSLFTextParagraph) Hyperlink(org.apache.poi.common.usermodel.Hyperlink)

Example 2 with Hyperlink

use of org.apache.poi.common.usermodel.Hyperlink in project OpenRefine by OpenRefine.

the class ExcelImporter method extractCell.

/**
 * Converts a POI cell into a model {@code Cell}, attaching a {@code Recon} when the cell
 * carries an http/https hyperlink containing {@code freebase.com/view}.
 * <p>
 * The id is the part of the URL after that marker, cut at the first {@code ?} and then at
 * the first {@code #} (when either appears past position 0). A recon already stored in
 * {@code reconMap} under that id is reused and its batch size incremented; otherwise a new
 * matched recon is created and stored in the map.
 *
 * @param cell     the spreadsheet cell to convert
 * @param reconMap recon objects keyed by id; updated when a new recon is created
 * @return the converted cell, or {@code null} when the cell yields no value
 */
protected static Cell extractCell(org.apache.poi.ss.usermodel.Cell cell, Map<String, Recon> reconMap) {
    final Serializable cellValue = extractCell(cell);
    if (cellValue == null) {
        return null;
    }

    Recon cellRecon = null;
    final Hyperlink link = cell.getHyperlink();
    final String address = (link == null) ? null : link.getAddress();
    final boolean isWebLink = address != null
            && (address.startsWith("http://") || address.startsWith("https://"));

    final String marker = "freebase.com/view";
    final int markerAt = isWebLink ? address.indexOf(marker) : -1;
    if (markerAt > 0) {
        String topicId = address.substring(markerAt + marker.length());

        // Strip a query string, then a fragment, from the id.
        final int queryAt = topicId.indexOf('?');
        if (queryAt > 0) {
            topicId = topicId.substring(0, queryAt);
        }
        final int fragmentAt = topicId.indexOf('#');
        if (fragmentAt > 0) {
            topicId = topicId.substring(0, fragmentAt);
        }

        if (!reconMap.containsKey(topicId)) {
            // First time this id is seen: build an automatically matched recon.
            cellRecon = new Recon(0, null, null);
            cellRecon.service = "import";
            cellRecon.match = new ReconCandidate(topicId, cellValue.toString(), new String[0], 100);
            cellRecon.matchRank = 0;
            cellRecon.judgment = Judgment.Matched;
            cellRecon.judgmentAction = "auto";
            cellRecon.judgmentBatchSize = 1;
            cellRecon.addCandidate(cellRecon.match);
            reconMap.put(topicId, cellRecon);
        } else {
            // Id already known: share the recon and count one more cell in its batch.
            cellRecon = reconMap.get(topicId);
            cellRecon.judgmentBatchSize++;
        }
    }
    return new Cell(cellValue, cellRecon);
}
Also used : Serializable(java.io.Serializable) Recon(com.google.refine.model.Recon) Cell(com.google.refine.model.Cell) ReconCandidate(com.google.refine.model.ReconCandidate) Hyperlink(org.apache.poi.common.usermodel.Hyperlink)

Example 3 with Hyperlink

use of org.apache.poi.common.usermodel.Hyperlink in project tika by apache.

the class XSLFPowerPointExtractorDecorator method extractContent.

/**
 * Walks the given shapes and emits their content to the XHTML handler.
 * <ul>
 *   <li>Text shapes: one {@code p} per paragraph, with runs wrapped in {@code a} when they
 *       carry a hyperlink whose address does not contain {@code #_ftn}.</li>
 *   <li>Group shapes: processed recursively.</li>
 *   <li>Tables: delegated to {@code extractTable}.</li>
 *   <li>Graphic frames and pictures: an empty {@code div class="embedded"} whose
 *       {@code id} is the relationship id, prefixed with {@code slideDesc} when non-null.</li>
 * </ul>
 *
 * @param shapes           shapes to process
 * @param skipPlaceholders when true, placeholder text shapes and picture shapes are skipped
 * @param xhtml            handler that receives the generated elements and text
 * @param slideDesc        optional prefix for embedded-resource ids; may be {@code null}
 * @throws SAXException if the handler fails while emitting an event
 */
private void extractContent(List<? extends XSLFShape> shapes, boolean skipPlaceholders, XHTMLContentHandler xhtml, String slideDesc) throws SAXException {
    for (XSLFShape shape : shapes) {
        if (shape instanceof XSLFTextShape) {
            XSLFTextShape textShape = (XSLFTextShape) shape;
            Placeholder placeholder = textShape.getTextType();
            if (skipPlaceholders && placeholder != null) {
                continue;
            }
            for (XSLFTextParagraph paragraph : textShape.getTextParagraphs()) {
                xhtml.startElement("p");
                for (XSLFTextRun textRun : paragraph.getTextRuns()) {
                    //TODO: add check for targetmode=external into POI
                    //then check to confirm that the urls are actually
                    //external and not footnote refs via the current hack
                    Hyperlink link = textRun.getHyperlink();
                    String target = (link == null) ? null : link.getAddress();
                    boolean linked = target != null && !target.contains("#_ftn");
                    if (linked) {
                        xhtml.startElement("a", "href", target);
                    }
                    xhtml.characters(textRun.getRawText());
                    if (linked) {
                        xhtml.endElement("a");
                    }
                }
                xhtml.endElement("p");
            }
            continue;
        }

        if (shape instanceof XSLFGroupShape) {
            // recurse into groups of shapes
            extractContent(((XSLFGroupShape) shape).getShapes(), skipPlaceholders, xhtml, slideDesc);
            continue;
        }

        if (shape instanceof XSLFTable) {
            //unlike tables in Word, ppt/x can't have recursive tables...I don't think
            extractTable((XSLFTable) shape, xhtml);
            continue;
        }

        if (shape instanceof XSLFGraphicFrame) {
            XSLFGraphicFrame graphicFrame = (XSLFGraphicFrame) shape;
            XmlObject[] oleObjects = graphicFrame.getXmlObject().selectPath("declare namespace p='http://schemas.openxmlformats.org/presentationml/2006/main' .//*/p:oleObj");
            if (oleObjects == null) {
                continue;
            }
            for (XmlObject oleObject : oleObjects) {
                XmlObject relAttr = oleObject.selectAttribute(new QName("http://schemas.openxmlformats.org/officeDocument/2006/relationships", "id"));
                if (relAttr == null) {
                    continue;
                }
                String rawId = relAttr.getDomNode().getNodeValue();
                String embeddedId = (slideDesc != null) ? slideDesc + rawId : rawId;
                // Empty marker div identifying the embedded object.
                AttributesImpl divAttrs = new AttributesImpl();
                divAttrs.addAttribute("", "class", "class", "CDATA", "embedded");
                divAttrs.addAttribute("", "id", "id", "CDATA", embeddedId);
                xhtml.startElement("div", divAttrs);
                xhtml.endElement("div");
            }
            continue;
        }

        if (!(shape instanceof XSLFPictureShape)) {
            continue;
        }
        if (skipPlaceholders || !(shape.getXmlObject() instanceof CTPicture)) {
            continue;
        }
        CTPicture picture = (CTPicture) shape.getXmlObject();
        if (picture.getBlipFill() == null || picture.getBlipFill().getBlip() == null) {
            continue;
        }
        String rawId = picture.getBlipFill().getBlip().getEmbed();
        if (rawId == null) {
            continue;
        }
        String embeddedId = (slideDesc != null) ? slideDesc + rawId : rawId;
        // Empty marker div identifying the embedded picture.
        AttributesImpl divAttrs = new AttributesImpl();
        divAttrs.addAttribute("", "class", "class", "CDATA", "embedded");
        divAttrs.addAttribute("", "id", "id", "CDATA", embeddedId);
        xhtml.startElement("div", divAttrs);
        xhtml.endElement("div");
    }
}
Also used : Placeholder(org.apache.poi.sl.usermodel.Placeholder) QName(javax.xml.namespace.QName) AttributesImpl(org.xml.sax.helpers.AttributesImpl) CTPicture(org.openxmlformats.schemas.presentationml.x2006.main.CTPicture) XmlObject(org.apache.xmlbeans.XmlObject) Hyperlink(org.apache.poi.common.usermodel.Hyperlink)

Aggregations

Hyperlink (org.apache.poi.common.usermodel.Hyperlink): 3; Cell (com.google.refine.model.Cell): 1; Recon (com.google.refine.model.Recon): 1; ReconCandidate (com.google.refine.model.ReconCandidate): 1; Serializable (java.io.Serializable): 1; QName (javax.xml.namespace.QName): 1; HSLFTextParagraph (org.apache.poi.hslf.usermodel.HSLFTextParagraph): 1; HSLFTextRun (org.apache.poi.hslf.usermodel.HSLFTextRun): 1; Placeholder (org.apache.poi.sl.usermodel.Placeholder): 1; XmlObject (org.apache.xmlbeans.XmlObject): 1; CTPicture (org.openxmlformats.schemas.presentationml.x2006.main.CTPicture): 1; AttributesImpl (org.xml.sax.helpers.AttributesImpl): 1