Search in sources :

Example 1 with Node

use of org.htmlparser.Node in project jforum2 by rafaelsteil.

the class SafeHtml method makeSafe.

/**
 * Sanitizes the given markup so that it can be displayed inside an HTML page.
 * The input is split into nodes by the lexer and each node is handled as follows:
 * text nodes are always emitted (with angle brackets and double quotes turned into
 * entities when the text contains an angle bracket), tags for which
 * isTagWelcome returns true are emitted unchanged, and every other node is
 * emitted with its angle brackets turned into entities.
 *
 * @param contents the input to analyze; a null or empty value is returned as is
 * @return the modified and safe string
 */
public String makeSafe(String contents) {
    if (contents == null || contents.length() == 0) {
        return contents;
    }
    StringBuffer safe = new StringBuffer(contents.length());
    try {
        Lexer lexer = new Lexer(contents);
        for (Node current = lexer.nextNode(); current != null; current = lexer.nextNode()) {
            if (current instanceof TextNode) {
                // Text nodes are raw data: neutralize anything that could be read as markup
                String raw = current.toHtml();
                boolean hasAngleBracket = raw.indexOf('>') > -1 || raw.indexOf('<') > -1;
                if (hasAngleBracket) {
                    StringBuffer escapedText = new StringBuffer(raw);
                    ViewCommon.replaceAll(escapedText, "<", "&lt;");
                    ViewCommon.replaceAll(escapedText, ">", "&gt;");
                    ViewCommon.replaceAll(escapedText, "\"", "&quot;");
                    current.setText(escapedText.toString());
                }
                safe.append(current.toHtml());
                continue;
            }
            if (current instanceof Tag && this.isTagWelcome(current)) {
                // Accepted tag: keep it exactly as the lexer renders it
                safe.append(current.toHtml());
                continue;
            }
            // Anything else is shown literally instead of being interpreted by the browser
            StringBuffer escapedNode = new StringBuffer(current.toHtml());
            ViewCommon.replaceAll(escapedNode, "<", "&lt;");
            ViewCommon.replaceAll(escapedNode, ">", "&gt;");
            safe.append(escapedNode.toString());
        }
    } catch (Exception e) {
        throw new ForumException("Error while parsing HTML: " + e, e);
    }
    return safe.toString();
}
Also used : Lexer(org.htmlparser.lexer.Lexer) ForumException(net.jforum.exceptions.ForumException) Node(org.htmlparser.Node) TextNode(org.htmlparser.nodes.TextNode) TextNode(org.htmlparser.nodes.TextNode) Tag(org.htmlparser.Tag) ForumException(net.jforum.exceptions.ForumException)

Example 2 with Node

use of org.htmlparser.Node in project dhis2-core by dhis2.

the class GridUtils method getColumnCount.

/**
 * Returns the number of columns/cells in the given row, taking the
 * "colspan" attribute of each cell into account. A cell whose colspan
 * cannot be turned into an Integer (the parsed value is null) counts as
 * one column.
 *
 * @param row the table row whose cells are counted.
 * @return the total number of columns spanned by the row's cells, or 0 when
 *         the row has no child nodes.
 */
private static int getColumnCount(TableRow row) {
    // getChildren() yields null when no children were ever set on the node
    // (e.g. an empty row), so guard before dereferencing it.
    if (row.getChildren() == null) {
        return 0;
    }
    Node[] cells = row.getChildren().extractAllNodesThatMatch(HTML_ROW_FILTER).toNodeArray();
    int cols = 0;
    for (Node cell : cells) {
        Integer colSpan = MathUtils.parseInt(((TagNode) cell).getAttribute("colspan"));
        cols += colSpan != null ? colSpan : 1;
    }
    return cols;
}
Also used : TagNode(org.htmlparser.nodes.TagNode) Node(org.htmlparser.Node) JasperPrint(net.sf.jasperreports.engine.JasperPrint)

Example 3 with Node

use of org.htmlparser.Node in project jforum2 by rafaelsteil.

the class SafeHtml method ensureAllAttributesAreSafe.

/**
 * Given an input, analyzes each HTML tag and removes insecure attributes from them.
 * Every tag found by the lexer is passed to checkAndValidateAttributes before
 * being written out; all other nodes are written out unchanged.
 *
 * @param contents The content to verify; a null or empty value is returned as is,
 * which matches the behavior of makeSafe
 * @return the content, secure.
 */
public String ensureAllAttributesAreSafe(String contents) {
    // Nothing to parse; also avoids a NullPointerException on contents.length()
    if (contents == null || contents.length() == 0) {
        return contents;
    }
    StringBuffer sb = new StringBuffer(contents.length());
    try {
        Lexer lexer = new Lexer(contents);
        Node node;
        while ((node = lexer.nextNode()) != null) {
            if (node instanceof Tag) {
                // Validates the tag's attributes in place before it is rendered
                this.checkAndValidateAttributes((Tag) node, false);
            }
            sb.append(node.toHtml());
        }
    } catch (Exception e) {
        throw new ForumException("Problems while parsing HTML: " + e, e);
    }
    return sb.toString();
}
Also used : Lexer(org.htmlparser.lexer.Lexer) ForumException(net.jforum.exceptions.ForumException) Node(org.htmlparser.Node) TextNode(org.htmlparser.nodes.TextNode) Tag(org.htmlparser.Tag) ForumException(net.jforum.exceptions.ForumException)

Example 4 with Node

use of org.htmlparser.Node in project omegat by omegat-org.

the class FilterVisitor method endup.

/**
 * Ends the segment collection and sends the translatable text out to OmegaT
 * core, and some extra tags to writer.
 * <p>
 * The nodes collected in befors, translatable and afters are joined into one
 * list. A range [firstgood, lastgood] of that list is turned into the
 * paragraph text handed to filter.privateProcessEntry; the nodes outside of
 * that range are written out directly, before and after the processed
 * paragraph. cleanup() is called at the end.
 */
protected void endup() {
    // detecting the first starting tag in 'befors'
    // that has its ending in the paragraph
    // all before this "first good" are simply written out
    List<Node> all = new ArrayList<Node>();
    all.addAll(befors);
    all.addAll(translatable);
    // indexes below firstgoodlimit belong to 'befors'
    int firstgoodlimit = befors.size();
    int firstgood = 0;
    while (firstgood < firstgoodlimit) {
        Node goodNode = all.get(firstgood);
        if (!(goodNode instanceof Tag)) {
            // only tags can be a "first good" candidate
            firstgood++;
            continue;
        }
        Tag good = (Tag) goodNode;
        // trying to test
        // 'recursion' is the nesting depth of tags with the same name as the
        // candidate: same-named start tags increase it, end tags decrease it
        int recursion = 1;
        boolean found = false;
        for (int i = firstgood + 1; i < all.size(); i++) {
            Node candNode = all.get(i);
            if (candNode instanceof Tag) {
                Tag cand = (Tag) candNode;
                if (cand.getTagName().equals(good.getTagName())) {
                    if (!cand.isEndTag()) {
                        recursion++;
                    } else {
                        recursion--;
                        if (recursion == 0) {
                            // the candidate only counts when its matching end tag
                            // lies outside of 'befors'
                            if (i >= firstgoodlimit) {
                                found = true;
                            }
                            // we've found an ending tag for this "good one"
                            break;
                        }
                    }
                }
            }
        }
        // this is a "good one"
        if (found) {
            break;
        }
        firstgood++;
    }
    // detecting the last ending tag in 'afters'
    // that has its starting in the paragraph
    // all after this "last good" is simply written out
    // index of the last node added so far, i.e. the last one before 'afters'
    int lastgoodlimit = all.size() - 1;
    all.addAll(afters);
    int lastgood = all.size() - 1;
    while (lastgood > lastgoodlimit) {
        Node goodNode = all.get(lastgood);
        if (!(goodNode instanceof Tag)) {
            // only tags can be a "last good" candidate
            lastgood--;
            continue;
        }
        Tag good = (Tag) goodNode;
        // trying to test
        // mirror image of the forward scan: walking backwards, same-named end
        // tags increase the nesting depth and start tags decrease it
        int recursion = 1;
        boolean found = false;
        for (int i = lastgood - 1; i >= firstgoodlimit; i--) {
            Node candNode = all.get(i);
            if (candNode instanceof Tag) {
                Tag cand = (Tag) candNode;
                if (cand.getTagName().equals(good.getTagName())) {
                    if (cand.isEndTag()) {
                        recursion++;
                    } else {
                        recursion--;
                        if (recursion == 0) {
                            // the candidate only counts when its matching start tag
                            // lies before 'afters'
                            if (i <= lastgoodlimit) {
                                found = true;
                            }
                            // "good one"
                            break;
                        }
                    }
                }
            }
        }
        // this is a "good one"
        if (found) {
            break;
        }
        lastgood--;
    }
    // Widening the [firstgood, lastgood] range according to the filter
    // configuration; repeated until a pass changes nothing.
    boolean changed = true;
    while (changed) {
        changed = false;
        boolean removeTags = Core.getFilterMaster().getConfig().isRemoveTags();
        if (!removeTags) {
            // move the range start back to the first Tag located before it
            for (int i = 0; i < firstgood; i++) {
                Node node = all.get(i);
                if (node instanceof Tag) {
                    firstgood = i;
                    changed = true;
                    break;
                }
            }
            // move the range end forward to the last Tag located after it
            for (int i = all.size() - 1; i > lastgood; i--) {
                Node node = all.get(i);
                if (node instanceof Tag) {
                    lastgood = i;
                    changed = true;
                    break;
                }
            }
        }
        boolean removeSpacesAround = Core.getFilterMaster().getConfig().isRemoveSpacesNonseg();
        if (!removeSpacesAround) {
            // same widening as above, but for TextNode instances
            for (int i = 0; i < firstgood; i++) {
                Node node = all.get(i);
                if (node instanceof TextNode) {
                    firstgood = i;
                    changed = true;
                    break;
                }
            }
            for (int i = all.size() - 1; i > lastgood; i--) {
                Node node = all.get(i);
                if (node instanceof TextNode) {
                    lastgood = i;
                    changed = true;
                    break;
                }
            }
        }
    }
    // writing out all tags before the "first good" one
    for (int i = 0; i < firstgood; i++) {
        Node node = all.get(i);
        if (node instanceof Tag) {
            // the tag text is wrapped in angle brackets again when written out
            writeout("<" + node.getText() + ">");
        } else {
            writeout(compressWhitespace(node.getText()));
        }
    }
    // building the paragraph text out of the nodes
    // from the "first good" one up to and including the "last good" one
    StringBuilder paragraph = new StringBuilder();
    // tags are appended through shortcut(), other nodes as entity-decoded text
    for (int i = firstgood; i <= lastgood; i++) {
        Node node = all.get(i);
        if (node instanceof Tag) {
            shortcut((Tag) node, paragraph);
        } else {
            // node instanceof Text
            paragraph.append(HTMLUtils.entitiesToChars(node.toHtml()));
        }
    }
    String uncompressed = paragraph.toString();
    String compressed = uncompressed;
    String spacePrefix = "";
    String spacePostfix = "";
    int size = uncompressed.length();
    // Splitting off the leading and trailing whitespace of the paragraph
    // (This changes the layout, therefore it is an option)
    if (!preformatting) {
        // scanning forward, code point by code point, for the first
        // non-whitespace character; what precedes it becomes the prefix
        for (int cp, i = 0; i < size; i += Character.charCount(cp)) {
            cp = uncompressed.codePointAt(i);
            if (!Character.isWhitespace(cp)) {
                // NOTE(review): offsetByCodePoints(i, 1) is always greater than i, so
                // Math.min(...) yields i and both branches produce the same prefix;
                // confirm whether only the first code point was intended here.
                spacePrefix = i == 0 ? "" : uncompressed.substring(0, options.getCompressWhitespace() ? Math.min(i, uncompressed.offsetByCodePoints(i, 1)) : i);
                break;
            }
        }
        // scanning backward for the last non-whitespace character; with
        // getCompressWhitespace() only one code point following it is kept
        // as postfix, otherwise everything up to the end of the text
        for (int cp, i = size; i > 0; i -= Character.charCount(cp)) {
            cp = uncompressed.codePointBefore(i);
            if (!Character.isWhitespace(cp)) {
                spacePostfix = i == size ? "" : uncompressed.substring(i, options.getCompressWhitespace() ? Math.min(uncompressed.offsetByCodePoints(i, 1), size) : size);
                break;
            }
        }
        if (Core.getFilterMaster().getConfig().isRemoveSpacesNonseg()) {
            compressed = StringUtil.compressSpaces(uncompressed);
        } else {
            compressed = uncompressed;
        }
    }
    // getting the translation
    String translation = filter.privateProcessEntry(compressed, null);
    // writing out uncompressed
    // (when the entry came back unchanged and whitespace is not to be compressed)
    if (compressed.equals(translation) && !options.getCompressWhitespace()) {
        translation = uncompressed;
    }
    // converting & < and > into &amp; &lt; and &gt; respectively
    // note that this doesn't change < and > of tag shortcuts
    translation = HTMLUtils.charsToEntities(translation, filter.getTargetEncoding(), sShortcuts);
    // expands tag shortcuts into full-blown tags
    translation = unshorcutize(translation);
    // writing out the paragraph into target file
    writeout(spacePrefix);
    writeout(translation);
    writeout(spacePostfix);
    // writing out all tags after the "last good" one
    for (int i = lastgood + 1; i < all.size(); i++) {
        Node node = all.get(i);
        if (node instanceof Tag) {
            writeout("<" + node.getText() + ">");
        } else {
            writeout(compressWhitespace(node.getText()));
        }
    }
    cleanup();
}
Also used : Node(org.htmlparser.Node) TextNode(org.htmlparser.nodes.TextNode) ArrayList(java.util.ArrayList) TextNode(org.htmlparser.nodes.TextNode) Tag(org.htmlparser.Tag)

Example 5 with Node

use of org.htmlparser.Node in project laogewen by wen4034.

the class HtmlParserTool method extracLinks.

/**
 * Opens the page at the given URL and collects the links found in it.
 * Two kinds of nodes are considered: LinkTag instances, whose link is taken
 * from getLink(), and nodes whose text starts with "frame src=", whose src
 * value is cut out of the tag text. A link is kept only when the given
 * filter accepts it.
 * <p>
 * Any exception raised while connecting or parsing is reported on the
 * console and ends the extraction; the links collected up to that point are
 * still returned.
 *
 * @param url the address of the page to parse
 * @param filter decides, for each candidate link, whether it is kept
 * @param validate passed through unchanged to filter.accept for every candidate
 * @return the accepted links; possibly empty, never null
 */
public static Set<String> extracLinks(String url, LinkFilter filter, String... validate) {
    Set<String> links = Sets.newHashSet();
    try {
        URL realurl = new URL(url);
        URLConnection connection = realurl.openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        connection.setReadTimeout(100000);
        connection.setConnectTimeout(100000);
        Parser parser = new Parser(connection);
        parser.setEncoding("UTF-8");
        // Filter matching <frame> tags, used to pick up the src attribute of frames
        NodeFilter frameFilter = new NodeFilter() {

            @Override
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                LinkTag linkTag = (LinkTag) tag;
                String linkurl = linkTag.getLink();
                if (filter.accept(linkurl, validate)) {
                    links.add(linkurl);
                }
            } else {
                // Only frameFilter matches reach this branch, so "src=" is present
                String fram = tag.getText();
                int start = fram.indexOf("src=");
                fram = fram.substring(start);
                // The value ends at the next blank or '>' ...
                int end = fram.indexOf(" ");
                if (end == -1) {
                    end = fram.indexOf(">");
                }
                // ... or at the end of the tag text when src is the last attribute.
                // Without this fallback end stays -1 and substring() below throws,
                // which aborted the extraction of all remaining links.
                if (end == -1) {
                    end = fram.length();
                }
                // The slice below skips the 5 characters of src=" and drops the
                // closing quote; skip values too short to be sliced that way
                if (end - 1 < 5) {
                    continue;
                }
                String frameUrl = fram.substring(5, end - 1);
                if (filter.accept(frameUrl, validate)) {
                    links.add(frameUrl);
                }
            }
        }
    } catch (Exception e) {
        System.out.println(url + "链接失败");
        e.printStackTrace();
    }
    return links;
}
Also used : NodeClassFilter(org.htmlparser.filters.NodeClassFilter) LinkTag(org.htmlparser.tags.LinkTag) Node(org.htmlparser.Node) NodeList(org.htmlparser.util.NodeList) OrFilter(org.htmlparser.filters.OrFilter) URL(java.net.URL) URLConnection(java.net.URLConnection) Parser(org.htmlparser.Parser) NodeFilter(org.htmlparser.NodeFilter)

Aggregations

Node (org.htmlparser.Node)6 Tag (org.htmlparser.Tag)3 TextNode (org.htmlparser.nodes.TextNode)3 ArrayList (java.util.ArrayList)2 ForumException (net.jforum.exceptions.ForumException)2 Parser (org.htmlparser.Parser)2 Lexer (org.htmlparser.lexer.Lexer)2 TagNode (org.htmlparser.nodes.TagNode)2 URL (java.net.URL)1 URLConnection (java.net.URLConnection)1 JasperPrint (net.sf.jasperreports.engine.JasperPrint)1 Grid (org.hisp.dhis.common.Grid)1 GridHeader (org.hisp.dhis.common.GridHeader)1 NodeFilter (org.htmlparser.NodeFilter)1 NodeClassFilter (org.htmlparser.filters.NodeClassFilter)1 OrFilter (org.htmlparser.filters.OrFilter)1 TagNameFilter (org.htmlparser.filters.TagNameFilter)1 LinkTag (org.htmlparser.tags.LinkTag)1 TableRow (org.htmlparser.tags.TableRow)1 TableTag (org.htmlparser.tags.TableTag)1