Search in sources:

Example 6 with NodeWalker

use of org.apache.nutch.util.NodeWalker in project nutch by apache.

the class DOMContentUtils method getOutlinks.

/**
 * This method finds all link-bearing elements below the supplied DOM
 * <code>node</code>, creates an {@link Outlink} record for each (resolved
 * relative to the supplied <code>base</code> URL), and adds them to the
 * <code>outlinks</code> {@link ArrayList}.
 *
 * <p>
 *
 * An element is link-bearing when its lower-cased tag name has an entry in
 * <code>linkParams</code>; that entry names the attribute holding the link
 * target. Links without inner structure (tags, text, etc) are discarded, as
 * are links which contain only single nested links and empty text nodes (this
 * is a common DOM-fixup artifact, at least with nekohtml).
 *
 * <p>
 *
 * No outlink is produced when the element has no target attribute, when its
 * <code>rel</code> attribute contains the <code>nofollow</code> token, when its
 * <code>method</code> attribute is <code>post</code>, or when the target
 * cannot be resolved into a URL. If the regular anchor text is blank, the
 * <code>alt</code> text of nested <code>img</code> elements and the content of
 * nested text nodes are used instead.
 *
 * @param base
 *          URL against which link targets are resolved
 * @param outlinks
 *          list receiving the discovered outlinks
 * @param node
 *          DOM node below which links are searched
 */
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {
        Node currentNode = walker.nextNode();
        if (currentNode.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        // Locale.ROOT keeps the lookup key stable: with the default locale a
        // Turkish JVM lower-cases "LINK" to "l\u0131nk" and the element is missed.
        String nodeName = currentNode.getNodeName().toLowerCase(java.util.Locale.ROOT);
        LinkParams params = (LinkParams) linkParams.get(nodeName);
        if (params == null) {
            continue;
        }
        NodeList children = currentNode.getChildNodes();
        int childLen = (children != null) ? children.getLength() : 0;
        if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
            StringBuffer linkText = new StringBuffer();
            getText(linkText, currentNode, true);
            if (linkText.toString().trim().length() == 0) {
                // try harder - use img alt if present
                NodeWalker subWalker = new NodeWalker(currentNode);
                while (subWalker.hasNext()) {
                    Node subNode = subWalker.nextNode();
                    short subType = subNode.getNodeType();
                    if (subType == Node.ELEMENT_NODE) {
                        // equalsIgnoreCase is locale-independent, unlike
                        // toLowerCase().equals(...); other elements are ignored
                        if ("img".equalsIgnoreCase(subNode.getNodeName())) {
                            Node alt = subNode.getAttributes().getNamedItem("alt");
                            if (alt != null) {
                                String altTxt = alt.getTextContent();
                                if (altTxt != null && altTxt.trim().length() > 0) {
                                    if (linkText.length() > 0) {
                                        linkText.append(' ');
                                    }
                                    linkText.append(altTxt);
                                }
                            }
                        }
                    } else if (subType == Node.TEXT_NODE) {
                        String txt = subNode.getTextContent();
                        if (txt != null && txt.length() > 0) {
                            if (linkText.length() > 0) {
                                linkText.append(' ');
                            }
                            linkText.append(txt);
                        }
                    }
                }
            }
            NamedNodeMap attrs = currentNode.getAttributes();
            String target = null;
            boolean noFollow = false;
            boolean post = false;
            for (int i = 0; i < attrs.getLength(); i++) {
                Node attr = attrs.item(i);
                String attrName = attr.getNodeName();
                if (params.attrName.equalsIgnoreCase(attrName)) {
                    target = attr.getNodeValue();
                } else if ("rel".equalsIgnoreCase(attrName)) {
                    // rel holds a whitespace-separated list of link types, so
                    // "nofollow noopener" must be honoured as well
                    String relValue = attr.getNodeValue();
                    if (relValue != null) {
                        for (String linkType : relValue.trim().split("\\s+")) {
                            if ("nofollow".equalsIgnoreCase(linkType)) {
                                noFollow = true;
                                break;
                            }
                        }
                    }
                } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getNodeValue())) {
                    post = true;
                }
            }
            if (target != null && !noFollow && !post) {
                try {
                    URL url = URLUtil.resolveURL(base, target);
                    Outlink outlink = new Outlink(url.toString(), linkText.toString().trim());
                    outlinks.add(outlink);
                    // the outlink metadata
                    if (keepNodenames) {
                        MapWritable metadata = new MapWritable();
                        metadata.put(new Text(srcTagMetaName), new Text(nodeName));
                        outlink.setMetadata(metadata);
                    }
                } catch (MalformedURLException ignored) {
                    // best effort: a target that cannot be resolved yields no outlink
                }
            }
        }
        // this should not have any children, skip them
        // NOTE(review): as the last statement of the loop body this continue
        // has no effect; actually skipping would need walker.skipChildren().
        // Left unchanged so links nested below such elements are still found.
        if (params.childLen == 0) {
            continue;
        }
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) MalformedURLException(java.net.MalformedURLException) NamedNodeMap(org.w3c.dom.NamedNodeMap) Node(org.w3c.dom.Node) NodeList(org.w3c.dom.NodeList) Text(org.apache.hadoop.io.Text) MapWritable(org.apache.hadoop.io.MapWritable) NodeWalker(org.apache.nutch.util.NodeWalker) URL(java.net.URL)

Example 7 with NodeWalker

use of org.apache.nutch.util.NodeWalker in project nutch by apache.

the class DOMContentUtils method getTitle.

/**
 * Appends the text found beneath the first <code>title</code> element below
 * the given DOM {@link Node} to the supplied {@link StringBuffer}.
 *
 * <p>
 *
 * The walk stops as soon as a node named <code>body</code> is reached, so
 * only nodes visited before it are considered.
 *
 * @param sb
 *          buffer that receives the title text
 * @param node
 *          DOM node at which the search starts
 * @return true if a title node was found, false otherwise
 */
public boolean getTitle(StringBuffer sb, Node node) {
    for (NodeWalker it = new NodeWalker(node); it.hasNext();) {
        Node candidate = it.nextNode();
        String name = candidate.getNodeName();
        // stop after HEAD
        if ("body".equalsIgnoreCase(name)) {
            return false;
        }
        boolean isTitleElement = candidate.getNodeType() == Node.ELEMENT_NODE
                && "title".equalsIgnoreCase(name);
        if (isTitleElement) {
            getText(sb, candidate);
            return true;
        }
    }
    return false;
}
Also used : Node(org.w3c.dom.Node) NodeWalker(org.apache.nutch.util.NodeWalker)

Example 8 with NodeWalker

use of org.apache.nutch.util.NodeWalker in project nutch by apache.

the class DOMContentUtils method getBase.

/**
 * If the node contains a BASE tag, then its HREF is returned.
 *
 * <p>
 *
 * Only element nodes are examined, and the walk stops at the first
 * <code>body</code> element. A BASE element without an <code>href</code>
 * attribute does not end the search.
 *
 * @param node
 *          DOM node at which the search starts
 * @return the value of the first <code>href</code> attribute found on a BASE
 *         element, or <code>null</code> if there is none before the body
 */
public String getBase(Node node) {
    for (NodeWalker it = new NodeWalker(node); it.hasNext();) {
        Node candidate = it.nextNode();
        if (candidate.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        String tag = candidate.getNodeName();
        // stop after HEAD
        if ("body".equalsIgnoreCase(tag)) {
            return null;
        }
        // is this node a BASE tag?
        if (!"base".equalsIgnoreCase(tag)) {
            continue;
        }
        NamedNodeMap attributes = candidate.getAttributes();
        int index = 0;
        while (index < attributes.getLength()) {
            Node attribute = attributes.item(index);
            if ("href".equalsIgnoreCase(attribute.getNodeName())) {
                return attribute.getNodeValue();
            }
            index++;
        }
    }
    // no BASE element with an href was found
    return null;
}
Also used : NamedNodeMap(org.w3c.dom.NamedNodeMap) Node(org.w3c.dom.Node) NodeWalker(org.apache.nutch.util.NodeWalker)

Example 9 with NodeWalker

use of org.apache.nutch.util.NodeWalker in project nutch by apache.

the class DOMContentUtils method getTextHelper.

// Walks the nodes below the given node and appends the whitespace-normalised
// content of each text node to sb; children of script and style nodes are
// skipped. Returns true if abortOnNestedAnchors is true and we find nested
// anchors, false otherwise.
private boolean getTextHelper(StringBuffer sb, Node node, boolean abortOnNestedAnchors, int anchorDepth) {
    for (NodeWalker it = new NodeWalker(node); it.hasNext();) {
        Node current = it.nextNode();
        String name = current.getNodeName();
        short type = current.getNodeType();
        // the contents of script and style are not text for our purposes
        if ("script".equalsIgnoreCase(name) || "style".equalsIgnoreCase(name)) {
            it.skipChildren();
        }
        // a second anchor inside the first one ends the walk immediately
        if (abortOnNestedAnchors && "a".equalsIgnoreCase(name) && ++anchorDepth > 1) {
            return true;
        }
        if (type == Node.COMMENT_NODE) {
            it.skipChildren();
        } else if (type == Node.TEXT_NODE) {
            // cleanup and trim the value
            String cleaned = current.getNodeValue().replaceAll("\\s+", " ").trim();
            if (cleaned.length() == 0) {
                appendParagraphSeparator(sb);
            } else {
                appendSpace(sb);
                sb.append(cleaned);
            }
        }
    }
    return false;
}
Also used : Node(org.w3c.dom.Node) NodeWalker(org.apache.nutch.util.NodeWalker)

Example 10 with NodeWalker

use of org.apache.nutch.util.NodeWalker in project nutch by apache.

the class DOMContentUtils method getOutlinks.

/**
 * This method finds all link-bearing elements below the supplied DOM
 * <code>node</code>, creates an {@link Outlink} record for each (resolved
 * relative to the supplied <code>base</code> URL), and adds them to the
 * <code>outlinks</code> {@link ArrayList}.
 *
 * <p>
 *
 * An element is link-bearing when its lower-cased tag name has an entry in
 * <code>linkParams</code>; that entry names the attribute holding the link
 * target. Links without inner structure (tags, text, etc) are discarded, as
 * are links which contain only single nested links and empty text nodes (this
 * is a common DOM-fixup artifact, at least with nekohtml).
 *
 * <p>
 *
 * No outlink is produced when the element has no target attribute, when its
 * <code>rel</code> attribute contains the <code>nofollow</code> token, when its
 * <code>method</code> attribute is <code>post</code>, or when the target
 * cannot be resolved into a URL.
 *
 * @param base
 *          URL against which link targets are resolved
 * @param outlinks
 *          list receiving the discovered outlinks
 * @param node
 *          DOM node below which links are searched
 */
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
    NodeWalker walker = new NodeWalker(node);
    while (walker.hasNext()) {
        Node currentNode = walker.nextNode();
        if (currentNode.getNodeType() != Node.ELEMENT_NODE) {
            continue;
        }
        // Locale.ROOT keeps the lookup key stable: with the default locale a
        // Turkish JVM lower-cases "LINK" to "l\u0131nk" and the element is missed.
        String nodeName = currentNode.getNodeName().toLowerCase(java.util.Locale.ROOT);
        LinkParams params = (LinkParams) linkParams.get(nodeName);
        if (params == null) {
            continue;
        }
        NodeList children = currentNode.getChildNodes();
        int childLen = (children != null) ? children.getLength() : 0;
        if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
            StringBuffer linkText = new StringBuffer();
            getText(linkText, currentNode, true);
            NamedNodeMap attrs = currentNode.getAttributes();
            String target = null;
            boolean noFollow = false;
            boolean post = false;
            for (int i = 0; i < attrs.getLength(); i++) {
                Node attr = attrs.item(i);
                String attrName = attr.getNodeName();
                if (params.attrName.equalsIgnoreCase(attrName)) {
                    target = attr.getNodeValue();
                } else if ("rel".equalsIgnoreCase(attrName)) {
                    // rel holds a whitespace-separated list of link types, so
                    // "nofollow noopener" must be honoured as well
                    String relValue = attr.getNodeValue();
                    if (relValue != null) {
                        for (String linkType : relValue.trim().split("\\s+")) {
                            if ("nofollow".equalsIgnoreCase(linkType)) {
                                noFollow = true;
                                break;
                            }
                        }
                    }
                } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getNodeValue())) {
                    post = true;
                }
            }
            if (target != null && !noFollow && !post) {
                try {
                    URL url = URLUtil.resolveURL(base, target);
                    Outlink outlink = new Outlink(url.toString(), linkText.toString().trim());
                    outlinks.add(outlink);
                    // the outlink metadata
                    if (keepNodenames) {
                        MapWritable metadata = new MapWritable();
                        metadata.put(new Text(srcTagMetaName), new Text(nodeName));
                        outlink.setMetadata(metadata);
                    }
                } catch (MalformedURLException ignored) {
                    // best effort: a target that cannot be resolved yields no outlink
                }
            }
        }
        // this should not have any children, skip them
        // NOTE(review): as the last statement of the loop body this continue
        // has no effect; actually skipping would need walker.skipChildren().
        // Left unchanged so links nested below such elements are still found.
        if (params.childLen == 0) {
            continue;
        }
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) MalformedURLException(java.net.MalformedURLException) NamedNodeMap(org.w3c.dom.NamedNodeMap) Node(org.w3c.dom.Node) NodeList(org.w3c.dom.NodeList) Text(org.apache.hadoop.io.Text) MapWritable(org.apache.hadoop.io.MapWritable) NodeWalker(org.apache.nutch.util.NodeWalker) URL(java.net.URL)

Aggregations

NodeWalker (org.apache.nutch.util.NodeWalker): 10, Node (org.w3c.dom.Node): 10, NamedNodeMap (org.w3c.dom.NamedNodeMap): 4, MalformedURLException (java.net.MalformedURLException): 2, URL (java.net.URL): 2, MapWritable (org.apache.hadoop.io.MapWritable): 2, Text (org.apache.hadoop.io.Text): 2, Outlink (org.apache.nutch.parse.Outlink): 2, NodeList (org.w3c.dom.NodeList): 2, ArrayList (java.util.ArrayList): 1, Matcher (java.util.regex.Matcher): 1