use of org.apache.nutch.util.NodeWalker in project nutch by apache.
the class DOMContentUtils method getOutlinks.
/**
 * Walks the DOM below the supplied <code>node</code>, and for every element
 * whose lower-cased tag name has an entry in <code>linkParams</code> creates an
 * {@link Outlink} (resolved relative to the supplied <code>base</code> URL) and
 * adds it to <code>outlinks</code>.
 *
 * <p>
 *
 * Links without inner structure (tags, text, etc) are discarded, as are links
 * which contain only single nested links and empty text nodes (this is a
 * common DOM-fixup artifact, at least with nekohtml). That decision is
 * delegated to <code>shouldThrowAwayLink</code>.
 *
 * <p>
 *
 * The anchor text is taken from <code>getText</code>; when that is blank, the
 * <code>alt</code> text of nested <code>img</code> elements and the content of
 * nested text nodes are used instead. Links whose <code>rel</code> attribute
 * contains the token <code>nofollow</code>, or whose <code>method</code>
 * attribute is <code>post</code>, are not emitted. Targets that cannot be
 * resolved into a valid URL are silently skipped.
 *
 * @param base
 *          URL against which link targets are resolved
 * @param outlinks
 *          list receiving the discovered outlinks; appended to in place
 * @param node
 *          root of the DOM subtree to scan
 */
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
  NodeWalker walker = new NodeWalker(node);
  while (walker.hasNext()) {
    Node currentNode = walker.nextNode();
    if (currentNode.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }
    // Locale.ROOT keeps the lookup key stable: with a Turkish default locale
    // "IMG".toLowerCase() yields a dotless i and the map lookup would miss.
    String nodeName = currentNode.getNodeName().toLowerCase(java.util.Locale.ROOT);
    LinkParams params = (LinkParams) linkParams.get(nodeName);
    if (params == null) {
      continue;
    }
    NodeList children = currentNode.getChildNodes();
    int childLen = (children != null) ? children.getLength() : 0;
    if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
      StringBuffer linkText = new StringBuffer();
      getText(linkText, currentNode, true);
      if (linkText.toString().trim().length() == 0) {
        // try harder - use img alt if present
        NodeWalker subWalker = new NodeWalker(currentNode);
        while (subWalker.hasNext()) {
          Node subNode = subWalker.nextNode();
          short subType = subNode.getNodeType();
          if (subType == Node.ELEMENT_NODE) {
            // equalsIgnoreCase is locale-independent, unlike toLowerCase()
            if ("img".equalsIgnoreCase(subNode.getNodeName())) {
              Node alt = subNode.getAttributes().getNamedItem("alt");
              if (alt != null) {
                String altTxt = alt.getTextContent();
                if (altTxt != null && altTxt.trim().length() > 0) {
                  if (linkText.length() > 0) {
                    linkText.append(' ');
                  }
                  linkText.append(altTxt);
                }
              }
            }
            // other element types contribute no text
          } else if (subType == Node.TEXT_NODE) {
            String txt = subNode.getTextContent();
            if (txt != null && txt.length() > 0) {
              if (linkText.length() > 0) {
                linkText.append(' ');
              }
              linkText.append(txt);
            }
          }
        }
      }
      NamedNodeMap attrs = currentNode.getAttributes();
      String target = null;
      boolean noFollow = false;
      boolean post = false;
      for (int i = 0; i < attrs.getLength(); i++) {
        Node attr = attrs.item(i);
        String attrName = attr.getNodeName();
        if (params.attrName.equalsIgnoreCase(attrName)) {
          target = attr.getNodeValue();
        } else if ("rel".equalsIgnoreCase(attrName)) {
          // rel holds a whitespace-separated token list, so "nofollow" must be
          // looked for among the tokens rather than compared to the whole value
          String relValue = attr.getNodeValue();
          if (relValue != null) {
            for (String relToken : relValue.trim().split("\\s+")) {
              if ("nofollow".equalsIgnoreCase(relToken)) {
                noFollow = true;
                break;
              }
            }
          }
        } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getNodeValue())) {
          post = true;
        }
      }
      if (target != null && !noFollow && !post) {
        try {
          URL url = URLUtil.resolveURL(base, target);
          Outlink outlink = new Outlink(url.toString(), linkText.toString().trim());
          outlinks.add(outlink);
          // the outlink metadata
          if (keepNodenames) {
            MapWritable metadata = new MapWritable();
            metadata.put(new Text(srcTagMetaName), new Text(nodeName));
            outlink.setMetadata(metadata);
          }
        } catch (MalformedURLException ignored) {
          // best effort: a target that does not resolve to a URL is dropped
        }
      }
    }
    // this should not have any children, skip them
    // NOTE(review): as the last statement of the loop body this continue has
    // no effect; walker.skipChildren() may have been intended - confirm
    // before changing, since that would also drop links nested below.
    if (params.childLen == 0)
      continue;
  }
}
use of org.apache.nutch.util.NodeWalker in project nutch by apache.
the class DOMContentUtils method getTitle.
/**
 * Searches the DOM below <code>node</code> for the first <code>title</code>
 * element and appends its text (as produced by <code>getText</code>) to the
 * supplied {@link StringBuffer}. The search gives up as soon as a node named
 * <code>body</code> is reached.
 *
 * @param sb
 *          buffer that receives the title text
 * @param node
 *          root of the DOM subtree to search
 * @return true if a title node was found, false otherwise
 */
public boolean getTitle(StringBuffer sb, Node node) {
  for (NodeWalker walker = new NodeWalker(node); walker.hasNext();) {
    Node candidate = walker.nextNode();
    String tag = candidate.getNodeName();
    if ("body".equalsIgnoreCase(tag)) {
      // past the HEAD section: no title will follow
      return false;
    }
    boolean isTitleElement = candidate.getNodeType() == Node.ELEMENT_NODE
        && "title".equalsIgnoreCase(tag);
    if (isTitleElement) {
      getText(sb, candidate);
      return true;
    }
  }
  return false;
}
use of org.apache.nutch.util.NodeWalker in project nutch by apache.
the class DOMContentUtils method getBase.
/**
 * Looks for a <code>base</code> element below <code>node</code> and returns the
 * value of its <code>href</code> attribute. The walk ends when a
 * <code>body</code> element is reached.
 *
 * @param node
 *          root of the DOM subtree to search
 * @return the <code>href</code> value of the first <code>base</code> element
 *         that carries one, or <code>null</code> if none is found before the
 *         <code>body</code> element or the end of the subtree
 */
public String getBase(Node node) {
  NodeWalker walker = new NodeWalker(node);
  while (walker.hasNext()) {
    Node candidate = walker.nextNode();
    if (candidate.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }
    String tag = candidate.getNodeName();
    if ("body".equalsIgnoreCase(tag)) {
      // stop after HEAD
      return null;
    }
    if (!"base".equalsIgnoreCase(tag)) {
      continue;
    }
    // a BASE tag: scan its attributes for href, ignoring case
    NamedNodeMap attributes = candidate.getAttributes();
    int idx = 0;
    while (idx < attributes.getLength()) {
      Node attribute = attributes.item(idx);
      if ("href".equalsIgnoreCase(attribute.getNodeName())) {
        return attribute.getNodeValue();
      }
      idx++;
    }
  }
  // nothing found
  return null;
}
use of org.apache.nutch.util.NodeWalker in project nutch by apache.
the class DOMContentUtils method getTextHelper.
// Appends the text found in the subtree rooted at node to sb. Children of
// script and style nodes and of comment nodes are skipped. Each text node has
// its whitespace runs collapsed to one space and is trimmed; non-empty text is
// appended after appendSpace(sb), empty text triggers
// appendParagraphSeparator(sb) instead.
//
// Returns true only when abortOnNestedAnchors is set and the anchor count
// (starting from anchorDepth) exceeds one, in which case the walk stops
// immediately; otherwise returns false.
private boolean getTextHelper(StringBuffer sb, Node node, boolean abortOnNestedAnchors, int anchorDepth) {
  NodeWalker walker = new NodeWalker(node);
  while (walker.hasNext()) {
    Node current = walker.nextNode();
    String name = current.getNodeName();
    short type = current.getNodeType();

    boolean isScriptOrStyle = "script".equalsIgnoreCase(name) || "style".equalsIgnoreCase(name);
    if (isScriptOrStyle) {
      walker.skipChildren();
    }

    if (abortOnNestedAnchors && "a".equalsIgnoreCase(name)) {
      if (++anchorDepth > 1) {
        // a second anchor was seen: report it and stop collecting
        return true;
      }
    }

    if (type == Node.COMMENT_NODE) {
      walker.skipChildren();
    } else if (type == Node.TEXT_NODE) {
      // cleanup and trim the value
      String cleaned = current.getNodeValue().replaceAll("\\s+", " ").trim();
      if (cleaned.length() == 0) {
        appendParagraphSeparator(sb);
      } else {
        appendSpace(sb);
        sb.append(cleaned);
      }
    }
  }
  return false;
}
use of org.apache.nutch.util.NodeWalker in project nutch by apache.
the class DOMContentUtils method getOutlinks.
/**
 * Walks the DOM below the supplied <code>node</code>, and for every element
 * whose lower-cased tag name has an entry in <code>linkParams</code> creates an
 * {@link Outlink} (resolved relative to the supplied <code>base</code> URL) and
 * adds it to <code>outlinks</code>.
 *
 * <p>
 *
 * Links without inner structure (tags, text, etc) are discarded, as are links
 * which contain only single nested links and empty text nodes (this is a
 * common DOM-fixup artifact, at least with nekohtml). That decision is
 * delegated to <code>shouldThrowAwayLink</code>.
 *
 * <p>
 *
 * The anchor text is taken from <code>getText</code>. Links whose
 * <code>rel</code> attribute contains the token <code>nofollow</code>, or whose
 * <code>method</code> attribute is <code>post</code>, are not emitted. Targets
 * that cannot be resolved into a valid URL are silently skipped.
 *
 * @param base
 *          URL against which link targets are resolved
 * @param outlinks
 *          list receiving the discovered outlinks; appended to in place
 * @param node
 *          root of the DOM subtree to scan
 */
public void getOutlinks(URL base, ArrayList<Outlink> outlinks, Node node) {
  NodeWalker walker = new NodeWalker(node);
  while (walker.hasNext()) {
    Node currentNode = walker.nextNode();
    if (currentNode.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }
    // Locale.ROOT keeps the lookup key stable: with a Turkish default locale
    // "LINK".toLowerCase() yields a dotless i and the map lookup would miss.
    String nodeName = currentNode.getNodeName().toLowerCase(java.util.Locale.ROOT);
    LinkParams params = (LinkParams) linkParams.get(nodeName);
    if (params == null) {
      continue;
    }
    NodeList children = currentNode.getChildNodes();
    int childLen = (children != null) ? children.getLength() : 0;
    if (!shouldThrowAwayLink(currentNode, children, childLen, params)) {
      StringBuffer linkText = new StringBuffer();
      getText(linkText, currentNode, true);
      NamedNodeMap attrs = currentNode.getAttributes();
      String target = null;
      boolean noFollow = false;
      boolean post = false;
      for (int i = 0; i < attrs.getLength(); i++) {
        Node attr = attrs.item(i);
        String attrName = attr.getNodeName();
        if (params.attrName.equalsIgnoreCase(attrName)) {
          target = attr.getNodeValue();
        } else if ("rel".equalsIgnoreCase(attrName)) {
          // rel holds a whitespace-separated token list, so "nofollow" must be
          // looked for among the tokens rather than compared to the whole value
          String relValue = attr.getNodeValue();
          if (relValue != null) {
            for (String relToken : relValue.trim().split("\\s+")) {
              if ("nofollow".equalsIgnoreCase(relToken)) {
                noFollow = true;
                break;
              }
            }
          }
        } else if ("method".equalsIgnoreCase(attrName) && "post".equalsIgnoreCase(attr.getNodeValue())) {
          post = true;
        }
      }
      if (target != null && !noFollow && !post) {
        try {
          URL url = URLUtil.resolveURL(base, target);
          Outlink outlink = new Outlink(url.toString(), linkText.toString().trim());
          outlinks.add(outlink);
          // the outlink metadata
          if (keepNodenames) {
            MapWritable metadata = new MapWritable();
            metadata.put(new Text(srcTagMetaName), new Text(nodeName));
            outlink.setMetadata(metadata);
          }
        } catch (MalformedURLException ignored) {
          // best effort: a target that does not resolve to a URL is dropped
        }
      }
    }
    // this should not have any children, skip them
    // NOTE(review): as the last statement of the loop body this continue has
    // no effect; walker.skipChildren() may have been intended - confirm
    // before changing, since that would also drop links nested below.
    if (params.childLen == 0)
      continue;
  }
}
Aggregations