use of org.apache.nutch.util.NodeWalker in project nutch by apache.
the class DOMContentUtils method getTitle.
/**
 * Appends the text found beneath the first <code>title</code> element under
 * the given DOM {@link Node} to the supplied {@link StringBuffer}. The search
 * gives up as soon as a node named <code>body</code> is reached.
 *
 * @param sb
 *          buffer that receives the title text
 * @param node
 *          root of the DOM subtree to search
 * @return true if a title node was found, false otherwise
 */
public boolean getTitle(StringBuffer sb, Node node) {
  for (NodeWalker walker = new NodeWalker(node); walker.hasNext();) {
    Node current = walker.nextNode();
    String name = current.getNodeName();

    // A BODY node means HEAD is behind us: no title will follow.
    if ("body".equalsIgnoreCase(name)) {
      return false;
    }

    boolean isTitleElement = current.getNodeType() == Node.ELEMENT_NODE
        && "title".equalsIgnoreCase(name);
    if (isTitleElement) {
      getText(sb, current);
      return true;
    }
  }
  return false;
}
use of org.apache.nutch.util.NodeWalker in project nutch by apache.
the class HeadingsParseFilter method getElement.
/**
 * Finds elements whose name matches the requested one (ignoring case) and
 * returns their text values in the order the walker visits them.
 *
 * @param doc
 *          document fragment to search
 * @param element
 *          name of the element to look for
 * @return the values of the matching elements; an empty list if none matched.
 *         When <code>multiValued</code> is false, at most one value is
 *         returned.
 */
protected List<String> getElement(DocumentFragment doc, String element) {
  List<String> values = new ArrayList<>();
  NodeWalker walker = new NodeWalker(doc);

  while (walker.hasNext()) {
    Node candidate = walker.nextNode();
    if (candidate.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }
    if (!element.equalsIgnoreCase(candidate.getNodeName())) {
      continue;
    }

    values.add(getNodeValue(candidate));

    // Keep walking to discover more headings only when multiple values are
    // wanted; otherwise the first match is the answer.
    if (!multiValued) {
      return values;
    }
  }

  return values;
}
use of org.apache.nutch.util.NodeWalker in project nutch by apache.
the class HeadingsParseFilter method getNodeValue.
/**
 * Returns the text value of the specified Node and its child nodes.
 * <p>
 * The values of all text nodes visited by the walker are concatenated, the
 * result is trimmed, every match of <code>whitespacePattern</code> is replaced
 * by a single space, and the outcome is trimmed once more.
 *
 * @param node
 *          node whose text should be extracted
 * @return the concatenated and whitespace-normalized text
 */
protected static String getNodeValue(Node node) {
  StringBuilder text = new StringBuilder();
  for (NodeWalker walker = new NodeWalker(node); walker.hasNext();) {
    Node visited = walker.nextNode();
    if (visited.getNodeType() == Node.TEXT_NODE) {
      text.append(visited.getNodeValue());
    }
  }

  // Strip surplus whitespace before handing the value back
  String trimmed = text.toString().trim();
  String normalized = whitespacePattern.matcher(trimmed).replaceAll(" ");
  return normalized.trim();
}
use of org.apache.nutch.util.NodeWalker in project nutch by apache.
the class DOMContentUtils method getTextHelper.
// Walks the subtree below the given node and appends its text to sb.
// Children of script and style nodes and of comment nodes are skipped.
// Returns true if abortOnNestedAnchors is true and we find nested anchors;
// in that case the walk stops at the offending anchor.
private boolean getTextHelper(StringBuffer sb, Node node, boolean abortOnNestedAnchors, int anchorDepth) {
  NodeWalker walker = new NodeWalker(node);

  while (walker.hasNext()) {
    Node current = walker.nextNode();
    String name = current.getNodeName();
    short type = current.getNodeType();

    // script and style content is not page text
    if ("script".equalsIgnoreCase(name) || "style".equalsIgnoreCase(name)) {
      walker.skipChildren();
    }

    if (abortOnNestedAnchors && "a".equalsIgnoreCase(name)) {
      anchorDepth += 1;
      if (anchorDepth > 1) {
        // a second anchor was reached during this walk: abort
        return true;
      }
    }

    if (type == Node.COMMENT_NODE) {
      walker.skipChildren();
    } else if (type == Node.TEXT_NODE) {
      // squeeze whitespace runs into single spaces and trim the value
      String text = current.getNodeValue().replaceAll("\\s+", " ").trim();
      if (text.isEmpty()) {
        // whitespace-only text node
        appendParagraphSeparator(sb);
      } else {
        appendSpace(sb);
        sb.append(text);
      }
    }
  }

  return false;
}
use of org.apache.nutch.util.NodeWalker in project nutch by apache.
the class DOMContentUtils method getBase.
/**
 * If the Node contains a BASE tag then its HREF is returned.
 * <p>
 * The search stops at the first BODY element. Element and attribute names
 * are compared ignoring case.
 *
 * @param node
 *          root of the DOM subtree to search
 * @return the value of the HREF attribute of a BASE element, or
 *         <code>null</code> if none is found before a BODY element or the
 *         end of the walk
 */
public String getBase(Node node) {
  NodeWalker walker = new NodeWalker(node);

  while (walker.hasNext()) {
    Node current = walker.nextNode();
    if (current.getNodeType() != Node.ELEMENT_NODE) {
      continue;
    }

    String name = current.getNodeName();
    if ("body".equalsIgnoreCase(name)) {
      // stop after HEAD
      return null;
    }
    if (!"base".equalsIgnoreCase(name)) {
      continue;
    }

    // this is a BASE tag: look for its HREF attribute
    NamedNodeMap attributes = current.getAttributes();
    for (int idx = 0; idx < attributes.getLength(); idx++) {
      Node attribute = attributes.item(idx);
      if ("href".equalsIgnoreCase(attribute.getNodeName())) {
        return attribute.getNodeValue();
      }
    }
  }

  // no BASE tag with an HREF was found
  return null;
}
Aggregations