Use of org.jsoup.nodes.Node in the jsoup project (by jhy):
class Parser, method parseBodyFragment.
/**
 * Parse a fragment of HTML into the {@code body} of a freshly created shell Document.
 *
 * @param bodyHtml fragment of HTML
 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
 *
 * @return Document, with empty head, and HTML parsed into body
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
Document doc = Document.createShell(baseUri);
Element body = doc.body();
List<Node> nodeList = parseFragment(bodyHtml, body, baseUri);
// Snapshot into an array first: the node list gets modified when nodes are re-parented below.
Node[] nodes = nodeList.toArray(new Node[0]);
// Detach in reverse order so a removal never shifts a sibling we have yet to visit.
// NOTE(review): index 0 is deliberately left attached (loop condition is i > 0) —
// presumably appendChild below re-parents it anyway; confirm against Node#appendChild semantics.
for (int i = nodes.length - 1; i > 0; i--) {
nodes[i].remove();
}
// Re-attach every parsed node under the shell document's body, preserving original order.
for (Node node : nodes) {
body.appendChild(node);
}
return doc;
}
Use of org.jsoup.nodes.Node in the Asqatasun project (by Asqatasun):
class HTMLJsoupCleanerImpl, method removeComments.
/**
 * Recursively strips every comment node from the subtree rooted at the given node.
 *
 * @param node root of the subtree to clean
 */
private void removeComments(Node node) {
    // Manual cursor walk instead of foreach: removing a child while iterating the live
    // child list would throw a concurrent-modification error. Removing a child shifts
    // the rest left, so the cursor only advances when the current child is kept.
    int cursor = 0;
    while (cursor < node.childNodes().size()) {
        Node current = node.childNode(cursor);
        if ("#comment".equals(current.nodeName())) {
            current.remove();
        } else {
            removeComments(current);
            cursor++;
        }
    }
}
Use of org.jsoup.nodes.Node in the sonarqube project (by SonarSource):
class HtmlParagraphAssert, method toLines.
/**
 * Flattens the child nodes of {@code parent} into logical text lines, treating each
 * {@code <br>} element as a line separator. Text nodes contribute their raw string form;
 * any other element contributes its combined text.
 *
 * @param parent element whose children are split into lines
 * @return one entry per {@code <br>}-delimited line (an entry may be the empty string)
 * @throws IllegalStateException if a child is neither a text node nor an element
 */
private static List<String> toLines(Element parent) {
    Iterator<Node> children = parent.childNodes().iterator();
    if (!children.hasNext()) {
        return emptyList();
    }
    List<String> lines = new ArrayList<>(parent.childNodeSize());
    // Accumulates the line being built; null means "no line started yet".
    StringBuilder pending = null;
    while (children.hasNext()) {
        Node child = children.next();
        if (child instanceof TextNode) {
            pending = (pending == null) ? new StringBuilder(child.toString()) : pending.append(child.toString());
        } else if (child instanceof Element) {
            Element element = (Element) child;
            if ("br".equals(element.tagName())) {
                // Line break: flush whatever was accumulated (empty string if nothing).
                lines.add(pending == null ? "" : pending.toString());
                pending = null;
            } else {
                pending = (pending == null) ? new StringBuilder(element.text()) : pending.append(element.text());
            }
        } else {
            throw new IllegalStateException("unsupported node " + child.getClass());
        }
        // Last child: flush the trailing line, which has no <br> after it.
        if (!children.hasNext()) {
            lines.add(pending == null ? "" : pending.toString());
            pending = null;
        }
    }
    return lines;
}
Use of org.jsoup.nodes.Node in the Java-readability project (by basis-technology-corp):
class Readability, method changeElementTag.
/**
 * Replaces element {@code e} in the document with a new element of tag {@code newTag},
 * moving all of e's children into the replacement.
 *
 * @param e element to replace (detached from the tree by this call)
 * @param newTag tag name of the replacement element
 * @return the newly created element, already holding e's former children
 */
private Element changeElementTag(Element e, String newTag) {
    Element replacement = document.createElement(newTag);
    // JSoup exposes a live child list; snapshot it before re-parenting the children.
    List<Node> snapshot = new ArrayList<Node>(e.childNodes());
    for (Node child : snapshot) {
        child.remove();
        replacement.appendChild(child);
    }
    e.replaceWith(replacement);
    return replacement;
}
Use of org.jsoup.nodes.Node in the Java-readability project (by basis-technology-corp):
class Readability, method grabArticle.
// CHECKSTYLE:OFF
/**
 * Core readability pass: scores plausible content elements on the page, picks the
 * highest-scoring candidate, pulls in related sibling content, cleans the result, and
 * — if too little text survived — retries with progressively relaxed filtering flags.
 *
 * @param pageElement element to operate on; null means "use the whole document body"
 * @return a div wrapping the extracted article content, or null once all retry strategies fail
 */
private Element grabArticle(Element pageElement) {
// A non-null argument signals a paginated re-entry; remembered for the id attribute below.
boolean isPaging = pageElement != null;
if (pageElement == null) {
pageElement = body;
}
// Keep the original markup so a failed pass can restore the page before retrying.
String pageCacheHtml = pageElement.html();
Elements allElements = pageElement.getAllElements();
/*
 * Note: in Javascript, this list would be *live*. If you deleted a node from the tree, it and its
 * children would remove themselves. To get the same effect, we make a linked list and we remove
 * things from it. This won't win prizes for speed, but, then again, the code in Javascript has to be
 * doing something nearly as awful.
 */
LinkedList<Element> allElementsList = new LinkedList<Element>();
allElementsList.addAll(allElements);
/**
 * First, node prepping. Trash nodes that look cruddy (like ones with the class name "comment", etc),
 * and turn divs into P tags where they have been used inappropriately (as in, where they contain no
 * other block level elements.) Note: Assignment from index for performance. See
 * http://www.peachpit.com/articles/article.aspx?p=31567&seqNum=5 TODO: Shouldn't this be a reverse
 * traversal?
 */
List<Element> nodesToScore = new ArrayList<Element>();
ListIterator<Element> elIterator = allElementsList.listIterator();
// Elements already condemned by an earlier "unlikely candidate" purge (including their
// descendants); they are skipped when the flat iteration reaches them later.
Set<Element> goodAsDead = new HashSet<Element>();
while (elIterator.hasNext()) {
Element node = elIterator.next();
if (goodAsDead.contains(node)) {
continue;
}
/* Remove unlikely candidates */
if (stripUnlikelyCandidates) {
String unlikelyMatchString = node.className() + node.id();
if (Patterns.exists(Patterns.UNLIKELY_CANDIDATES, unlikelyMatchString) && !Patterns.exists(Patterns.OK_MAYBE_ITS_A_CANDIDATE, unlikelyMatchString) && !"body".equals(node.tagName())) {
LOG.debug("Removing unlikely candidate - " + unlikelyMatchString);
// Condemn the node and its whole subtree in one step.
List<Element> toRemoveAndBelow = node.getAllElements();
elIterator.remove();
/*
 * adding 'node' to that set is harmless and reduces the code complexity here.
 */
goodAsDead.addAll(toRemoveAndBelow);
continue;
}
}
// Paragraph-like tags are the units that actually get scored below.
if ("p".equals(node.tagName()) || "td".equals(node.tagName()) || "pre".equals(node.tagName())) {
nodesToScore.add(node);
}
/*
 * Turn all divs that don't have children block level elements into p's
 */
if ("div".equals(node.tagName())) {
boolean hasBlock = false;
for (Element divChild : node.getAllElements()) {
// getAllElements() includes the node itself, hence the self-check.
if (divChild != node) {
if (DIV_TO_P_ELEMENTS.contains(divChild.tagName())) {
hasBlock = true;
break;
}
}
}
if (!hasBlock) {
Element newElement = changeElementTag(node, "p");
nodesToScore.remove(node);
nodesToScore.add(newElement);
} else {
/* EXPERIMENTAL */
/*
 * grab just child text and wrap each chunk in a p
 */
int limit = node.childNodes().size();
for (int i = 0; i < limit; i++) {
Node childNode = node.childNodes().get(i);
if (childNode instanceof TextNode) {
Element p = document.createElement("p");
// Mark the synthetic wrapper so later stages can tell it apart from author markup.
p.attr("basisInline", "true");
p.html(((TextNode) childNode).text());
childNode.replaceWith(p);
}
}
}
}
}
/**
 * Loop through all paragraphs, and assign a score to them based on how content-y they look. Then add
 * their score to their parent node. A score is determined by things like number of commas, class
 * names, etc. Maybe eventually link density.
 */
List<Element> candidates = new ArrayList<Element>();
for (Element nodeToScore : nodesToScore) {
Element parentNode = nodeToScore.parent();
if (null == parentNode) {
// dropped previously.
continue;
}
Element grandParentNode = parentNode.parent();
if (grandParentNode == null) {
// ditto
continue;
}
String innerText = nodeToScore.text();
/*
 * If this paragraph is less than 25 characters, don't even count it.
 */
if (innerText.length() < 25) {
continue;
}
/* Initialize readability data for the parent. */
// An empty "readability" attribute means the node has not been scored/registered yet.
if ("".equals(parentNode.attr("readability"))) {
initializeNode(parentNode);
candidates.add(parentNode);
}
/*
 * If the grandparent has no parent, we don't want it as a candidate. It's probably a symptom that
 * we're operating in an orphan.
 */
if (grandParentNode.parent() != null && "".equals(grandParentNode.attr("readability"))) {
initializeNode(grandParentNode);
candidates.add(grandParentNode);
}
double contentScore = 0;
/* Add a point for the paragraph itself as a base. */
contentScore++;
/* Add points for any commas within this paragraph */
contentScore += innerText.split(",").length;
/*
 * For every 100 characters in this paragraph, add another point. Up to 3 points.
 */
contentScore += Math.min(Math.floor(innerText.length() / 100.0), 3.0);
/* Add the score to the parent. The grandparent gets half. */
incrementContentScore(parentNode, contentScore);
// NOTE(review): grandParentNode was already null-checked at the top of this loop body
// (the continue above), so this guard is redundant — harmless, but could be removed.
if (grandParentNode != null) {
incrementContentScore(grandParentNode, contentScore / 2.0);
}
}
/**
 * After we've calculated scores, loop through all of the possible candidate nodes we found and find
 * the one with the highest score.
 */
Element topCandidate = null;
for (Element candidate : candidates) {
/**
 * Scale the final candidates score based on link density. Good content should have a relatively
 * small link density (5% or less) and be mostly unaffected by this operation.
 */
double score = getContentScore(candidate);
double newScore = score * (1.0 - getLinkDensity(candidate));
setContentScore(candidate, newScore);
LOG.debug("Candidate [" + candidate.getClass() + "] (" + candidate.className() + ":" + candidate.id() + ") with score " + newScore);
if (null == topCandidate || newScore > getContentScore(topCandidate)) {
topCandidate = candidate;
}
}
/**
 * If we still have no top candidate, just use the body as a last resort. We also have to copy the
 * body node so it is something we can modify.
 */
if (topCandidate == null || topCandidate == body) {
topCandidate = document.createElement("div");
// not efficient but not likely.
topCandidate.html(pageElement.html());
pageElement.html("");
pageElement.appendChild(topCandidate);
initializeNode(topCandidate);
}
/**
 * Now that we have the top candidate, look through its siblings for content that might also be
 * related. Things like preambles, content split by ads that we removed, etc.
 */
Element articleContent = document.createElement("div");
if (isPaging) {
articleContent.attr("id", "readability-content");
}
// Siblings must reach at least 20% of the top candidate's score (floor of 10) to be pulled in.
double siblingScoreThreshold = Math.max(10, getContentScore(topCandidate) * 0.2);
List<Element> siblingNodes = topCandidate.parent().children();
for (Element siblingNode : siblingNodes) {
boolean scored = isElementScored(siblingNode);
boolean append = false;
LOG.debug("Looking at sibling node: [" + siblingNode.getClass() + "] (" + siblingNode.className() + ":" + siblingNode.id() + ")");
if (scored) {
LOG.debug("Sibling has score " + getContentScore(siblingNode));
} else {
LOG.debug("Sibling has score unknown");
}
// The top candidate itself is always kept.
if (siblingNode == topCandidate) {
append = true;
}
double contentBonus = 0;
/*
 * Give a bonus if sibling nodes and top candidates have the example same classname
 */
if (siblingNode.className().equals(topCandidate.className()) && !"".equals(topCandidate.className())) {
contentBonus += getContentScore(topCandidate) * 0.2;
}
if (scored && (getContentScore(siblingNode) + contentBonus >= siblingScoreThreshold)) {
append = true;
}
// Unscored paragraphs can still qualify on raw text length / link density heuristics.
if ("p".equals(siblingNode.tagName())) {
double linkDensity = getLinkDensity(siblingNode);
String nodeContent = siblingNode.text();
int nodeLength = nodeContent.length();
if (nodeLength > 80 && linkDensity < 0.25) {
append = true;
} else if (nodeLength < 80 && linkDensity == 0 && Patterns.exists(Patterns.ENDS_WITH_DOT, nodeContent)) {
append = true;
}
}
if (append) {
LOG.debug("Appending node: [" + siblingNode.getClass() + "]");
Element nodeToAppend = null;
if (!"div".equals(siblingNode.tagName()) && !"p".equals(siblingNode.tagName())) {
/*
 * We have a node that isn't a common block level element, like a form or td tag. Turn it
 * into a div so it doesn't get filtered out later by accident.
 */
LOG.debug("Altering siblingNode of " + siblingNode.tagName() + " to div.");
nodeToAppend = changeElementTag(siblingNode, "div");
} else {
nodeToAppend = siblingNode;
}
/*
 * To ensure a node does not interfere with readability styles, remove its classnames
 */
nodeToAppend.removeAttr("class");
/*
 * Append sibling and subtract from our list because it removes the node when you append to
 * another node
 */
articleContent.appendChild(nodeToAppend);
}
}
// Replace the document body wholesale with the assembled article.
document.body().empty();
document.body().appendChild(articleContent);
/**
 * So we have all of the content that we need. Now we clean it up for presentation.
 */
prepArticle(articleContent);
/**
 * Now that we've gone through the full algorithm, check to see if we got any meaningful content. If
 * we didn't, we may need to re-run grabArticle with different flags set. This gives us a higher
 * likelihood of finding the content, and the sieve approach gives us a higher likelihood of finding
 * the -right- content.
 */
if (articleContent.text().length() < 250) {
// Restore the original markup, then retry with one flag relaxed at a time. Each flag is
// toggled off for the recursive call and restored in finally, so instance state stays clean.
pageElement.html(pageCacheHtml);
if (stripUnlikelyCandidates) {
try {
stripUnlikelyCandidates = false;
return grabArticle(pageElement);
} finally {
stripUnlikelyCandidates = true;
}
} else if (classWeight) {
try {
classWeight = false;
return grabArticle(pageElement);
} finally {
classWeight = true;
}
} else if (cleanConditionally) {
try {
cleanConditionally = false;
return grabArticle(pageElement);
} finally {
cleanConditionally = true;
}
} else {
// Every relaxation has been tried; give up.
return null;
}
}
return articleContent;
}
Aggregations