use of org.jsoup.nodes.Node in project jsoup by jhy.
the class Cleaner method isValidBodyHtml.
/**
 * Checks whether a body HTML fragment passes this cleaner without anything being dropped.
 * The fragment is parsed into the body of a throwaway shell document with parse-error
 * tracking enabled, and its nodes are then run through {@code copySafeNodes} into a second
 * throwaway shell.
 *
 * @param bodyHtml the HTML fragment to check
 * @return {@code true} only when {@code copySafeNodes} reports zero discarded nodes and the
 *         parser recorded no errors
 */
public boolean isValidBodyHtml(String bodyHtml) {
    final Document sanitized = Document.createShell("");
    final Document unsanitized = Document.createShell("");
    final ParseErrorList parseErrors = ParseErrorList.tracking(1);

    // Parse in the context of the unsanitized body, then attach the parsed nodes to it.
    final List<Node> parsed = Parser.parseFragment(bodyHtml, unsanitized.body(), "", parseErrors);
    unsanitized.body().insertChildren(0, parsed);

    final int discardedCount = copySafeNodes(unsanitized.body(), sanitized.body());
    if (discardedCount != 0) {
        return false;
    }
    return parseErrors.size() == 0;
}
use of org.jsoup.nodes.Node in project Asqatasun by Asqatasun.
the class HTMLJsoupCleanerImpl method removeComments.
/**
 * Strips every comment node from the subtree rooted at the given node.
 *
 * @param node root of the subtree to clean; it is modified in place
 */
private void removeComments(Node node) {
    // Removing a child shifts its later siblings down one slot, so the cursor only
    // advances when the current child is kept. A for-each over the children would
    // fail with a concurrent modification error.
    for (int cursor = 0; cursor < node.childNodes().size(); ) {
        Node current = node.childNode(cursor);
        boolean isComment = current.nodeName().equals("#comment");
        if (!isComment) {
            removeComments(current);
            cursor++;
            continue;
        }
        current.remove();
    }
}
use of org.jsoup.nodes.Node in project pratilipi by Pratilipi.
the class PratilipiDocUtil method _setPage.
/**
 * Replaces all pagelets of {@code page} with pagelets derived from {@code html}.
 *
 * <p>Existing pagelets are always deleted first. When {@code html} is null or blank the page
 * is left empty. Otherwise the HTML is parsed, validated with {@code _validateContent}, and
 * each direct child of the body is mapped to a pagelet:
 * <ul>
 * <li>{@code p} whose only child is an {@code img}: IMAGE pagelet</li>
 * <li>any other {@code p}: HTML pagelet with its inner HTML and alignment</li>
 * <li>{@code img}: IMAGE pagelet</li>
 * <li>{@code blockquote}, {@code ol}, {@code ul}: BLOCK_QUOTE, LIST_ORDERED, LIST_UNORDERED</li>
 * </ul>
 * Any other top-level node is ignored. An image for which {@code _createImageData} returns
 * null is skipped.
 *
 * @param pratilipiId id passed through to {@code _createImageData}
 * @param page the page whose pagelets are replaced
 * @param html the page content as HTML; may be null
 * @throws InvalidArgumentException if {@code _validateContent} reports a node; the message
 *         lists the node names on the path from the body down to that node
 * @throws UnexpectedServerException declared for the helpers this method calls
 */
private static void _setPage(Long pratilipiId, PratilipiContentDoc.Page page, String html) throws InvalidArgumentException, UnexpectedServerException {
    // Deleting Existing Pagelets
    page.deleteAllPagelets();
    // Adding Pagelets from HTML
    if (html != null && !html.trim().isEmpty()) {
        Node body = Jsoup.parse(html).body();
        Node badNode = _validateContent(body);
        if (badNode != null) {
            // Walk up from the offending node to the body, prepending each node name so the
            // message reads top-down: " > outer > inner".
            String errMsg = "";
            while (badNode != body) {
                errMsg = " > " + badNode.nodeName() + errMsg;
                badNode = badNode.parent();
            }
            errMsg = "Invalid node " + errMsg;
            throw new InvalidArgumentException(errMsg);
        }
        for (Node node : body.childNodes()) {
            if (node.nodeName().equals("p")) {
                // Compare the child's node name, not the Node itself: Node.equals(String)
                // can never be true, which previously made this image branch unreachable.
                if (node.childNodeSize() == 1 && node.childNode(0).nodeName().equals("img")) {
                    JsonObject imgData = _createImageData(pratilipiId, node.childNode(0));
                    if (imgData != null) {
                        page.addPagelet(PageletType.IMAGE, imgData);
                    }
                } else {
                    page.addPagelet(PageletType.HTML, ((Element) node).html(), _getAlignment(node));
                }
            } else if (node.nodeName().equals("img")) {
                JsonObject imgData = _createImageData(pratilipiId, node);
                if (imgData != null) {
                    page.addPagelet(PageletType.IMAGE, imgData);
                }
            } else if (node.nodeName().equals("blockquote")) {
                page.addPagelet(PageletType.BLOCK_QUOTE, ((Element) node).html());
            } else if (node.nodeName().equals("ol")) {
                page.addPagelet(PageletType.LIST_ORDERED, ((Element) node).html());
            } else if (node.nodeName().equals("ul")) {
                page.addPagelet(PageletType.LIST_UNORDERED, ((Element) node).html());
            }
        }
    }
}
use of org.jsoup.nodes.Node in project pratilipi by Pratilipi.
the class PratilipiDocUtil method _createPageletList.
/**
 * Recursively converts the children of {@code node} into a flat list of pagelets.
 *
 * <p>Each list entry is a three-element array: {@code [0]} the
 * {@code PratilipiContentDoc.PageletType}, {@code [1]} the content (a String, or a
 * {@code JsonObject} for images) and {@code [2]} the alignment or null.
 *
 * <p>Mapping of child nodes:
 * <ul>
 * <li>{@code body}, {@code div}, {@code p}: processed recursively; an empty result becomes a
 * single {@code <br/>} HTML pagelet, otherwise the container's alignment is applied to its
 * TEXT/HTML pagelets that have none</li>
 * <li>{@code h1}, {@code h2}: HEAD pagelet; consecutive headings are joined with " - "</li>
 * <li>{@code img}: IMAGE pagelet, after locating or storing the image blob; images that
 * cannot be resolved are skipped</li>
 * <li>{@code br}: appends {@code <br/>} to the current HTML pagelet, if any</li>
 * <li>anything else: its extracted text is appended to the current HTML pagelet or starts a
 * new one; bold and h3-h6 text is wrapped in {@code <b>}</li>
 * </ul>
 *
 * @param pratilipi the content owner; its id is used to build image blob names
 * @param node the node whose children are converted
 * @return the pagelets in document order; never null
 * @throws UnexpectedServerException declared for the storage/HTTP helpers this method calls
 */
private static List<Object[]> _createPageletList(Pratilipi pratilipi, Node node) throws UnexpectedServerException {
    List<Object[]> pageletList = new LinkedList<>();
    // The pagelet that following inline content may be merged into. Containers and images
    // reset it so that content after them starts a new pagelet.
    Object[] currPagelet = null;
    for (Node childNode : node.childNodes()) {
        if (childNode.nodeName().equals("body") || childNode.nodeName().equals("div") || childNode.nodeName().equals("p")) {
            currPagelet = null;
            List<Object[]> pList = _createPageletList(pratilipi, childNode);
            if (pList.size() == 0) {
                // An empty container is kept as a line break.
                pageletList.add(new Object[] { PratilipiContentDoc.PageletType.HTML, "<br/>", null });
            } else {
                AlignmentType alignment = _getAlignment(childNode);
                if (alignment != null) {
                    // Alignment set by a nested container wins; only fill in where missing.
                    for (Object[] pagelet : pList) {
                        if (pagelet[2] == null && (pagelet[0] == PratilipiContentDoc.PageletType.TEXT || pagelet[0] == PratilipiContentDoc.PageletType.HTML)) {
                            pagelet[2] = alignment;
                        }
                    }
                }
                pageletList.addAll(pList);
            }
        } else if (childNode.nodeName().equals("h1") || childNode.nodeName().equals("h2")) {
            String text = _extractText(childNode);
            if (text == null) {
                continue;
            }
            if (currPagelet != null && currPagelet[0] == PratilipiContentDoc.PageletType.HEAD) {
                currPagelet[1] = currPagelet[1] + " - " + text;
            } else {
                currPagelet = new Object[] { PratilipiContentDoc.PageletType.HEAD, text, null };
                pageletList.add(currPagelet);
            }
        } else if (childNode.nodeName().equals("img")) {
            currPagelet = null;
            BlobAccessor blobAccessor = DataAccessorFactory.getBlobAccessor();
            BlobEntry blobEntry = null;
            String imageUrl = childNode.attr("src");
            String imageName = null;
            if (imageUrl.indexOf("name=") != -1) {
                // The src carries the image name as a query parameter.
                imageName = imageUrl.substring(imageUrl.indexOf("name=") + 5);
                if (imageName.indexOf('&') != -1) {
                    imageName = imageName.substring(0, imageName.indexOf('&'));
                }
                imageName = imageName.replace("%20", " ");
                String fileName = _createImageFullName(pratilipi.getId(), imageName);
                blobEntry = blobAccessor.getBlob(fileName);
                if (blobEntry == null) {
                    // Copying from old resource location
                    blobEntry = blobAccessor.getBlob("pratilipi-resource/" + pratilipi.getId() + "/" + imageName);
                    if (blobEntry != null) {
                        blobEntry.setName(fileName);
                        blobAccessor.createOrUpdateBlob(blobEntry);
                    }
                }
                if (blobEntry == null && imageUrl.indexOf("pratilipiId=") != -1) {
                    // Copying from old resource location of another Pratilipi
                    String pratilipiIdStr = imageUrl.substring(imageUrl.indexOf("pratilipiId=") + 12);
                    if (pratilipiIdStr.indexOf('&') != -1) {
                        pratilipiIdStr = pratilipiIdStr.substring(0, pratilipiIdStr.indexOf('&'));
                    }
                    blobEntry = blobAccessor.getBlob("pratilipi-resource/" + pratilipiIdStr + "/" + imageName);
                    if (blobEntry != null) {
                        blobEntry.setName(fileName);
                        blobAccessor.createOrUpdateBlob(blobEntry);
                    }
                }
                if (blobEntry == null) {
                    continue;
                }
            } else if (imageUrl.startsWith("http")) {
                // External image: derive a storage name from the URL and fetch it once.
                imageName = imageUrl.replaceAll("[:/.?=&+]+", "_");
                String fileName = _createImageFullName(pratilipi.getId(), imageName);
                blobEntry = blobAccessor.getBlob(fileName);
                if (blobEntry == null) {
                    blobEntry = HttpUtil.doGet(imageUrl);
                    if (!blobEntry.getMimeType().startsWith("image/")) {
                        continue;
                    }
                    blobEntry.setName(fileName);
                    blobAccessor.createOrUpdateBlob(blobEntry);
                }
            } else if (imageUrl.startsWith("data:") && imageUrl.indexOf("base64") != -1) {
                // Inline data URL: store the decoded bytes under a random name.
                // NOTE(review): assumes the form "data:<mime>;base64,<payload>" - confirm.
                imageName = UUID.randomUUID().toString();
                String mimeType = imageUrl.substring(5, imageUrl.indexOf(';'));
                String base64String = imageUrl.substring(imageUrl.indexOf("base64,") + 7);
                blobEntry = blobAccessor.newBlob(_createImageFullName(pratilipi.getId(), imageName), Base64.decodeBase64(base64String), mimeType);
                blobAccessor.createOrUpdateBlob(blobEntry);
            } else {
                // Local file references ("file:///", "C:") and any other src form that
                // cannot be resolved to a blob are skipped. Without this, an unrecognized
                // src would reach the size lookup below with a null blob and a null name.
                continue;
            }
            JsonObject imgData = new JsonObject();
            imgData.addProperty("name", imageName);
            imgData.addProperty("height", ImageUtil.getHeight(blobEntry));
            imgData.addProperty("width", ImageUtil.getWidth(blobEntry));
            pageletList.add(new Object[] { PratilipiContentDoc.PageletType.IMAGE, imgData, null });
        } else if (childNode.nodeName().equals("br")) {
            // A line break only matters inside running HTML content.
            if (currPagelet != null && currPagelet[0] == PratilipiContentDoc.PageletType.HTML) {
                currPagelet[1] = currPagelet[1] + "<br/>";
            }
        } else {
            String text = _extractText(childNode);
            if (text == null) {
                continue;
            }
            if (childNode.nodeName().equals("b") || childNode.nodeName().equals("strong") || childNode.nodeName().equals("h3") || childNode.nodeName().equals("h4") || childNode.nodeName().equals("h5") || childNode.nodeName().equals("h6")) {
                text = "<b>" + text + "</b>";
            }
            if (currPagelet == null || currPagelet[0] != PratilipiContentDoc.PageletType.HTML) {
                currPagelet = new Object[] { PratilipiContentDoc.PageletType.HTML, text, null };
                pageletList.add(currPagelet);
            } else {
                currPagelet[1] = currPagelet[1] + " " + text;
            }
        }
    }
    return pageletList;
}
use of org.jsoup.nodes.Node in project Lightning-Browser by anthonycr.
the class OutputFormatter method appendTextSkipHidden.
/**
 * Appends the text of the element's subtree to {@code accum}, skipping every child for
 * which {@code unlikely} returns true.
 *
 * @param e the element whose children are visited
 * @param accum the buffer receiving the text
 * @param indent current recursion depth; only passed on, incremented, to nested calls
 */
private void appendTextSkipHidden(Element e, StringBuilder accum, int indent) {
    for (Node current : e.childNodes()) {
        if (unlikely(current)) {
            continue;
        }
        if (current instanceof TextNode) {
            accum.append(((TextNode) current).text());
            continue;
        }
        if (!(current instanceof Element)) {
            continue;
        }
        Element nested = (Element) current;
        // A space goes in front of a block element when the buffer has content that does
        // not already end in whitespace, and in front of every <br> that did not get one
        // through the block rule.
        boolean needsBlockSeparator = accum.length() > 0 && nested.isBlock() && !lastCharIsWhitespace(accum);
        if (needsBlockSeparator || nested.tagName().equals("br")) {
            accum.append(' ');
        }
        appendTextSkipHidden(nested, accum, indent + 1);
    }
}
Aggregations