use of org.htmlparser.Node in project jforum2 by rafaelsteil.
the class SafeHtml method makeSafe.
/**
* Given an input, makes it safe for HTML display.
* Removes any disallowed HTML tag or attribute, as well as
* unwanted JavaScript statements inside the tags.
* @param contents the input to analyze
* @return the modified and safe string
*/
public String makeSafe(String contents) {
if (contents == null || contents.length() == 0) {
return contents;
}
StringBuffer sb = new StringBuffer(contents.length());
try {
Lexer lexer = new Lexer(contents);
Node node;
while ((node = lexer.nextNode()) != null) {
boolean isTextNode = node instanceof TextNode;
if (isTextNode) {
// Text nodes are raw data, so we just
// strip off all possible html content
String text = node.toHtml();
if (text.indexOf('>') > -1 || text.indexOf('<') > -1) {
StringBuffer tmp = new StringBuffer(text);
ViewCommon.replaceAll(tmp, "<", "<");
ViewCommon.replaceAll(tmp, ">", ">");
ViewCommon.replaceAll(tmp, "\"", """);
node.setText(tmp.toString());
}
}
if (isTextNode || (node instanceof Tag && this.isTagWelcome(node))) {
sb.append(node.toHtml());
} else {
StringBuffer tmp = new StringBuffer(node.toHtml());
ViewCommon.replaceAll(tmp, "<", "<");
ViewCommon.replaceAll(tmp, ">", ">");
sb.append(tmp.toString());
}
}
} catch (Exception e) {
throw new ForumException("Error while parsing HTML: " + e, e);
}
return sb.toString();
}
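A minimal standalone sketch of the same Lexer loop, using only the stock org.htmlparser API. The tag whitelist and the ViewCommon.replaceAll helper are jforum2-specific, so this hypothetical example hard-codes a single allowed tag and escapes with plain String.replace; unlike makeSafe, it does not also escape stray angle brackets inside text nodes.

import org.htmlparser.Node;
import org.htmlparser.Tag;
import org.htmlparser.lexer.Lexer;
import org.htmlparser.util.ParserException;

public class LexerEscapeSketch {

    // Escapes every tag that is not on a (hard-coded) whitelist; text nodes pass through.
    public static String escapeDisallowedTags(String html) throws ParserException {
        StringBuilder out = new StringBuilder(html.length());
        Lexer lexer = new Lexer(html);
        Node node;
        while ((node = lexer.nextNode()) != null) {
            if (node instanceof Tag && !"B".equalsIgnoreCase(((Tag) node).getTagName())) {
                // Neutralize the tag by escaping its angle brackets
                out.append(node.toHtml().replace("<", "&lt;").replace(">", "&gt;"));
            } else {
                out.append(node.toHtml());
            }
        }
        return out.toString();
    }

    public static void main(String[] args) throws ParserException {
        System.out.println(escapeDisallowedTags("<b>ok</b> <script>alert(1)</script>"));
        // expected: <b>ok</b> &lt;script&gt;alert(1)&lt;/script&gt;
    }
}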
use of org.htmlparser.Node in project dhis2-core by dhis2.
the class GridUtils method getColumnCount.
/**
* Returns the number of columns/cells in the given row, taking the
* colspan attribute into account.
*/
private static int getColumnCount(TableRow row) {
Node[] cells = row.getChildren().extractAllNodesThatMatch(HTML_ROW_FILTER).toNodeArray();
int cols = 0;
for (Node cell : cells) {
Integer colSpan = MathUtils.parseInt(((TagNode) cell).getAttribute("colspan"));
cols += colSpan != null ? colSpan : 1;
}
return cols;
}
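HTML_ROW_FILTER and MathUtils are dhis2 internals, so the following is only a hedged sketch of the same colspan-aware counting: it parses a table fragment with the plain htmlparser API, filters the row's children down to TableColumn nodes, and falls back to Integer.parseInt for the span.

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class ColumnCountSketch {

    // Counts the columns of the first table row in the fragment, honouring colspan.
    public static int columnCount(String html) throws ParserException {
        Parser parser = Parser.createParser(html, "UTF-8");
        NodeList rows = parser.extractAllNodesThatMatch(new NodeClassFilter(TableRow.class));
        if (rows.size() == 0) {
            return 0;
        }
        TableRow row = (TableRow) rows.elementAt(0);
        NodeList cells = row.getChildren().extractAllNodesThatMatch(new NodeClassFilter(TableColumn.class));
        int cols = 0;
        for (Node cell : cells.toNodeArray()) {
            String colSpan = ((TableColumn) cell).getAttribute("colspan");
            int span = 1;
            if (colSpan != null) {
                try {
                    span = Integer.parseInt(colSpan.trim());
                } catch (NumberFormatException ignored) {
                    // fall back to a single column on a malformed colspan
                }
            }
            cols += span;
        }
        return cols;
    }

    public static void main(String[] args) throws ParserException {
        System.out.println(columnCount("<table><tr><td colspan=\"2\">a</td><td>b</td></tr></table>"));
        // expected: 3
    }
}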
use of org.htmlparser.Node in project jforum2 by rafaelsteil.
the class SafeHtml method ensureAllAttributesAreSafe.
/**
* Given an input, analyzes each HTML tag and removes insecure attributes from it.
* @param contents The content to verify
* @return the sanitized content
*/
public String ensureAllAttributesAreSafe(String contents) {
StringBuffer sb = new StringBuffer(contents.length());
try {
Lexer lexer = new Lexer(contents);
Node node;
while ((node = lexer.nextNode()) != null) {
if (node instanceof Tag) {
Tag tag = (Tag) node;
this.checkAndValidateAttributes(tag, false);
sb.append(tag.toHtml());
} else {
sb.append(node.toHtml());
}
}
} catch (Exception e) {
throw new ForumException("Problems while parsing HTML: " + e, e);
}
return sb.toString();
}
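checkAndValidateAttributes is internal to jforum2 and driven by its configuration, so the sketch below only illustrates the general kind of filtering such a method performs: dropping on* event handlers and javascript: URLs via the Tag.getAttributesEx/setAttributesEx pair of htmlparser 1.6. The method name and the exact rules are assumptions, not the jforum2 implementation.

import java.util.Vector;

import org.htmlparser.Attribute;
import org.htmlparser.Tag;

public class AttributeFilterSketch {

    // Rebuilds the tag's attribute list without on* event handlers or javascript: URLs.
    @SuppressWarnings({ "rawtypes", "unchecked" })
    public static void stripUnsafeAttributes(Tag tag) {
        Vector attributes = tag.getAttributesEx();
        Vector clean = new Vector();
        // element 0 of htmlparser's attribute vector is the tag name itself
        if (!attributes.isEmpty()) {
            clean.add(attributes.get(0));
        }
        for (int i = 1; i < attributes.size(); i++) {
            Attribute attribute = (Attribute) attributes.get(i);
            String name = attribute.getName();
            if (name == null) {
                clean.add(attribute); // whitespace between attributes
                continue;
            }
            String value = attribute.getValue();
            boolean eventHandler = name.toLowerCase().startsWith("on");
            boolean scriptUrl = value != null && value.trim().toLowerCase().startsWith("javascript:");
            if (!eventHandler && !scriptUrl) {
                clean.add(attribute);
            }
        }
        tag.setAttributesEx(clean);
    }
}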
use of org.htmlparser.Node in project omegat by omegat-org.
the class FilterVisitor method endup.
/**
* Ends the segment collection, sends the translatable text out to the
* OmegaT core, and writes the surrounding extra tags to the writer.
*/
protected void endup() {
// detecting the first starting tag in 'befors'
// that has its ending in the paragraph
// all before this "first good" are simply written out
List<Node> all = new ArrayList<Node>();
all.addAll(befors);
all.addAll(translatable);
int firstgoodlimit = befors.size();
int firstgood = 0;
while (firstgood < firstgoodlimit) {
Node goodNode = all.get(firstgood);
if (!(goodNode instanceof Tag)) {
firstgood++;
continue;
}
Tag good = (Tag) goodNode;
// scan forward for the end tag that matches this start tag
int recursion = 1;
boolean found = false;
for (int i = firstgood + 1; i < all.size(); i++) {
Node candNode = all.get(i);
if (candNode instanceof Tag) {
Tag cand = (Tag) candNode;
if (cand.getTagName().equals(good.getTagName())) {
if (!cand.isEndTag()) {
recursion++;
} else {
recursion--;
if (recursion == 0) {
if (i >= firstgoodlimit) {
found = true;
}
// we've found an ending tag for this "good one"
break;
}
}
}
}
}
// this is a "good one"
if (found) {
break;
}
firstgood++;
}
// detecting the last ending tag in 'afters'
// that has its starting in the paragraph
// all after this "last good" is simply written out
int lastgoodlimit = all.size() - 1;
all.addAll(afters);
int lastgood = all.size() - 1;
while (lastgood > lastgoodlimit) {
Node goodNode = all.get(lastgood);
if (!(goodNode instanceof Tag)) {
lastgood--;
continue;
}
Tag good = (Tag) goodNode;
// scan backward for the start tag that matches this end tag
int recursion = 1;
boolean found = false;
for (int i = lastgood - 1; i >= firstgoodlimit; i--) {
Node candNode = all.get(i);
if (candNode instanceof Tag) {
Tag cand = (Tag) candNode;
if (cand.getTagName().equals(good.getTagName())) {
if (cand.isEndTag()) {
recursion++;
} else {
recursion--;
if (recursion == 0) {
if (i <= lastgoodlimit) {
found = true;
}
// "good one"
break;
}
}
}
}
}
// this is a "good one"
if (found) {
break;
}
lastgood--;
}
boolean changed = true;
while (changed) {
changed = false;
boolean removeTags = Core.getFilterMaster().getConfig().isRemoveTags();
if (!removeTags) {
for (int i = 0; i < firstgood; i++) {
Node node = all.get(i);
if (node instanceof Tag) {
firstgood = i;
changed = true;
break;
}
}
for (int i = all.size() - 1; i > lastgood; i--) {
Node node = all.get(i);
if (node instanceof Tag) {
lastgood = i;
changed = true;
break;
}
}
}
boolean removeSpacesAround = Core.getFilterMaster().getConfig().isRemoveSpacesNonseg();
if (!removeSpacesAround) {
for (int i = 0; i < firstgood; i++) {
Node node = all.get(i);
if (node instanceof TextNode) {
firstgood = i;
changed = true;
break;
}
}
for (int i = all.size() - 1; i > lastgood; i--) {
Node node = all.get(i);
if (node instanceof TextNode) {
lastgood = i;
changed = true;
break;
}
}
}
}
// writing out all tags before the "first good" one
for (int i = 0; i < firstgood; i++) {
Node node = all.get(i);
if (node instanceof Tag) {
writeout("<" + node.getText() + ">");
} else {
writeout(compressWhitespace(node.getText()));
}
}
// appending all nodes from the "first good" to the "last good" one
// to the paragraph text
StringBuilder paragraph = new StringBuilder();
for (int i = firstgood; i <= lastgood; i++) {
Node node = all.get(i);
if (node instanceof Tag) {
shortcut((Tag) node, paragraph);
} else {
// node instanceof Text
paragraph.append(HTMLUtils.entitiesToChars(node.toHtml()));
}
}
String uncompressed = paragraph.toString();
String compressed = uncompressed;
String spacePrefix = "";
String spacePostfix = "";
int size = uncompressed.length();
// (This changes the layout, therefore it is an option)
if (!preformatting) {
for (int cp, i = 0; i < size; i += Character.charCount(cp)) {
cp = uncompressed.codePointAt(i);
if (!Character.isWhitespace(cp)) {
spacePrefix = i == 0 ? "" : uncompressed.substring(0, options.getCompressWhitespace() ? Math.min(i, uncompressed.offsetByCodePoints(i, 1)) : i);
break;
}
}
for (int cp, i = size; i > 0; i -= Character.charCount(cp)) {
cp = uncompressed.codePointBefore(i);
if (!Character.isWhitespace(cp)) {
spacePostfix = i == size ? "" : uncompressed.substring(i, options.getCompressWhitespace() ? Math.min(uncompressed.offsetByCodePoints(i, 1), size) : size);
break;
}
}
if (Core.getFilterMaster().getConfig().isRemoveSpacesNonseg()) {
compressed = StringUtil.compressSpaces(uncompressed);
} else {
compressed = uncompressed;
}
}
// getting the translation
String translation = filter.privateProcessEntry(compressed, null);
// writing out uncompressed
if (compressed.equals(translation) && !options.getCompressWhitespace()) {
translation = uncompressed;
}
// converting &, < and > into &amp;, &lt; and &gt; respectively
// note that this doesn't change < and > of tag shortcuts
translation = HTMLUtils.charsToEntities(translation, filter.getTargetEncoding(), sShortcuts);
// expands tag shortcuts into full-blown tags
translation = unshorcutize(translation);
// writing out the paragraph into target file
writeout(spacePrefix);
writeout(translation);
writeout(spacePostfix);
// writing out all tags after the "last good" one
for (int i = lastgood + 1; i < all.size(); i++) {
Node node = all.get(i);
if (node instanceof Tag) {
writeout("<" + node.getText() + ">");
} else {
writeout(compressWhitespace(node.getText()));
}
}
cleanup();
}
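The core of the "first good"/"last good" detection above is a nesting counter that scans for the tag which balances a given start (or end) tag. Below is a standalone sketch of the forward scan only, assuming a plain List<Node> like the all list in endup(); it is an illustration of the technique, not OmegaT code.

import java.util.List;

import org.htmlparser.Node;
import org.htmlparser.Tag;

public class TagMatchSketch {

    /**
     * Returns the index of the end tag that balances the start tag at position
     * start, or -1 if it is never closed inside nodes.
     */
    public static int findMatchingEnd(List<Node> nodes, int start) {
        Tag open = (Tag) nodes.get(start);
        int recursion = 1;
        for (int i = start + 1; i < nodes.size(); i++) {
            Node node = nodes.get(i);
            if (!(node instanceof Tag)) {
                continue;
            }
            Tag tag = (Tag) node;
            if (!tag.getTagName().equals(open.getTagName())) {
                continue;
            }
            if (!tag.isEndTag()) {
                recursion++; // another nested tag of the same name was opened
            } else if (--recursion == 0) {
                return i; // this end tag balances the opening one
            }
        }
        return -1;
    }
}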
use of org.htmlparser.Node in project laogewen by wen4034.
the class HtmlParserTool method extracLinks.
public static Set<String> extracLinks(String url, LinkFilter filter, String... validate) {
Set<String> links = Sets.newHashSet();
try {
URL realurl = new URL(url);
URLConnection connection = realurl.openConnection();
connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
connection.setReadTimeout(100000);
connection.setConnectTimeout(100000);
Parser parser = new Parser(connection);
parser.setEncoding("UTF-8");
// Filter for <frame> tags, used to extract the src attribute of frame tags
NodeFilter frameFilter = new NodeFilter() {
@Override
public boolean accept(Node node) {
return node.getText().startsWith("frame src=");
}
};
OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
NodeList list = parser.extractAllNodesThatMatch(linkFilter);
for (int i = 0; i < list.size(); i++) {
Node tag = list.elementAt(i);
if (tag instanceof LinkTag) {
LinkTag linkTag = (LinkTag) tag;
String linkurl = linkTag.getLink();
if (filter.accept(linkurl, validate)) {
links.add(linkurl);
}
} else {
String fram = tag.getText();
int start = fram.indexOf("src=");
fram = fram.substring(start);
int end = fram.indexOf(" ");
if (end == -1) {
end = fram.indexOf(">");
}
String frameUrl = fram.substring(5, end - 1);
if (filter.accept(frameUrl, validate)) {
links.add(frameUrl);
}
}
}
} catch (Exception e) {
System.out.println(url + " link failed");
e.printStackTrace();
}
return links;
}
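LinkFilter and the validate varargs are specific to this crawler project, so here is a reduced, hedged sketch of the same extraction pattern that simply collects every <a href> target from a page; it fetches the URL over the network via the Parser(String) constructor instead of configuring a URLConnection by hand.

import java.util.HashSet;
import java.util.Set;

import org.htmlparser.Node;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class LinkExtractorSketch {

    // Collects every <a href> target found on the given page.
    public static Set<String> extractLinks(String url) {
        Set<String> links = new HashSet<String>();
        try {
            Parser parser = new Parser(url);
            parser.setEncoding("UTF-8");
            NodeList list = parser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
            for (int i = 0; i < list.size(); i++) {
                Node node = list.elementAt(i);
                links.add(((LinkTag) node).getLink());
            }
        } catch (ParserException e) {
            e.printStackTrace();
        }
        return links;
    }
}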