Use of org.htmlparser.util.NodeList in project liferay-docs by liferay.
From the class CheckLinks, method isLdnUrlValid.
/**
 * Returns <code>true</code> if the LDN URL is valid. This method is used to
 * check legacy URLs hosted on LDN.
 *
 * <p>
 * The resource at the URL is parsed as HTML and every link it contains is
 * inspected. The URL is reported through <code>logInvalidUrl</code> (once) and
 * <code>false</code> is returned when the resource cannot be parsed or when
 * any of its links contains the search-redirect marker
 * <code>2Fsearch%2Fsearch&amp;_3_redirect=</code>. A resource that yields no
 * links at all is also treated as invalid, but nothing is logged for it.
 * </p>
 *
 * @param url the URL to check
 * @param article the article file, passed on to the invalid-URL log
 * @param lineNumber the line number
 * @return <code>true</code> if the LDN URL is valid; <code>false</code>
 * otherwise
 * @throws IOException if an IO exception occurred
 */
private static boolean isLdnUrlValid(String url, File article, int lineNumber) throws IOException {
    NodeList linkNodes;
    try {
        Parser htmlParser = new Parser(url);
        linkNodes = htmlParser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
    } catch (ParserException e) {
        // The page could not be read or parsed: report it and stop here.
        logInvalidUrl(article, lineNumber, ldnArticle, false);
        return false;
    }

    boolean foundLink = false;
    for (int i = 0; i < linkNodes.size(); i++) {
        LinkTag link = (LinkTag) linkNodes.elementAt(i);
        // NOTE(review): the marker presumably means LDN redirected the request
        // to its search page instead of serving the article - confirm.
        if (link.getLink().contains("2Fsearch%2Fsearch&_3_redirect=")) {
            // Log once and report the URL as invalid, so the return value
            // agrees with what was logged.
            logInvalidUrl(article, lineNumber, ldnArticle, false);
            return false;
        }
        foundLink = true;
    }

    // A page without any link never proved itself valid.
    return foundLink;
}
Use of org.htmlparser.util.NodeList in project laogewen by wen4034.
From the class HtmlParserTool, method extracLinks.
/**
 * Collects the link targets found in the page at the given URL.
 *
 * <p>
 * The page is requested with a browser-like <code>User-Agent</code> header and
 * 100 second connect/read timeouts, then parsed as UTF-8. Both link tags and
 * <code>&lt;frame src=...&gt;</code> tags are considered; a target is kept
 * only when <code>filter.accept(target, validate)</code> returns
 * <code>true</code>.
 * </p>
 *
 * <p>
 * Any exception raised while connecting or parsing is printed to standard
 * output/error and the links gathered so far are returned.
 * </p>
 *
 * @param url the address of the page to scan
 * @param filter decides which extracted targets are kept
 * @param validate extra arguments handed unchanged to the filter
 * @return the accepted link targets; empty if none were found or the page
 * could not be processed
 */
public static Set<String> extracLinks(String url, LinkFilter filter, String... validate) {
    Set<String> links = Sets.newHashSet();
    try {
        URL realurl = new URL(url);
        URLConnection connection = realurl.openConnection();
        connection.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        connection.setReadTimeout(100000);
        connection.setConnectTimeout(100000);
        Parser parser = new Parser(connection);
        parser.setEncoding("UTF-8");
        // Filter matching <frame> tags, used to pick up the frame's src attribute.
        NodeFilter frameFilter = new NodeFilter() {
            @Override
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                String linkurl = ((LinkTag) tag).getLink();
                if (filter.accept(linkurl, validate)) {
                    links.add(linkurl);
                }
            } else {
                String frameUrl = frameSource(tag.getText());
                if (frameUrl != null && filter.accept(frameUrl, validate)) {
                    links.add(frameUrl);
                }
            }
        }
    } catch (Exception e) {
        System.out.println(url + "链接失败");
        e.printStackTrace();
    }
    return links;
}

/**
 * Extracts the value of the <code>src</code> attribute from the text of a
 * frame tag, e.g. <code>frame src="a.html" name="x"</code> yields
 * <code>a.html</code>.
 *
 * <p>
 * A quoted value runs up to the matching closing quote (or to the end of the
 * text if the quote is never closed). An unquoted value runs up to the next
 * space or <code>&gt;</code>, or to the end of the text when neither is
 * present, so a <code>src</code> that is the last attribute of the tag no
 * longer causes an index error.
 * </p>
 *
 * @param tagText the tag text to search
 * @return the attribute value, or <code>null</code> if the text has no
 * <code>src=</code>
 */
private static String frameSource(String tagText) {
    int start = tagText.indexOf("src=");
    if (start < 0) {
        return null;
    }
    String value = tagText.substring(start + "src=".length());
    if (value.isEmpty()) {
        return value;
    }
    char first = value.charAt(0);
    if (first == '"' || first == '\'') {
        int close = value.indexOf(first, 1);
        return close < 0 ? value.substring(1) : value.substring(1, close);
    }
    int end = value.indexOf(' ');
    if (end < 0) {
        end = value.indexOf('>');
    }
    return end < 0 ? value : value.substring(0, end);
}
Aggregations