use of com.jsoup.nodes.Document in project User-Behavior-in-Facebook by abozanona.
the class ListLinks method main.
/**
 * Fetches the page at the URL given as the single command-line argument and prints
 * three listings: elements carrying a {@code src} attribute, {@code link[href]} elements,
 * and {@code a[href]} elements.
 *
 * @param args exactly one element: the URL to fetch
 * @throws IOException propagated from fetching the page
 */
public static void main(String[] args) throws IOException {
    Validate.isTrue(args.length == 1, "usage: supply url to fetch");
    final String target = args[0];
    print("Fetching %s...", target);

    final Document page = Jsoup.connect(target).get();
    final Elements anchors = page.select("a[href]");
    final Elements sources = page.select("[src]");
    final Elements linkTags = page.select("link[href]");

    // Media: anything with a src attribute; images additionally report size and alt text.
    print("\nMedia: (%d)", sources.size());
    for (Element item : sources) {
        if (!item.tagName().equals("img")) {
            print(" * %s: <%s>", item.tagName(), item.attr("abs:src"));
            continue;
        }
        print(" * %s: <%s> %sx%s (%s)", item.tagName(), item.attr("abs:src"), item.attr("width"), item.attr("height"), trim(item.attr("alt"), 20));
    }

    // Imports: <link href=...> elements together with their rel attribute.
    print("\nImports: (%d)", linkTags.size());
    for (Element imported : linkTags) {
        print(" * %s <%s> (%s)", imported.tagName(), imported.attr("abs:href"), imported.attr("rel"));
    }

    // Links: anchors with their absolute target and trimmed link text.
    print("\nLinks: (%d)", anchors.size());
    for (Element anchor : anchors) {
        print(" * a: <%s> (%s)", anchor.attr("abs:href"), trim(anchor.text(), 35));
    }
}
use of com.jsoup.nodes.Document in project User-Behavior-in-Facebook by abozanona.
the class DataUtil method parseByteData.
// Reads bytes first into a buffer, then decodes with the appropriate charset. Done this way to support
// switching the charset midstream when a meta http-equiv tag defines the charset.
//
// byteData    - raw document bytes; decoded once, and rewound and decoded again if a different charset is detected
// charsetName - charset to decode with, or null to detect it from the document's <meta> tags
// baseUri     - base URI handed to Jsoup.parse for the resulting document
// returns the parsed Document
//
// A charset declared in a <meta> tag that is illegal or unsupported on this JVM is ignored and the
// default-charset parse is kept, instead of letting Charset.forName throw on untrusted page content.
static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri) {
    String docData;
    Document doc = null;
    if (charsetName == null) {
        // determine from meta. safe parse as the default charset
        // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
        Charset fallback = Charset.forName(defaultCharset);
        docData = fallback.decode(byteData).toString();
        doc = Jsoup.parse(docData, baseUri);
        Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
        if (meta != null) {
            // if not found (or not usable), will keep the default charset as best attempt
            String foundCharset = meta.hasAttr("http-equiv") ? getCharsetFromContentType(meta.attr("content")) : meta.attr("charset");
            Charset detected = null;
            if (foundCharset != null && foundCharset.length() != 0) {
                try {
                    detected = Charset.forName(foundCharset);
                } catch (IllegalArgumentException ignored) {
                    // IllegalCharsetNameException / UnsupportedCharsetException: the page declared a
                    // charset this JVM cannot use, so fall through and keep the default-charset parse.
                }
            }
            // Compare resolved charsets rather than raw names so that case differences and aliases
            // of the default charset do not trigger a pointless second decode and parse.
            if (detected != null && !detected.equals(fallback)) {
                // need to re-decode
                charsetName = foundCharset;
                byteData.rewind();
                docData = detected.decode(byteData).toString();
                doc = null;
            }
        }
    } else {
        // specified by content type header (or by user on file load)
        Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        docData = Charset.forName(charsetName).decode(byteData).toString();
    }
    if (doc == null) {
        // either the charset was supplied by the caller or a different one was detected: parse the
        // freshly decoded text and record the charset on the document's output settings
        doc = Jsoup.parse(docData, baseUri);
        doc.outputSettings().charset(charsetName);
    }
    return doc;
}
use of com.jsoup.nodes.Document in project User-Behavior-in-Facebook by abozanona.
the class Jsoup method clean.
/**
 * Produces safe HTML from untrusted input: the input is parsed as a body fragment and then
 * filtered through a white-list of permitted tags and attributes.
 *
 * @param bodyHtml untrusted input HTML
 * @param baseUri URL used to resolve relative URLs
 * @param whitelist white-list of permitted HTML elements
 * @return the HTML of the cleaned document's body
 *
 * @see Cleaner#clean(Document)
 */
public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) {
    // Parse before constructing the cleaner, then filter and serialise the body in one chain.
    final Document untrusted = parseBodyFragment(bodyHtml, baseUri);
    return new Cleaner(whitelist).clean(untrusted).body().html();
}
use of com.jsoup.nodes.Document in project User-Behavior-in-Facebook by abozanona.
the class MyService method getFilters.
/**
 * Builds the activity-options URL for {@code MainActivity.c_user} and hands it to a
 * {@code GetHtml} instance. When {@code getHtmlListener} receives the HTML, every
 * {@code li a} match except the first contributes the part of its {@code href} that
 * follows the last {@code '='}; the collected values are passed to {@code fn.Callback}.
 *
 * NOTE(review): presumably GetHtml fetches the URL and invokes getHtmlListener with the
 * response body - confirm against the GetHtml class.
 *
 * @param fn callback that receives the collected filter values
 */
private void getFilters(final CallbackResponce fn) {
    final String optionsUrl = "https://mbasic.facebook.com/allactivity/options?id=" + MainActivity.c_user;
    new GetHtml(optionsUrl) {
        @Override
        public void getHtmlListener(String html) {
            Elements anchors = Jsoup.parse(html).select("li a");
            ArrayList<String> filterValues = new ArrayList<>();
            // Index 0 is skipped on purpose, exactly as the original loop did.
            int index = 1;
            while (index < anchors.size()) {
                String href = anchors.get(index).attr("href");
                int valueStart = href.lastIndexOf("=") + 1;
                filterValues.add(href.substring(valueStart));
                index++;
            }
            fn.Callback(filterValues);
        }
    };
}
use of com.jsoup.nodes.Document in project User-Behavior-in-Facebook by abozanona.
the class Jsoup method isValid.
/**
 * Checks whether the input HTML contains only tags and attributes permitted by the Whitelist.
 * Handy for form validation. The input should still be passed through the cleaner afterwards,
 * both to apply enforced attributes and to tidy the output.
 *
 * @param bodyHtml HTML to test
 * @param whitelist whitelist to test against
 * @return true if no tags or attributes were removed; false otherwise
 * @see #clean(String, com.jsoup.safety.Whitelist)
 */
public static boolean isValid(String bodyHtml, Whitelist whitelist) {
    // Parsed with an empty base URI, then checked by a cleaner built from the whitelist.
    final Document candidate = parseBodyFragment(bodyHtml, "");
    return new Cleaner(whitelist).isValid(candidate);
}
Aggregations