use of com.smartandroid.sa.tag.nodes.Document in project SmartAndroidSource by jaychou2012.
the class HtmlToPlainText method main.
/**
 * Command-line entry point: fetches the URL given as the single argument,
 * passes the parsed document to {@code HtmlToPlainText#getPlainText} and
 * prints the returned string to standard output.
 *
 * @param args
 *            must contain exactly one element, the URL to fetch
 * @throws IOException
 *             propagated from fetching the URL
 */
public static void main(String... args) throws IOException {
    Validate.isTrue(args.length == 1, "usage: supply url to fetch");

    // Fetch the URL named on the command line and parse it to an HTML DOM.
    final Document page = SmartTag.connect(args[0]).get();

    // Hand the DOM to a fresh formatter and print whatever text it returns.
    System.out.println(new HtmlToPlainText().getPlainText(page));
}
use of com.smartandroid.sa.tag.nodes.Document in project SmartAndroidSource by jaychou2012.
the class ListLinks method main.
/**
 * Command-line entry point: fetches the URL given as the single argument
 * and prints three listings built from selector queries on the parsed
 * document: elements with a {@code src} attribute, {@code link[href]}
 * elements, and {@code a[href]} elements.
 *
 * @param args
 *            must contain exactly one element, the URL to fetch
 * @throws IOException
 *             propagated from fetching the URL
 */
public static void main(String[] args) throws IOException {
    Validate.isTrue(args.length == 1, "usage: supply url to fetch");
    final String target = args[0];
    print("Fetching %s...", target);

    final Document page = SmartTag.connect(target).get();
    final Elements anchors = page.select("a[href]");
    final Elements sourced = page.select("[src]");
    final Elements linkTags = page.select("link[href]");

    // Section 1: everything carrying a src attribute.
    print("\nMedia: (%d)", sourced.size());
    for (Element item : sourced) {
        final String tag = item.tagName();
        final String absoluteSrc = item.attr("abs:src");
        if (!tag.equals("img")) {
            print(" * %s: <%s>", tag, absoluteSrc);
            continue;
        }
        // Images additionally report their dimensions and a shortened alt text.
        print(" * %s: <%s> %sx%s (%s)", tag, absoluteSrc, item.attr("width"), item.attr("height"), trim(item.attr("alt"), 20));
    }

    // Section 2: <link href=...> elements.
    print("\nImports: (%d)", linkTags.size());
    for (Element imported : linkTags) {
        print(" * %s <%s> (%s)", imported.tagName(), imported.attr("abs:href"), imported.attr("rel"));
    }

    // Section 3: <a href=...> elements with their shortened text.
    print("\nLinks: (%d)", anchors.size());
    for (Element anchor : anchors) {
        print(" * a: <%s> (%s)", anchor.attr("abs:href"), trim(anchor.text(), 35));
    }
}
use of com.smartandroid.sa.tag.nodes.Document in project SmartAndroidSource by jaychou2012.
the class Parser method parseInput.
/**
 * Parses the given HTML with this parser's tree builder.
 * <p>
 * The {@code errors} field is replaced on every call: with a tracking list
 * limited by {@code maxErrors} when {@code isTrackErrors()} is true, and
 * with a non-tracking list otherwise.
 *
 * @param html
 *            markup to parse
 * @param baseUri
 *            base URI handed through to the tree builder
 * @return the document produced by the tree builder
 */
public Document parseInput(String html, String baseUri) {
    if (isTrackErrors()) {
        errors = ParseErrorList.tracking(maxErrors);
    } else {
        errors = ParseErrorList.noTracking();
    }
    return treeBuilder.parse(html, baseUri, errors);
}
use of com.smartandroid.sa.tag.nodes.Document in project SmartAndroidSource by jaychou2012.
the class TreeBuilder method initialiseParse.
/**
 * Resets this builder's state for a new parse run: creates a new document,
 * character reader, tokeniser and element stack, and stores the supplied
 * error list and base URI.
 *
 * @param input
 *            text to parse; must not be null
 * @param baseUri
 *            base URI for the new document; must not be null
 * @param errors
 *            error list shared with the tokeniser
 */
protected void initialiseParse(String input, String baseUri, ParseErrorList errors) {
    // Reject null arguments before any state is touched.
    Validate.notNull(input, "String input must not be null");
    Validate.notNull(baseUri, "BaseURI must not be null");

    // Assignment order is kept as-is so a failing constructor leaves the
    // same fields untouched as before.
    this.doc = new Document(baseUri);
    this.reader = new CharacterReader(input);
    this.errors = errors;
    this.tokeniser = new Tokeniser(this.reader, errors);
    this.stack = new DescendableLinkedList<Element>();
    this.baseUri = baseUri;
}
use of com.smartandroid.sa.tag.nodes.Document in project SmartAndroidSource by jaychou2012.
the class Cleaner method isValid.
/**
 * Checks whether the input document is valid against the whitelist, i.e.
 * whether every tag and attribute in the input HTML is allowed by it.
 * <p/>
 * Useful as a validator for user input forms. An invalid document can
 * still be cleaned successfully with the {@link #clean(Document)} method.
 * Even when validating, cleaning the document is still recommended so that
 * enforced attributes are set correctly and the output is tidied.
 * <p/>
 * The check copies the safe nodes of the input's body into the body of a
 * throw-away shell document and reports whether anything was discarded.
 *
 * @param dirtyDocument
 *            document to test; must not be null
 * @return true if no tags or attributes need to be removed; false if they
 *         do
 */
public boolean isValid(Document dirtyDocument) {
    Validate.notNull(dirtyDocument);

    // Scratch target for the copy; only the discard count is of interest.
    final Document shell = Document.createShell(dirtyDocument.baseUri());
    return copySafeNodes(dirtyDocument.body(), shell.body()) == 0;
}
Aggregations