Search in sources :

Example 1 with Element

use of com.jsoup.nodes.Element in project User-Behavior-in-Facebook by abozanona.

the class Selector method select.

/**
 * Runs a CSS selector against several root elements and gathers every match.
 * <p>
 * Each root is queried through the single-root {@code select(String, Element)} overload.
 * The matches are accumulated in a {@link LinkedHashSet}, so an element reached from more
 * than one root appears only once and first-seen order is kept.
 *
 * @param query CSS selector; checked with {@code Validate.notEmpty}
 * @param roots root elements to descend into; checked with {@code Validate.notNull}
 * @return the matching elements, or an empty {@code Elements} when nothing matches
 */
public static Elements select(String query, Iterable<Element> roots) {
    Validate.notEmpty(query);
    Validate.notNull(roots);

    final LinkedHashSet<Element> matched = new LinkedHashSet<Element>();
    for (Element root : roots) {
        // Add hits one at a time; the set silently drops elements already collected.
        for (Element hit : select(query, root)) {
            matched.add(hit);
        }
    }
    return new Elements(matched);
}
Also used : LinkedHashSet(java.util.LinkedHashSet) Element(com.jsoup.nodes.Element)

Example 2 with Element

use of com.jsoup.nodes.Element in project User-Behavior-in-Facebook by abozanona.

the class ListLinks method main.

/**
 * Fetches the page at the URL given as the sole command-line argument and prints three
 * listings: elements carrying a {@code src} attribute, {@code link[href]} imports, and
 * {@code a[href]} anchors.
 *
 * @param args must contain exactly one entry, the URL to fetch
 * @throws IOException declared because the fetch via {@code Jsoup.connect(url).get()} may fail
 */
public static void main(String[] args) throws IOException {
    Validate.isTrue(args.length == 1, "usage: supply url to fetch");
    final String url = args[0];
    print("Fetching %s...", url);

    final Document page = Jsoup.connect(url).get();
    final Elements anchors = page.select("a[href]");
    final Elements sourced = page.select("[src]");
    final Elements linked = page.select("link[href]");

    print("\nMedia: (%d)", sourced.size());
    for (Element item : sourced) {
        final boolean isImage = item.tagName().equals("img");
        if (!isImage) {
            print(" * %s: <%s>", item.tagName(), item.attr("abs:src"));
        } else {
            // Images additionally report their dimensions and a shortened alt text.
            print(" * %s: <%s> %sx%s (%s)", item.tagName(), item.attr("abs:src"), item.attr("width"), item.attr("height"), trim(item.attr("alt"), 20));
        }
    }

    print("\nImports: (%d)", linked.size());
    for (Element imported : linked) {
        print(" * %s <%s> (%s)", imported.tagName(), imported.attr("abs:href"), imported.attr("rel"));
    }

    print("\nLinks: (%d)", anchors.size());
    for (Element anchor : anchors) {
        print(" * a: <%s>  (%s)", anchor.attr("abs:href"), trim(anchor.text(), 35));
    }
}
Also used : Element(com.jsoup.nodes.Element) Document(com.jsoup.nodes.Document) Elements(com.jsoup.select.Elements)

Example 3 with Element

use of com.jsoup.nodes.Element in project User-Behavior-in-Facebook by abozanona.

the class DataUtil method parseByteData.

/**
 * Decodes a buffer of bytes into a {@code Document}.
 * <p>
 * The bytes are read into a buffer first and then decoded with the appropriate charset. It is
 * done this way to support switching the charset midstream when a meta http-equiv tag (or an
 * HTML5 {@code <meta charset>}) defines the charset.
 * <p>
 * When {@code charsetName} is {@code null} the data is first decoded with {@code defaultCharset}
 * (UTF-8 according to the existing comments) and parsed. If the parsed document declares a
 * different charset that this JVM can resolve, the buffer is rewound, decoded again with that
 * charset and re-parsed. A declared charset whose name is malformed or unsupported is ignored
 * and the default-charset parse is returned instead of failing.
 *
 * @param byteData    raw document bytes; rewound if a second decode is needed
 * @param charsetName charset to decode with, or {@code null} to detect it from the HTML
 * @param baseUri     base URI handed to {@code Jsoup.parse}
 * @return the parsed document
 */
static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri) {
    String docData;
    Document doc = null;
    if (charsetName == null) {
        // determine from meta. safe parse as UTF-8
        // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta charset="gb2312">
        docData = Charset.forName(defaultCharset).decode(byteData).toString();
        doc = Jsoup.parse(docData, baseUri);
        Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
        if (meta != null) {
            // if not found, will keep utf-8 as best attempt
            String foundCharset = meta.hasAttr("http-equiv") ? getCharsetFromContentType(meta.attr("content")) : meta.attr("charset");
            if (foundCharset != null && foundCharset.length() != 0 && !foundCharset.equals(defaultCharset)) {
                // The name comes from the document itself and may be garbage. Resolve it on its
                // own so that only the lookup failure is tolerated, nothing else.
                Charset declared = null;
                try {
                    declared = Charset.forName(foundCharset);
                } catch (IllegalArgumentException ignored) {
                    // Covers both IllegalCharsetNameException and UnsupportedCharsetException:
                    // fall through and keep the document parsed with the default charset.
                }
                if (declared != null) {
                    // need to re-decode
                    charsetName = foundCharset;
                    byteData.rewind();
                    docData = declared.decode(byteData).toString();
                    doc = null;
                }
            }
        }
    } else {
        // specified by content type header (or by user on file load)
        Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        docData = Charset.forName(charsetName).decode(byteData).toString();
    }
    if (doc == null) {
        // Reached when the charset was given explicitly or a usable one was found in the meta tag.
        doc = Jsoup.parse(docData, baseUri);
        doc.outputSettings().charset(charsetName);
    }
    return doc;
}
Also used : Element(com.jsoup.nodes.Element) Document(com.jsoup.nodes.Document)

Example 4 with Element

use of com.jsoup.nodes.Element in project User-Behavior-in-Facebook by abozanona.

the class Parser method parseBodyFragment.

/**
 * Builds a shell {@code Document} for {@code baseUri} and appends the nodes parsed from an HTML
 * fragment to its {@code body}.
 *
 * @param bodyHtml fragment of HTML
 * @param baseUri base URI of document (i.e. original fetch location), for resolving relative URLs.
 *
 * @return Document, with empty head, and HTML parsed into body
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    final Document shell = Document.createShell(baseUri);
    final Element bodyElement = shell.body();

    // Copy the parsed nodes into an array before moving them: the source list gets modified
    // when its nodes are re-parented, so it must not be iterated directly.
    final Node[] snapshot = parseFragment(bodyHtml, bodyElement, baseUri).toArray(new Node[0]);
    for (int i = 0; i < snapshot.length; i++) {
        bodyElement.appendChild(snapshot[i]);
    }
    return shell;
}
Also used : Element(com.jsoup.nodes.Element) Node(com.jsoup.nodes.Node) Document(com.jsoup.nodes.Document)

Aggregations

Element (com.jsoup.nodes.Element)4 Document (com.jsoup.nodes.Document)3 Node (com.jsoup.nodes.Node)1 Elements (com.jsoup.select.Elements)1 LinkedHashSet (java.util.LinkedHashSet)1