Search in sources :

Example 1 with Document

use of com.smartandroid.sa.tag.nodes.Document in project SmartAndroidSource by jaychou2012.

the class DataUtil method parseByteData.

/**
 * Decodes a byte buffer into a string and parses it into a Document.
 * <p>
 * The bytes are read into a buffer first and decoded afterwards, so that the charset can be
 * switched midstream when a meta http-equiv (or HTML5 meta charset) tag defines the charset.
 * <ul>
 * <li>If {@code charsetName} is {@code null}, the data is decoded with {@code defaultCharset}
 * (presumably UTF-8 - confirm against the field declaration), parsed, and the first
 * {@code <meta http-equiv=content-type>} / {@code <meta charset>} element is inspected. If it
 * names a usable charset different from the default, the bytes are re-decoded with it.</li>
 * <li>Otherwise the supplied charset is checked with {@code Validate.notEmpty} and used as is.</li>
 * <li>If the decoded text starts with U+FEFF (a byte order mark), the bytes are re-decoded with
 * {@code defaultCharset}, the mark is dropped, and that charset takes precedence.</li>
 * </ul>
 * A charset name found in the document is trimmed, stripped of quote characters and validated
 * with {@link Charset#isSupported(String)} before use. A blank, malformed or unsupported name is
 * ignored and the default-charset parse is kept, rather than letting
 * {@link Charset#forName(String)} throw on untrusted markup.
 * <p>
 * todo - this is getting gnarly. needs a rewrite.
 *
 * @param byteData    raw document bytes; rewound and re-read when a re-decode is needed
 * @param charsetName charset to decode with, or {@code null} to detect it from the HTML
 * @param baseUri     base URI handed to the parser
 * @param parser      parser used to build the document
 * @return the parsed document; its output charset is set explicitly only when the document was
 *         (re-)parsed after the initial detection pass
 */
static Document parseByteData(ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
    String docData;
    Document doc = null;
    if (charsetName == null) {
        // determine from meta. safe parse with the default charset, then look for
        // <meta http-equiv="Content-Type" content="text/html;charset=gb2312">
        // or HTML5 <meta charset="gb2312">
        docData = Charset.forName(defaultCharset).decode(byteData).toString();
        doc = parser.parseInput(docData, baseUri);
        Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
        if (meta != null) {
            // if nothing usable is found, keep the default-charset parse as best attempt
            String foundCharset = null;
            if (meta.hasAttr("http-equiv")) {
                foundCharset = getCharsetFromContentType(meta.attr("content"));
            }
            if (foundCharset == null && meta.hasAttr("charset")) {
                foundCharset = meta.attr("charset");
            }
            if (foundCharset != null) {
                // normalize BEFORE the emptiness/support checks, so values such as " " or "''"
                // cannot reach Charset.forName as an empty string
                foundCharset = foundCharset.trim().replaceAll("[\"']", "");
                boolean usable;
                try {
                    usable = foundCharset.length() != 0 && Charset.isSupported(foundCharset);
                } catch (IllegalCharsetNameException ignored) {
                    // syntactically illegal name in the document: treat as "no charset declared"
                    usable = false;
                }
                if (usable && !foundCharset.equals(defaultCharset)) {
                    // need to re-decode
                    charsetName = foundCharset;
                    byteData.rewind();
                    docData = Charset.forName(foundCharset).decode(byteData).toString();
                    doc = null;
                }
            }
        }
    } else {
        // specified by content type header (or by user on file load)
        Validate.notEmpty(charsetName, "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
        docData = Charset.forName(charsetName).decode(byteData).toString();
    }
    // BOM indicator (U+FEFF == 65279). takes precedence over everything else.
    // re-decodes in case the above decoded incorrectly
    if (docData.length() > 0 && docData.charAt(0) == 65279) {
        byteData.rewind();
        docData = Charset.forName(defaultCharset).decode(byteData).toString();
        docData = docData.substring(1);
        charsetName = defaultCharset;
        doc = null;
    }
    if (doc == null) {
        doc = parser.parseInput(docData, baseUri);
        doc.outputSettings().charset(charsetName);
    }
    return doc;
}
Also used : IllegalCharsetNameException(java.nio.charset.IllegalCharsetNameException) Element(com.smartandroid.sa.tag.nodes.Element) Document(com.smartandroid.sa.tag.nodes.Document)

Example 2 with Document

use of com.smartandroid.sa.tag.nodes.Document in project SmartAndroidSource by jaychou2012.

the class SmartTag method clean.

/**
 * Produces safe HTML from untrusted input HTML: the input is parsed as a body fragment and
 * filtered through a white-list of permitted tags and attributes.
 *
 * @param bodyHtml  input untrusted HTML (body fragment)
 * @param baseUri   URL to resolve relative URLs against
 * @param whitelist white-list of permitted HTML elements
 * @return safe HTML (body fragment)
 * @see Cleaner#clean(Document)
 */
public static String clean(String bodyHtml, String baseUri, Whitelist whitelist) {
    // Parse first, then build the cleaner, keeping the original evaluation order.
    final Document parsed = parseBodyFragment(bodyHtml, baseUri);
    final Document sanitized = new Cleaner(whitelist).clean(parsed);
    return sanitized.body().html();
}
Also used : Document(com.smartandroid.sa.tag.nodes.Document) Cleaner(com.smartandroid.sa.tag.safety.Cleaner)

Example 3 with Document

use of com.smartandroid.sa.tag.nodes.Document in project SmartAndroidSource by jaychou2012.

the class Parser method parseBodyFragment.

/**
 * Parses an HTML fragment and places the resulting nodes into the {@code body} of a new
 * shell Document.
 *
 * @param bodyHtml
 *            fragment of HTML
 * @param baseUri
 *            base URI of document (i.e. original fetch location), for
 *            resolving relative URLs.
 *
 * @return Document, with empty head, and HTML parsed into body
 */
public static Document parseBodyFragment(String bodyHtml, String baseUri) {
    final Document shell = Document.createShell(baseUri);
    final Element bodyEl = shell.body();
    final List<Node> parsed = parseFragment(bodyHtml, bodyEl, baseUri);
    // Iterate over an array snapshot rather than the list itself: presumably appendChild
    // re-parents each node and can modify the source list while we walk it - confirm
    // against Node/Element.
    final Node[] snapshot = parsed.toArray(new Node[parsed.size()]);
    for (int i = 0; i < snapshot.length; i++) {
        bodyEl.appendChild(snapshot[i]);
    }
    return shell;
}
Also used : Element(com.smartandroid.sa.tag.nodes.Element) Node(com.smartandroid.sa.tag.nodes.Node) Document(com.smartandroid.sa.tag.nodes.Document)

Example 4 with Document

use of com.smartandroid.sa.tag.nodes.Document in project SmartAndroidSource by jaychou2012.

the class SmartTag method clean.

/**
 * Produces safe HTML from untrusted input HTML: the input is parsed as a body fragment and
 * filtered through a white-list of permitted tags and attributes. The supplied output settings
 * are applied to the cleaned document before it is serialized.
 *
 * @param bodyHtml input untrusted HTML (body fragment)
 * @param baseUri URL to resolve relative URLs against
 * @param whitelist white-list of permitted HTML elements
 * @param outputSettings document output settings; use to control pretty-printing and entity
 *            escape modes
 * @return safe HTML (body fragment)
 * @see Cleaner#clean(Document)
 */
public static String clean(String bodyHtml, String baseUri, Whitelist whitelist, Document.OutputSettings outputSettings) {
    // Parse first, then build the cleaner, keeping the original evaluation order.
    final Document parsed = parseBodyFragment(bodyHtml, baseUri);
    final Document sanitized = new Cleaner(whitelist).clean(parsed);
    // Settings must be installed before html() renders the body.
    sanitized.outputSettings(outputSettings);
    return sanitized.body().html();
}
Also used : Document(com.smartandroid.sa.tag.nodes.Document) Cleaner(com.smartandroid.sa.tag.safety.Cleaner)

Example 5 with Document

use of com.smartandroid.sa.tag.nodes.Document in project SmartAndroidSource by jaychou2012.

the class SmartTag method isValid.

/**
 * Tests whether the input HTML has only tags and attributes allowed by the Whitelist. Useful
 * for form validation. The input HTML should still be run through the cleaner to set up
 * enforced attributes, and to tidy the output.
 * <p>
 * The fragment is parsed with an empty base URI.
 *
 * @param bodyHtml HTML to test
 * @param whitelist whitelist to test against
 * @return true if no tags or attributes were removed; false otherwise
 * @see #clean(String, String, Whitelist)
 */
public static boolean isValid(String bodyHtml, Whitelist whitelist) {
    final Document parsed = parseBodyFragment(bodyHtml, "");
    return new Cleaner(whitelist).isValid(parsed);
}
Also used : Document(com.smartandroid.sa.tag.nodes.Document) Cleaner(com.smartandroid.sa.tag.safety.Cleaner)

Aggregations

Document (com.smartandroid.sa.tag.nodes.Document)11 Element (com.smartandroid.sa.tag.nodes.Element)4 Cleaner (com.smartandroid.sa.tag.safety.Cleaner)3 Node (com.smartandroid.sa.tag.nodes.Node)1 Elements (com.smartandroid.sa.tag.select.Elements)1 IllegalCharsetNameException (java.nio.charset.IllegalCharsetNameException)1