Search in sources:

Example 1 with XMLParserConfiguration

use of org.apache.xerces.xni.parser.XMLParserConfiguration in project zm-mailbox by Zimbra.

the class HtmlDetag method detag.

/**
 * Runs {@code html} through an HTML parser whose filter chain consists of this
 * object followed by an {@code UnescapeWriter}, and returns what was written
 * to the writer's underlying buffer.
 *
 * <p>Parsing is best-effort: if the parser throws, the failure is logged at
 * warn level together with the exception, and whatever output was produced
 * before the failure is still returned.
 *
 * @param html the HTML markup to process
 * @return the contents of the output buffer; possibly partial if parsing failed
 */
public String detag(String html) {
    StringWriter out = new StringWriter();
    UnescapeWriter writer = new UnescapeWriter(out, "utf-8");
    // This object is the first filter in the chain; the writer is the last.
    XMLDocumentFilter[] filters = { this, writer };
    XMLParserConfiguration parser = new HTMLConfiguration();
    parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
    parser.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
    parser.setFeature("http://cyberneko.org/html/features/balance-tags", false);
    parser.setFeature("http://xml.org/sax/features/namespaces", false);
    XMLInputSource source = new XMLInputSource(null, null, null, new StringReader(html), null);
    try {
        parser.parse(source);
    } catch (Exception x) {
        // Pass the exception along so the log records why parsing failed,
        // not only that it failed.
        ZimbraLog.misc.warn("Can't detag HTML [" + html + "]", x);
    }
    // return whatever has been done
    return out.toString();
}
Also used : StringWriter(java.io.StringWriter) XMLInputSource(org.apache.xerces.xni.parser.XMLInputSource) HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) StringReader(java.io.StringReader) XMLDocumentFilter(org.apache.xerces.xni.parser.XMLDocumentFilter) XMLParserConfiguration(org.apache.xerces.xni.parser.XMLParserConfiguration) IOException(java.io.IOException) XNIException(org.apache.xerces.xni.XNIException)

Example 2 with XMLParserConfiguration

use of org.apache.xerces.xni.parser.XMLParserConfiguration in project zm-mailbox by Zimbra.

the class HtmlDefang method defang.

/**
 * Parses {@code source} with an HTML parser whose filter chain is, in order,
 * an {@code HtmlPurifier}, a {@code DefangFilter} and a {@code DefangWriter}
 * that writes to {@code out}.
 *
 * @param source HTML source
 * @param neuterImages <tt>true</tt> to remove images; handed to the {@code DefangFilter}
 * @param out destination handed to the {@code DefangWriter}
 * @throws IOException if the parser reports an I/O error
 */
protected void defang(XMLInputSource source, boolean neuterImages, Writer out) throws IOException {
    // Final stage of the chain. TODO: confirm that utf-8 is the right encoding here.
    DefangWriter sink = new DefangWriter(out, "utf-8");
    DefangFilter sanitizer = new DefangFilter(neuterImages);
    Purifier cleaner = new HtmlPurifier();

    // Configure the HTML parser; the filters run in the order listed.
    XMLParserConfiguration config = new HTMLConfiguration();
    config.setProperty(
            "http://cyberneko.org/html/properties/filters",
            new XMLDocumentFilter[] { cleaner, sanitizer, sink });
    config.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
    config.setFeature("http://cyberneko.org/html/features/balance-tags", false);
    config.setFeature("http://xml.org/sax/features/namespaces", false);

    // Drive the document through the chain.
    config.parse(source);
}
Also used : HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) XMLDocumentFilter(org.apache.xerces.xni.parser.XMLDocumentFilter) Purifier(org.cyberneko.html.filters.Purifier) XMLParserConfiguration(org.apache.xerces.xni.parser.XMLParserConfiguration)

Example 3 with XMLParserConfiguration

use of org.apache.xerces.xni.parser.XMLParserConfiguration in project gocd by gocd.

the class HtmlDomParserContext method initParser.

/**
 * Creates the {@code NokogiriDomParser} for this context on top of an
 * {@code HTMLConfiguration} and applies the HTML parsing properties and
 * features below.
 *
 * @param runtime the Ruby runtime; not used by this method
 */
@Override
protected void initParser(Ruby runtime) {
    XMLParserConfiguration config = new HTMLConfiguration();
    // Only the element-validity filter is installed. A RemoveNSAttrsFilter
    // used to be instantiated here as well, but it was never added to the
    // chain, so the unused instance is no longer created.
    XMLDocumentFilter elementValidityCheckFilter = new ElementValidityCheckFilter(errorHandler);
    XMLDocumentFilter[] filters = { elementValidityCheckFilter };
    config.setErrorHandler(this.errorHandler);
    parser = new NokogiriDomParser(config);
    // see http://nekohtml.sourceforge.net/settings.html for details
    setProperty("http://cyberneko.org/html/properties/default-encoding", java_encoding);
    setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
    setProperty("http://cyberneko.org/html/properties/filters", filters);
    setFeature("http://cyberneko.org/html/features/report-errors", true);
    setFeature("http://xml.org/sax/features/namespaces", false);
}
Also used : HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) XMLDocumentFilter(org.apache.xerces.xni.parser.XMLDocumentFilter) XMLParserConfiguration(org.apache.xerces.xni.parser.XMLParserConfiguration)

Example 4 with XMLParserConfiguration

use of org.apache.xerces.xni.parser.XMLParserConfiguration in project gwt-test-utils by gwt-test-utils.

the class Writer method main.

// 
// Data
// 
/**
 * Command-line entry point. Every argument that is not an option is used as
 * the system identifier of an input source, parsed as HTML, and written to
 * standard output through the filter chain selected by the options seen so far.
 *
 * <p>Recognized options (each affects the sources that follow it):
 * <ul>
 *   <li>{@code -ie name} &ndash; encoding set on the input source</li>
 *   <li>{@code -e name} / {@code -oe name} &ndash; encoding passed to the
 *       {@code Writer} filter (initially {@code Windows-1252})</li>
 *   <li>{@code -i} &ndash; put an {@code Identity} filter before the writer</li>
 *   <li>{@code -p} &ndash; put a {@code Purifier} filter before the writer
 *       (only when {@code -i} is not in effect)</li>
 *   <li>{@code -h} &ndash; print usage and exit</li>
 * </ul>
 *
 * <p>With no arguments, or when an option that needs a value is the last
 * argument, usage is printed and the process exits with status 1.
 *
 * @param argv command-line arguments
 * @throws Exception if configuring the parser or parsing a source fails
 */
public static void main(String[] argv) throws Exception {
    if (argv.length == 0) {
        printUsage();
        System.exit(1);
    }
    XMLParserConfiguration parser = new HTMLConfiguration();
    parser.setFeature(NOTIFY_CHAR_REFS, true);
    parser.setFeature(NOTIFY_HTML_BUILTIN_REFS, true);
    String iencoding = null;
    String oencoding = "Windows-1252";
    boolean identity = false;
    boolean purify = false;
    for (int i = 0; i < argv.length; i++) {
        String arg = argv[i];
        if (arg.equals("-ie")) {
            // The option needs a value; a trailing "-ie" used to fail with
            // ArrayIndexOutOfBoundsException.
            if (++i >= argv.length) {
                printUsage();
                System.exit(1);
            }
            iencoding = argv[i];
            continue;
        }
        if (arg.equals("-e") || arg.equals("-oe")) {
            // Same guard as for "-ie".
            if (++i >= argv.length) {
                printUsage();
                System.exit(1);
            }
            oencoding = argv[i];
            continue;
        }
        if (arg.equals("-i")) {
            identity = true;
            continue;
        }
        if (arg.equals("-p")) {
            purify = true;
            continue;
        }
        if (arg.equals("-h")) {
            printUsage();
            System.exit(1);
        }
        // Optional leading filter; identity takes precedence over purify.
        XMLDocumentFilter leading = null;
        if (identity) {
            leading = new Identity();
        } else if (purify) {
            leading = new Purifier();
        }
        // The writer is always the last filter in the chain.
        XMLDocumentFilter writer = new Writer(System.out, oencoding);
        XMLDocumentFilter[] filters = (leading == null)
                ? new XMLDocumentFilter[] { writer }
                : new XMLDocumentFilter[] { leading, writer };
        parser.setProperty(FILTERS, filters);
        XMLInputSource source = new XMLInputSource(null, arg, null);
        source.setEncoding(iencoding);
        parser.parse(source);
    }
}
Also used : XMLInputSource(org.apache.xerces.xni.parser.XMLInputSource) HTMLConfiguration(com.googlecode.html.HTMLConfiguration) XMLDocumentFilter(org.apache.xerces.xni.parser.XMLDocumentFilter) XMLParserConfiguration(org.apache.xerces.xni.parser.XMLParserConfiguration) PrintWriter(java.io.PrintWriter) OutputStreamWriter(java.io.OutputStreamWriter)

Example 5 with XMLParserConfiguration

use of org.apache.xerces.xni.parser.XMLParserConfiguration in project nokogiri by sparklemotion.

the class HtmlDomParserContext method initParser.

/**
 * Sets up {@code parser} as a {@code NokogiriDomParser} backed by an
 * {@code HTMLConfiguration}, then applies the HTML parsing properties and
 * features listed below.
 *
 * @param runtime the Ruby runtime; not used by this method
 */
@Override
protected void initParser(Ruby runtime) {
    XMLParserConfiguration htmlConfig = new HTMLConfiguration();

    // The chain holds only the element-validity check; a RemoveNSAttrsFilter
    // stage is currently disabled.
    XMLDocumentFilter validityFilter = new ElementValidityCheckFilter(errorHandler);
    XMLDocumentFilter[] filterChain = new XMLDocumentFilter[] { validityFilter };

    htmlConfig.setErrorHandler(errorHandler);
    parser = new NokogiriDomParser(htmlConfig);

    // Setting names are documented at http://nekohtml.sourceforge.net/settings.html
    setProperty("http://cyberneko.org/html/properties/default-encoding", java_encoding);
    setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
    setProperty("http://cyberneko.org/html/properties/filters", filterChain);

    setFeature("http://cyberneko.org/html/features/report-errors", true);
    setFeature("http://xml.org/sax/features/namespaces", false);
}
Also used : HTMLConfiguration(org.cyberneko.html.HTMLConfiguration) XMLDocumentFilter(org.apache.xerces.xni.parser.XMLDocumentFilter) XMLParserConfiguration(org.apache.xerces.xni.parser.XMLParserConfiguration)

Aggregations

XMLDocumentFilter (org.apache.xerces.xni.parser.XMLDocumentFilter)5 XMLParserConfiguration (org.apache.xerces.xni.parser.XMLParserConfiguration)5 HTMLConfiguration (org.cyberneko.html.HTMLConfiguration)4 XMLInputSource (org.apache.xerces.xni.parser.XMLInputSource)2 HTMLConfiguration (com.googlecode.html.HTMLConfiguration)1 IOException (java.io.IOException)1 OutputStreamWriter (java.io.OutputStreamWriter)1 PrintWriter (java.io.PrintWriter)1 StringReader (java.io.StringReader)1 StringWriter (java.io.StringWriter)1 XNIException (org.apache.xerces.xni.XNIException)1 Purifier (org.cyberneko.html.filters.Purifier)1