use of org.apache.xerces.xni.parser.XMLParserConfiguration in project zm-mailbox by Zimbra.
the class HtmlDetag method detag.
/**
 * Runs the given HTML through an {@code HTMLConfiguration} parser whose filter
 * chain is this object followed by an {@code UnescapeWriter}, and returns
 * whatever that writer emitted into an in-memory buffer.
 *
 * <p>Parsing is best-effort: if the parser throws, a warning (including the
 * exception) is logged and the text written so far is still returned.
 *
 * @param html the HTML text to parse
 * @return the buffered writer output; possibly partial if parsing failed
 */
public String detag(String html) {
    StringWriter out = new StringWriter();
    UnescapeWriter writer = new UnescapeWriter(out, "utf-8");
    XMLDocumentFilter[] filters = { this, writer };
    XMLParserConfiguration parser = new HTMLConfiguration();
    parser.setProperty("http://cyberneko.org/html/properties/filters", filters);
    parser.setProperty("http://cyberneko.org/html/properties/names/elems", "match");
    parser.setFeature("http://cyberneko.org/html/features/balance-tags", false);
    parser.setFeature("http://xml.org/sax/features/namespaces", false);
    XMLInputSource source = new XMLInputSource(null, null, null, new StringReader(html), null);
    try {
        parser.parse(source);
    } catch (Exception x) {
        // Still best-effort, but hand the exception to the logger so the
        // reason for the failure is not silently discarded.
        ZimbraLog.misc.warn("Can't detag HTML [" + html + "]", x);
    }
    // return whatever has been done
    return out.toString();
}
use of org.apache.xerces.xni.parser.XMLParserConfiguration in project zm-mailbox by Zimbra.
the class HtmlDefang method defang.
/**
 * Parses the given source with an {@code HTMLConfiguration} whose filter chain
 * is an {@code HtmlPurifier}, a {@code DefangFilter} and a {@code DefangWriter},
 * in that order; the writer emits to {@code out}.
 *
 * @param source HTML source
 * @param neuterImages flag handed to the {@code DefangFilter} constructor
 * @param out destination handed to the {@code DefangWriter}
 * @throws IOException declared by this method; presumably raised by the parser
 *         or the writer -- confirm against their contracts
 */
protected void defang(XMLInputSource source, boolean neuterImages, Writer out) throws IOException {
    final String filtersProperty = "http://cyberneko.org/html/properties/filters";
    final String elemNamesProperty = "http://cyberneko.org/html/properties/names/elems";
    final String balanceTagsFeature = "http://cyberneko.org/html/features/balance-tags";
    final String namespacesFeature = "http://xml.org/sax/features/namespaces";

    // Output stage of the chain.
    // TODO: utf-8 right?
    DefangWriter sink = new DefangWriter(out, "utf-8");
    DefangFilter defanger = new DefangFilter(neuterImages);
    Purifier cleaner = new HtmlPurifier();

    // Chain order: purifier first, then defang filter, writer last.
    XMLDocumentFilter[] chain = new XMLDocumentFilter[] { cleaner, defanger, sink };

    // Configure the HTML parser and run it over the source.
    XMLParserConfiguration htmlParser = new HTMLConfiguration();
    htmlParser.setProperty(filtersProperty, chain);
    htmlParser.setProperty(elemNamesProperty, "match");
    htmlParser.setFeature(balanceTagsFeature, false);
    htmlParser.setFeature(namespacesFeature, false);
    htmlParser.parse(source);
}
use of org.apache.xerces.xni.parser.XMLParserConfiguration in project gocd by gocd.
the class HtmlDomParserContext method initParser.
/**
 * Creates the {@code HTMLConfiguration}, wraps it in a {@code NokogiriDomParser}
 * assigned to {@code parser}, and applies the HTML parser settings: default
 * encoding, lower-cased element and attribute names, the filter chain, error
 * reporting on, and namespaces off.
 *
 * @param runtime the Ruby runtime; not used by this implementation
 */
@Override
protected void initParser(Ruby runtime) {
    XMLParserConfiguration config = new HTMLConfiguration();
    // Only the validity-check filter is installed; the previously constructed
    // but never used RemoveNSAttrsFilter instance has been dropped.
    XMLDocumentFilter elementValidityCheckFilter = new ElementValidityCheckFilter(errorHandler);
    XMLDocumentFilter[] filters = { elementValidityCheckFilter };
    config.setErrorHandler(this.errorHandler);
    parser = new NokogiriDomParser(config);
    // see http://nekohtml.sourceforge.net/settings.html for details
    setProperty("http://cyberneko.org/html/properties/default-encoding", java_encoding);
    setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
    setProperty("http://cyberneko.org/html/properties/filters", filters);
    setFeature("http://cyberneko.org/html/features/report-errors", true);
    setFeature("http://xml.org/sax/features/namespaces", false);
}
use of org.apache.xerces.xni.parser.XMLParserConfiguration in project gwt-test-utils by gwt-test-utils.
the class Writer method main.
//
// Data
//
/**
 * Command-line entry point. Each non-option argument is used as the system
 * identifier of a document, which is parsed with an {@code HTMLConfiguration}
 * and written to standard output through a filter chain.
 *
 * <p>Options take effect for the documents that follow them:
 * <ul>
 * <li>{@code -ie name} -- input encoding set on the input source</li>
 * <li>{@code -e name} / {@code -oe name} -- output encoding (initially
 *     {@code Windows-1252})</li>
 * <li>{@code -i} -- put an {@code Identity} filter in front of the writer</li>
 * <li>{@code -p} -- put a {@code Purifier} filter in front of the writer
 *     (only when {@code -i} has not been given)</li>
 * <li>{@code -h} -- print usage and exit</li>
 * </ul>
 * With no arguments, or when an option that needs a value is the last
 * argument, usage is printed and the process exits with status 1.
 *
 * @param argv options and document system identifiers
 * @throws Exception anything thrown while configuring or running the parser
 */
public static void main(String[] argv) throws Exception {
    if (argv.length == 0) {
        printUsage();
        System.exit(1);
    }
    XMLParserConfiguration parser = new HTMLConfiguration();
    parser.setFeature(NOTIFY_CHAR_REFS, true);
    parser.setFeature(NOTIFY_HTML_BUILTIN_REFS, true);
    String iencoding = null;
    String oencoding = "Windows-1252";
    boolean identity = false;
    boolean purify = false;
    for (int i = 0; i < argv.length; i++) {
        String arg = argv[i];
        if (arg.equals("-ie")) {
            // The option needs a value; report misuse instead of failing with
            // an ArrayIndexOutOfBoundsException.
            if (++i >= argv.length) {
                printUsage();
                System.exit(1);
            }
            iencoding = argv[i];
            continue;
        }
        if (arg.equals("-e") || arg.equals("-oe")) {
            if (++i >= argv.length) {
                printUsage();
                System.exit(1);
            }
            oencoding = argv[i];
            continue;
        }
        if (arg.equals("-i")) {
            identity = true;
            continue;
        }
        if (arg.equals("-p")) {
            purify = true;
            continue;
        }
        if (arg.equals("-h")) {
            printUsage();
            System.exit(1);
        }
        // Anything else is a document: build the chain from the options seen
        // so far, with the writer always last.
        java.util.List<XMLDocumentFilter> filterList = new java.util.ArrayList<XMLDocumentFilter>(2);
        if (identity) {
            filterList.add(new Identity());
        } else if (purify) {
            filterList.add(new Purifier());
        }
        filterList.add(new Writer(System.out, oencoding));
        XMLDocumentFilter[] filters = filterList.toArray(new XMLDocumentFilter[filterList.size()]);
        parser.setProperty(FILTERS, filters);
        XMLInputSource source = new XMLInputSource(null, arg, null);
        source.setEncoding(iencoding);
        parser.parse(source);
    }
}
use of org.apache.xerces.xni.parser.XMLParserConfiguration in project nokogiri by sparklemotion.
the class HtmlDomParserContext method initParser.
/**
 * Sets up the HTML parser for this context: a {@code NokogiriDomParser} built
 * on an {@code HTMLConfiguration} that reports to this context's error
 * handler, with a single {@code ElementValidityCheckFilter} in the filter chain.
 *
 * @param runtime the Ruby runtime; not referenced in this method
 */
@Override
protected void initParser(Ruby runtime) {
    XMLParserConfiguration nekoConfig = new HTMLConfiguration();
    XMLDocumentFilter[] filterChain =
        new XMLDocumentFilter[] { new ElementValidityCheckFilter(errorHandler) };

    nekoConfig.setErrorHandler(this.errorHandler);
    parser = new NokogiriDomParser(nekoConfig);

    // Settings are described at http://nekohtml.sourceforge.net/settings.html
    // Properties: default encoding, name casing, filter chain.
    setProperty("http://cyberneko.org/html/properties/default-encoding", java_encoding);
    setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
    setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
    setProperty("http://cyberneko.org/html/properties/filters", filterChain);

    // Features: report errors, no namespace processing.
    setFeature("http://cyberneko.org/html/features/report-errors", true);
    setFeature("http://xml.org/sax/features/namespaces", false);
}
Aggregations