use of org.htmlcleaner.HtmlCleaner in project Ebselen by Ardesco.
the class IDEToEbselen method convertToXML.
/**
 * Cleans the given HTML file with HtmlCleaner and writes it out as pretty-printed XML
 * into the directory named by the "java.io.tmpdir" system property, ready for further processing.
 *
 * @param absoluteFilename - name of the file to convert.
 * @return String - location of the converted file, or null if absoluteFilename refers to a directory.
 * @throws Exception if the source cannot be cleaned or the XML cannot be written.
 */
public String convertToXML(String absoluteFilename) throws Exception {
    FileHandler fromSelIDE = new FileHandler(absoluteFilename);
    // Reject directories before the output handler is constructed, so nothing is set up for
    // output when there is nothing to convert.
    if (fromSelIDE.getFile().isDirectory()) {
        LOGGER.error("Cannot convert directory {} into a Selenium Test!", fromSelIDE.getFileName());
        return null;
    }
    FileHandler toXML = new FileHandler(System.getProperty("java.io.tmpdir") + File.separator + fromSelIDE.getFileName() + ".xml", true);
    try {
        //Clean up html so that we can read it as XML properly
        HtmlCleaner cleaner = new HtmlCleaner();
        // These are the cleaner's own properties, so configuring them configures the cleaner itself.
        CleanerProperties xmlPrefs = cleaner.getProperties();
        xmlPrefs.setUseEmptyElementTags(true);
        xmlPrefs.setTranslateSpecialEntities(true);
        xmlPrefs.setTransResCharsToNCR(true);
        xmlPrefs.setOmitComments(true);
        xmlPrefs.setOmitDoctypeDeclaration(true);
        xmlPrefs.setNamespacesAware(false);
        TagNode tagNode = cleaner.clean(fromSelIDE.getFile());
        new PrettyXmlSerializer(xmlPrefs).writeToStream(tagNode, toXML.getWritableFileOutputStream(), "utf-8");
    } finally {
        // Always release the output handler, even when cleaning or serialization fails.
        toXML.close();
    }
    return toXML.getAbsoluteFile();
}
use of org.htmlcleaner.HtmlCleaner in project k-9 by k9mail.
the class HtmlSignatureRemover method stripSignature.
/**
 * Cuts the content at the first match of DASH_SIGNATURE_HTML that lies outside of any
 * blockquote, then runs the result through HtmlCleaner and returns the serialized HTML.
 *
 * Nothing is cut when the number of BLOCKQUOTE_START matches differs from the number of
 * BLOCKQUOTE_END matches; the content is still cleaned and re-serialized in that case.
 *
 * @param content HTML text to process.
 * @return the cleaned HTML, truncated before the signature where one was stripped.
 */
public static String stripSignature(String content) {
    Matcher signature = DASH_SIGNATURE_HTML.matcher(content);
    if (signature.find()) {
        // Record the offsets of every opening and closing blockquote tag.
        List<Integer> openings = new ArrayList<>();
        Matcher openTag = BLOCKQUOTE_START.matcher(content);
        while (openTag.find()) {
            openings.add(openTag.start());
        }
        List<Integer> closings = new ArrayList<>();
        Matcher closeTag = BLOCKQUOTE_END.matcher(content);
        while (closeTag.find()) {
            closings.add(closeTag.start());
        }

        int openCount = openings.size();
        int closeCount = closings.size();
        if (openCount != closeCount) {
            Timber.d("There are %d <blockquote> tags, but %d </blockquote> tags. Refusing to strip.", openCount, closeCount);
        } else if (openings.isEmpty()) {
            // No blockquotes at all: the initial match is the signature.
            content = content.substring(0, signature.start());
        } else {
            // Signatures inside blockquotes are quoted text and are left alone, so only the
            // stretches outside of blockquotes are searched. First: before the first opening tag.
            signature.region(0, openings.get(0));
            if (signature.find()) {
                content = content.substring(0, signature.start());
            } else {
                int lastIndex = closeCount - 1;
                // Next: each gap between a closing tag and the following opening tag.
                for (int gap = 0; gap < lastIndex; gap++) {
                    int gapStart = closings.get(gap);
                    int gapEnd = openings.get(gap + 1);
                    if (gapStart >= gapEnd) {
                        continue;
                    }
                    signature.region(gapStart, gapEnd);
                    if (signature.find()) {
                        content = content.substring(0, signature.start());
                        break;
                    }
                }
                // Finally: whatever follows the last closing tag.
                int tailStart = closings.get(lastIndex);
                if (tailStart < content.length()) {
                    signature.region(tailStart, content.length());
                    if (signature.find()) {
                        content = content.substring(0, signature.start());
                    }
                }
            }
        }
    }

    // Cutting the signature can also cut off closing tags; a pass through HtmlCleaner
    // repairs that and tidies the HTML of the quoted message.
    HtmlCleaner htmlCleaner = new HtmlCleaner();
    CleanerProperties cleanerProperties = htmlCleaner.getProperties();
    // see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
    cleanerProperties.setNamespacesAware(false);
    cleanerProperties.setAdvancedXmlEscape(false);
    cleanerProperties.setOmitXmlDeclaration(true);
    cleanerProperties.setOmitDoctypeDeclaration(false);
    cleanerProperties.setTranslateSpecialEntities(false);
    cleanerProperties.setRecognizeUnicodeChars(false);
    TagNode root = htmlCleaner.clean(content);
    return new SimpleHtmlSerializer(cleanerProperties).getAsString(root, "UTF8");
}
use of org.htmlcleaner.HtmlCleaner in project webmagic by code4craft.
the class Xpath2Selector method selectList.
/**
 * Evaluates the XPath expression against the given HTML and returns every match as a string.
 *
 * The text is cleaned with HtmlCleaner and converted to a DOM. The expression is evaluated
 * as a node set first and, if that throws an XPathExpressionException, as a string.
 * Attribute and text nodes contribute their text content; any other node is serialized
 * without an XML declaration. Any exception is logged, and whatever was collected up to
 * that point is returned.
 *
 * @param text HTML text to select from.
 * @return the list of selected values; empty if nothing matched or an error occurred early.
 */
@Override
public List<String> selectList(String text) {
    List<String> selected = new ArrayList<String>();
    try {
        TagNode root = new HtmlCleaner().clean(text);
        Document dom = new DomSerializer(new CleanerProperties()).createDOM(root);

        Object evaluated;
        try {
            evaluated = xPathExpression.evaluate(dom, XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            // Not a node-set expression: evaluate it as a string instead.
            evaluated = xPathExpression.evaluate(dom, XPathConstants.STRING);
        }

        if (!(evaluated instanceof NodeList)) {
            selected.add(evaluated.toString());
            return selected;
        }

        NodeList nodes = (NodeList) evaluated;
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        int count = nodes.getLength();
        for (int index = 0; index < count; index++) {
            Node node = nodes.item(index);
            short type = node.getNodeType();
            if (type == Node.ATTRIBUTE_NODE || type == Node.TEXT_NODE) {
                selected.add(node.getTextContent());
                continue;
            }
            // Element-like nodes are rendered as markup into a fresh buffer.
            StringWriter buffer = new StringWriter();
            serializer.transform(new DOMSource(node), new StreamResult(buffer));
            selected.add(buffer.toString());
        }
    } catch (Exception e) {
        logger.error("select text error! " + xpathStr, e);
    }
    return selected;
}
use of org.htmlcleaner.HtmlCleaner in project webmagic by code4craft.
the class Xpath2Selector method select.
/**
 * Evaluates the XPath expression against the given HTML and returns the first match.
 *
 * The text is cleaned with HtmlCleaner and converted to a DOM. The expression is evaluated
 * as a node set first and, if that throws an XPathExpressionException, as a string.
 * For a node set, the first node is returned: its text content for attribute and text
 * nodes, otherwise its serialized markup without an XML declaration.
 *
 * @param text HTML text to select from.
 * @return the selected value, or null when the node set is empty or an error was logged.
 */
@Override
public String select(String text) {
    try {
        TagNode root = new HtmlCleaner().clean(text);
        Document dom = new DomSerializer(new CleanerProperties()).createDOM(root);

        Object evaluated;
        try {
            evaluated = xPathExpression.evaluate(dom, XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            // Not a node-set expression: evaluate it as a string instead.
            evaluated = xPathExpression.evaluate(dom, XPathConstants.STRING);
        }

        if (!(evaluated instanceof NodeList)) {
            return evaluated.toString();
        }

        NodeList nodes = (NodeList) evaluated;
        if (nodes.getLength() == 0) {
            return null;
        }

        Node first = nodes.item(0);
        short type = first.getNodeType();
        if (type == Node.ATTRIBUTE_NODE || type == Node.TEXT_NODE) {
            return first.getTextContent();
        }

        // Any other node is rendered as markup.
        StringWriter buffer = new StringWriter();
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        serializer.transform(new DOMSource(first), new StreamResult(buffer));
        return buffer.toString();
    } catch (Exception e) {
        logger.error("select text error! " + xpathStr, e);
    }
    return null;
}
use of org.htmlcleaner.HtmlCleaner in project webmagic by code4craft.
the class XpathSelectorTest method parserPerformanceTest.
/**
 * Manual benchmark: prints the HTML length, then the elapsed milliseconds for 2000 runs each of
 * HtmlCleaner parsing, HtmlCleaner XPath evaluation, Jsoup parsing, Jsoup CSS selection,
 * a second round of the two HtmlCleaner measurements, and finally Xsoup XPath evaluation.
 * Groups are separated by a line of "=" characters.
 */
@Ignore("take long time")
@Test
public void parserPerformanceTest() throws XPatherException {
    final int rounds = 2000;
    System.out.println(html.length());

    HtmlCleaner htmlCleaner = new HtmlCleaner();
    TagNode cleanedRoot = htmlCleaner.clean(html);
    Document jsoupDocument = Jsoup.parse(html);
    long startedAt;
    long elapsed;

    // HtmlCleaner: parse, then XPath.
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        htmlCleaner.clean(html);
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        cleanedRoot.evaluateXPath("//a");
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);
    System.out.println("=============");

    // Jsoup: parse, then CSS select.
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        Jsoup.parse(html);
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        jsoupDocument.select("a");
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);
    System.out.println("=============");

    // HtmlCleaner again: parse, then XPath.
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        htmlCleaner.clean(html);
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);

    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        cleanedRoot.evaluateXPath("//a");
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);
    System.out.println("=============");

    // Xsoup: precompiled XPath over the Jsoup document.
    XPathEvaluator compiledXPath = Xsoup.compile("//a");
    startedAt = System.currentTimeMillis();
    for (int round = 0; round < rounds; round++) {
        compiledXPath.evaluate(jsoupDocument);
    }
    elapsed = System.currentTimeMillis() - startedAt;
    System.out.println(elapsed);
}
Aggregations