use of org.htmlcleaner.TagNode in project Ebselen by Ardesco.
the class IDEToEbselen method convertToXML.
/**
 * Cleans the given Selenium IDE HTML file with HtmlCleaner and writes the result as
 * pretty-printed XML into the JVM temp directory ({@code java.io.tmpdir}), using the
 * source file name with an {@code .xml} suffix.
 *
 * @param absoluteFilename - name of the file to convert.
 * @return String - location of the converted file, or {@code null} when
 *         {@code absoluteFilename} refers to a directory.
 * @throws Exception if the source cannot be cleaned or the XML cannot be written.
 */
public String convertToXML(String absoluteFilename) throws Exception {
    FileHandler fromSelIDE = new FileHandler(absoluteFilename);
    // Reject directories before touching the output location so that no stray
    // temp file handler is created for an input we are going to refuse anyway.
    if (fromSelIDE.getFile().isDirectory()) {
        LOGGER.error("Cannot convert directory {} into a Selenium Test!", fromSelIDE.getFileName());
        return null;
    }

    // Clean up html so that we can read it as XML properly
    HtmlCleaner cleaner = new HtmlCleaner();
    CleanerProperties xmlPrefs = cleaner.getProperties();
    xmlPrefs.setUseEmptyElementTags(true);
    xmlPrefs.setTranslateSpecialEntities(true);
    xmlPrefs.setTransResCharsToNCR(true);
    xmlPrefs.setOmitComments(true);
    xmlPrefs.setOmitDoctypeDeclaration(true);
    xmlPrefs.setNamespacesAware(false);
    // xmlPrefs is the cleaner's own properties object, so the cleaner configured
    // above can be used directly instead of building a second instance.
    TagNode tagNode = cleaner.clean(fromSelIDE.getFile());

    FileHandler toXML = new FileHandler(System.getProperty("java.io.tmpdir") + File.separator + fromSelIDE.getFileName() + ".xml", true);
    try {
        new PrettyXmlSerializer(xmlPrefs).writeToStream(tagNode, toXML.getWritableFileOutputStream(), "utf-8");
    } finally {
        // Always release the output handle, even when serialization fails.
        toXML.close();
    }
    return toXML.getAbsoluteFile();
}
use of org.htmlcleaner.TagNode in project stanbol by apache.
the class DomSerializer2 method createSubnodes.
/**
 * Appends a DOM node to {@code element} for every entry of {@code tagChildren},
 * descending recursively into nested tags and nested lists.
 *
 * <ul>
 *   <li>{@code CommentToken} entries become DOM comments.</li>
 *   <li>{@code ContentToken} entries become text nodes, or CDATA sections when the
 *       parent element is {@code script}/{@code style} and the cleaner properties
 *       request CDATA for those elements.</li>
 *   <li>{@code TagNode} entries become child elements whose own children are
 *       processed recursively.</li>
 *   <li>{@code List} entries are flattened into the same parent element.</li>
 * </ul>
 * Entries of any other type are ignored.
 *
 * @param document    DOM document used as the node factory
 * @param element     parent element that receives the created nodes
 * @param tagChildren child items to convert; {@code null} is treated as "nothing to do"
 */
private void createSubnodes(Document document, Element element, List tagChildren) {
    if (tagChildren == null) {
        return;
    }
    for (Object child : tagChildren) {
        if (child instanceof CommentToken) {
            String commentText = ((CommentToken) child).getContent().toString();
            element.appendChild(document.createComment(commentText));
        } else if (child instanceof ContentToken) {
            String text = ((ContentToken) child).getContent();
            String parentName = element.getNodeName();
            boolean asCdata = props.isUseCdataForScriptAndStyle()
                    && ("script".equalsIgnoreCase(parentName) || "style".equalsIgnoreCase(parentName));
            // CDATA content is emitted verbatim; everything else is escaped when enabled.
            if (escapeXml && !asCdata) {
                text = escapeXml(text, props, true);
            }
            if (asCdata) {
                element.appendChild(document.createCDATASection(text));
            } else {
                element.appendChild(document.createTextNode(text));
            }
        } else if (child instanceof TagNode) {
            TagNode childTag = (TagNode) child;
            Element childElement = document.createElement(childTag.getName());
            setAttributes(childTag, childElement);
            // Build the subtree first, then attach it to the parent.
            createSubnodes(document, childElement, childTag.getChildren());
            element.appendChild(childElement);
        } else if (child instanceof List) {
            createSubnodes(document, element, (List) child);
        }
    }
}
use of org.htmlcleaner.TagNode in project k-9 by k9mail.
the class HtmlSanitizer method sanitize.
/**
 * Parses {@code html} with the shared {@code HTML_CLEANER}, hands the resulting tree to
 * {@code removeMetaRefresh}, and serializes the tree back to a string.
 *
 * @param html the HTML markup to sanitize
 * @return the serialized form of the processed tree
 */
public String sanitize(String html) {
    final TagNode cleanedTree = HTML_CLEANER.clean(html);
    removeMetaRefresh(cleanedTree);

    String serialized = HTML_SERIALIZER.getAsString(cleanedTree, "UTF8");
    return serialized;
}
use of org.htmlcleaner.TagNode in project k-9 by k9mail.
the class HtmlSignatureRemover method stripSignature.
/**
 * Cuts {@code content} at the first match of {@code DASH_SIGNATURE_HTML} that lies outside
 * any {@code <blockquote>} section, then runs the result through HtmlCleaner and returns
 * the re-serialized HTML.
 *
 * <p>The search order is: before the first opening blockquote tag, then each gap between a
 * closing tag and the next opening tag, then after the last closing tag. If the number of
 * opening and closing blockquote tags differs, nothing is stripped. The HtmlCleaner pass
 * runs in every case, including when no signature match was found at all.
 *
 * <p>NOTE(review): the patterns {@code DASH_SIGNATURE_HTML}, {@code BLOCKQUOTE_START} and
 * {@code BLOCKQUOTE_END} are defined outside this block; presumably they match the
 * "-- " signature delimiter and blockquote tags - confirm against their definitions.
 *
 * @param content HTML text that may contain a signature
 * @return the cleaned HTML, truncated at the signature when one was found
 */
public static String stripSignature(String content) {
Matcher dashSignatureHtml = DASH_SIGNATURE_HTML.matcher(content);
// Only bother with blockquote analysis if a signature marker exists anywhere.
if (dashSignatureHtml.find()) {
Matcher blockquoteStart = BLOCKQUOTE_START.matcher(content);
Matcher blockquoteEnd = BLOCKQUOTE_END.matcher(content);
// Offsets of every opening / closing blockquote match, in document order.
List<Integer> start = new ArrayList<>();
List<Integer> end = new ArrayList<>();
while (blockquoteStart.find()) {
start.add(blockquoteStart.start());
}
while (blockquoteEnd.find()) {
end.add(blockquoteEnd.start());
}
if (start.size() != end.size()) {
// Unbalanced tags: the offsets cannot be paired up, so leave content untouched.
Timber.d("There are %d <blockquote> tags, but %d </blockquote> tags. Refusing to strip.", start.size(), end.size());
} else if (start.size() > 0) {
// Ignore quoted signatures in blockquotes.
// region() resets the matcher, so each find() below only sees the given window
// of the ORIGINAL string the matcher was created on.
dashSignatureHtml.region(0, start.get(0));
if (dashSignatureHtml.find()) {
// before first <blockquote>.
content = content.substring(0, dashSignatureHtml.start());
} else {
for (int i = 0; i < start.size() - 1; i++) {
// Gap between the i-th closing tag and the (i+1)-th opening tag; the guard
// skips pairs where that gap is empty or inverted (e.g. nested blockquotes).
if (end.get(i) < start.get(i + 1)) {
dashSignatureHtml.region(end.get(i), start.get(i + 1));
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
break;
}
}
}
// NOTE(review): this check also runs after a successful cut in the loop above; it
// then compares against the already shortened content.length().
if (end.get(end.size() - 1) < content.length()) {
// after last </blockquote>.
dashSignatureHtml.region(end.get(end.size() - 1), content.length());
if (dashSignatureHtml.find()) {
content = content.substring(0, dashSignatureHtml.start());
}
}
}
} else {
// No blockquotes found.
// The matcher still holds the match from the initial find() above.
content = content.substring(0, dashSignatureHtml.start());
}
}
// Fix the stripping off of closing tags if a signature was stripped,
// as well as clean up the HTML of the quoted message.
HtmlCleaner cleaner = new HtmlCleaner();
// These are the cleaner's own properties, so the settings below affect cleaner.clean().
CleanerProperties properties = cleaner.getProperties();
// see http://htmlcleaner.sourceforge.net/parameters.php for descriptions
properties.setNamespacesAware(false);
properties.setAdvancedXmlEscape(false);
properties.setOmitXmlDeclaration(true);
properties.setOmitDoctypeDeclaration(false);
properties.setTranslateSpecialEntities(false);
properties.setRecognizeUnicodeChars(false);
TagNode node = cleaner.clean(content);
// Serialize with the same properties that were used for cleaning.
SimpleHtmlSerializer htmlSerialized = new SimpleHtmlSerializer(properties);
content = htmlSerialized.getAsString(node, "UTF8");
return content;
}
use of org.htmlcleaner.TagNode in project webmagic by code4craft.
the class Xpath2Selector method selectList.
/**
 * Evaluates the configured XPath expression against {@code text} after cleaning it with
 * HtmlCleaner and converting it to a DOM document.
 *
 * <p>The expression is evaluated as a node set first; if that raises an
 * {@code XPathExpressionException} it is evaluated again as a string. Attribute and text
 * nodes contribute their text content, any other node is serialized as XML without an
 * XML declaration. Any exception is logged and the results collected so far are returned.
 *
 * @param text the HTML markup to query
 * @return the selected values; empty when nothing matched or an error occurred early
 */
@Override
public List<String> selectList(String text) {
    List<String> selected = new ArrayList<String>();
    try {
        TagNode root = new HtmlCleaner().clean(text);
        Document dom = new DomSerializer(new CleanerProperties()).createDOM(root);

        Object evaluated;
        try {
            evaluated = xPathExpression.evaluate(dom, XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            // Expression does not yield a node set; fall back to its string value.
            evaluated = xPathExpression.evaluate(dom, XPathConstants.STRING);
        }

        if (!(evaluated instanceof NodeList)) {
            selected.add(evaluated.toString());
            return selected;
        }

        NodeList nodes = (NodeList) evaluated;
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
        for (int idx = 0; idx < nodes.getLength(); idx++) {
            Node node = nodes.item(idx);
            short type = node.getNodeType();
            boolean plainValue = type == Node.ATTRIBUTE_NODE || type == Node.TEXT_NODE;
            if (plainValue) {
                selected.add(node.getTextContent());
                continue;
            }
            // Structured nodes are rendered as an XML fragment.
            StringWriter buffer = new StringWriter();
            serializer.transform(new DOMSource(node), new StreamResult(buffer));
            selected.add(buffer.toString());
        }
    } catch (Exception e) {
        logger.error("select text error! " + xpathStr, e);
    }
    return selected;
}
Aggregations