Use of org.cyberneko.html.parsers.DOMParser in the fess-crawler project (codelibs): class XpathTransformer, method storeData.
/**
 * Parses the HTML response body into a DOM and evaluates each configured
 * XPath rule from {@code fieldRuleMap}, appending the extracted values
 * (formatted per result type) into {@code resultData}.
 *
 * @param responseData crawled response whose body is parsed as HTML
 * @param resultData   receives the serialized extraction result
 * @throws CrawlingAccessException if the response body cannot be parsed
 */
@Override
protected void storeData(final ResponseData responseData, final ResultData resultData) {
    final DOMParser parser = getDomParser();
    try (final InputStream in = responseData.getResponseBody()) {
        final InputSource is = new InputSource(in);
        if (responseData.getCharSet() != null) {
            is.setEncoding(responseData.getCharSet());
        }
        parser.parse(is);
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
    }
    final Document document = parser.getDocument();
    final StringBuilder buf = new StringBuilder(1000);
    buf.append(getResultDataHeader());
    for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
        final String path = entry.getValue();
        try {
            final XObject xObj = getXPathAPI().eval(document, path);
            final int type = xObj.getType();
            // Serialize the XPath result according to its runtime type.
            switch (type) {
            case XObject.CLASS_BOOLEAN:
                final boolean b = xObj.bool();
                buf.append(getResultDataBody(entry.getKey(), Boolean.toString(b)));
                break;
            case XObject.CLASS_NUMBER:
                final double d = xObj.num();
                buf.append(getResultDataBody(entry.getKey(), Double.toString(d)));
                break;
            case XObject.CLASS_STRING:
                final String str = xObj.str();
                buf.append(getResultDataBody(entry.getKey(), str.trim()));
                break;
            case XObject.CLASS_NODESET:
                // A node-set rule yields one value per matched node.
                final NodeList nodeList = xObj.nodelist();
                final List<String> strList = new ArrayList<>();
                for (int i = 0; i < nodeList.getLength(); i++) {
                    final Node node = nodeList.item(i);
                    strList.add(node.getTextContent());
                }
                buf.append(getResultDataBody(entry.getKey(), strList));
                break;
            case XObject.CLASS_RTREEFRAG:
                final int rtf = xObj.rtf();
                buf.append(getResultDataBody(entry.getKey(), Integer.toString(rtf)));
                break;
            case XObject.CLASS_NULL:
            case XObject.CLASS_UNKNOWN:
            case XObject.CLASS_UNRESOLVEDVARIABLE:
            default:
                Object obj = xObj.object();
                if (obj == null) {
                    obj = "";
                }
                buf.append(getResultDataBody(entry.getKey(), obj.toString()));
                break;
            }
        } catch (final TransformerException e) {
            // BUGFIX: pass the exception to the logger so the failing XPath
            // evaluation keeps its stack trace instead of being silently lost.
            logger.warn("Could not parse a value of " + entry.getKey() + ":" + entry.getValue(), e);
        }
    }
    buf.append(getAdditionalData(responseData, document));
    buf.append(getResultDataFooter());
    final String data = buf.toString().trim();
    try {
        resultData.setData(data.getBytes(charsetName));
    } catch (final UnsupportedEncodingException e) {
        // Fall back to UTF-8 when the configured charset is unknown.
        if (logger.isInfoEnabled()) {
            logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
        }
        charsetName = Constants.UTF_8_CHARSET.name();
        resultData.setData(data.getBytes(Constants.UTF_8_CHARSET));
    }
    resultData.setEncoding(charsetName);
}
Use of org.cyberneko.html.parsers.DOMParser in the fess-crawler project (codelibs): class HtmlXpathExtractor, method getText.
/*
 * (non-Javadoc)
 *
 * @see org.codelibs.fess.crawler.extractor.Extractor#getText(java.io.InputStream,
 * java.util.Map)
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    try {
        // Buffer the stream so the encoding sniff can mark/reset it.
        final BufferedInputStream bufferedIn = new BufferedInputStream(in);
        final String encoding = getEncoding(bufferedIn);

        final InputSource source = new InputSource(bufferedIn);
        source.setEncoding(encoding);
        final DOMParser domParser = getDomParser();
        domParser.parse(source);
        final Document doc = domParser.getDocument();

        // Concatenate the text of every node selected by the target XPath.
        final NodeList matches = getXPathAPI().selectNodeList(doc, targetNodePath);
        final StringBuilder text = new StringBuilder(255);
        final int count = matches.getLength();
        for (int idx = 0; idx < count; idx++) {
            text.append(matches.item(idx).getTextContent()).append(' ');
        }

        // Collapse runs of whitespace into single spaces before returning.
        final String normalized = text.toString().replaceAll("\\s+", " ").trim();
        return new ExtractData(normalized);
    } catch (final Exception e) {
        throw new ExtractException(e);
    }
}
Use of org.cyberneko.html.parsers.DOMParser in the zm-mailbox project (Zimbra): class QuotedTextUtil, method getOriginalHtmlContent.
/**
 * Using the DOM structure of the message content, traverse node by node and
 * if we find a node that is recognized as a separator, remove all
 * subsequent elements.
 *
 * @param text the message content
 * @return original content if the quoted content was found otherwise the
 *         complete message content
 */
private String getOriginalHtmlContent(String text) {
    ArrayList<Node> nodeList = new ArrayList<Node>();
    Node previousNode = null, sepNode = null;
    LineType previousType = null;
    boolean done = false;
    DOMParser parser = new DOMParser();
    Document document;
    Node htmlNode = null;
    try {
        parser.parse(new InputSource(new StringReader(text)));
        document = parser.getDocument();
        htmlNode = document.getFirstChild();
        flatten(htmlNode, nodeList);
        for (int i = 0; i < nodeList.size(); i++) {
            Node currentNode = nodeList.get(i);
            if (currentNode.getNodeType() == ELEMENT_NODE) {
                currentNode.normalize();
            }
            String nodeName = currentNode.getNodeName() != null ? currentNode.getNodeName() : "";
            String nodeValue = currentNode.getNodeValue() != null ? currentNode.getNodeValue() : "";
            LineType type = checkNode(currentNode);
            /*
             * Check for a multi-element "wrote:" attribution (usually a
             * combo of #text and A nodes), for example:
             *
             * On Feb 28, 2014, at 3:42 PM, Joe Smith <<a
             * href="mailto:jsmith@zimbra.com"
             * target="_blank">jsmith@zimbra.com</a>> wrote:
             *
             * If the current node is a #text with a date or "On ...", find
             * #text nodes within the next ten nodes, concatenate them, and
             * check the result.
             */
            if (type == LineType.UNKNOWN && nodeName.equals("#text") && (MATCHER_ORIG_DATE.reset(nodeValue).matches() || MATCHER_ORIG_INTRO.reset(nodeValue).matches())) {
                String value = nodeValue;
                // BUGFIX: bound the lookahead so i + j never runs past the end
                // of nodeList (the original could throw IndexOutOfBoundsException
                // when the attribution appeared near the end of the document).
                for (int j = 1; j < 10 && i + j < nodeList.size(); j++) {
                    Node tempNode = nodeList.get(i + j);
                    if (tempNode != null && tempNode.getNodeName() != null && tempNode.getNodeName().equals("#text")) {
                        value += tempNode.getNodeValue();
                        // BUGFIX: the original wrote "/:$/".matches(value), which
                        // treats the literal "/:$/" as the INPUT and value as the
                        // PATTERN, so it was effectively never true. The intent
                        // (ported from the JavaScript regex /:$/) is to test
                        // whether the accumulated attribution text ends with a
                        // colon, i.e. looks like "... wrote:".
                        if (value.trim().endsWith(":")) {
                            type = getLineType(value.trim());
                            if (type == LineType.SEP_STRONG) {
                                i = i + j;
                                break;
                            }
                        }
                    }
                }
            }
            if (type != null) {
                // definite separator
                if (type == LineType.SEP_STRONG || type == LineType.WROTE_STRONG) {
                    sepNode = currentNode;
                    done = true;
                    break;
                }
                // some sort of line followed by a header
                if (type == LineType.HEADER && previousType == LineType.LINE) {
                    sepNode = previousNode;
                    done = true;
                    break;
                }
                previousNode = currentNode;
                previousType = type;
            }
        }
        if (sepNode != null) {
            // Remove the separator and everything after it.
            prune(sepNode, true);
        }
        if (done) {
            String originalText = getHtml(document);
            return (originalText == null || originalText.isEmpty()) ? text : originalText;
        }
    } catch (SAXException | IOException e) {
        ZimbraLog.soap.warn("Exception while removing quoted text from html message", e);
    }
    return text;
}
Use of org.cyberneko.html.parsers.DOMParser in the openolat project (klemens): class QuoteAndTagFilter, method filter.
/**
 * Parses the given HTML string into a DOM and delegates to
 * {@code scanNode} to collect its filtered text content.
 *
 * @param original the HTML text to filter
 * @return the filtered text, or {@code null} if the input cannot be parsed
 * @see org.olat.core.util.filter.Filter#filter(java.lang.String)
 */
@Override
public String filter(String original) {
    try {
        DOMParser parser = new DOMParser();
        parser.parse(new InputSource(new StringReader(original)));
        Document document = parser.getDocument();
        StringBuilder sb = new StringBuilder();
        scanNode(document, sb);
        return sb.toString();
    } catch (SAXException | IOException e) {
        // Multi-catch replaces the two identical catch blocks of the
        // original; behavior is unchanged (log the error, return null).
        log.error("", e);
        return null;
    }
}
Use of org.cyberneko.html.parsers.DOMParser in the intellij-community project (JetBrains): class FindJarFix, method initiateDownload.
/**
 * Fetches the HTML page at {@code url}, scans its link elements and starts a
 * download for every link whose URL ends with {@code jarName}.
 *
 * @param url     page listing candidate jar links
 * @param jarName expected file-name suffix of the jar to download
 */
private void initiateDownload(String url, String jarName) {
    DOMParser parser = new DOMParser();
    try {
        parser.parse(url);
        final Document doc = parser.getDocument();
        if (doc == null) {
            return;
        }
        final NodeList links = doc.getElementsByTagName(LINK_TAG_NAME);
        if (links == null) {
            return;
        }
        // Guard clauses replace the original five-level null-check pyramid;
        // the traversal and download behavior are unchanged.
        for (int i = 0; i < links.getLength(); i++) {
            final Node item = links.item(i);
            if (item == null) {
                continue;
            }
            final NamedNodeMap attributes = item.getAttributes();
            if (attributes == null) {
                continue;
            }
            final Node link = attributes.getNamedItem(LINK_ATTR_NAME);
            if (link == null) {
                continue;
            }
            final String jarUrl = link.getTextContent();
            if (jarUrl != null && jarUrl.endsWith(jarName)) {
                downloadJar(jarUrl, jarName);
            }
        }
    } catch (SAXException | IOException ignored) {
        // Deliberate best-effort: an unreachable or malformed page simply
        // means no download is initiated, so parse errors are swallowed.
    }
}
Aggregations