Use of org.codelibs.nekohtml.parsers.DOMParser in the fess project by codelibs.
From the class FessXpathTransformer, method storeData.
/**
 * Parses the response body into a DOM, evaluates every rule in {@code fieldRuleMap}
 * against it, and stores the serialized field map in {@code resultData}.
 *
 * <p>Boolean, number and string XPath results are stored via their string form. Every
 * other result type is resolved by selecting a single node and storing its text content
 * (or {@code null} when no node is selected); the node is passed through
 * {@code pruneNode} first when {@code fieldPrunedRuleMap} flags the field.</p>
 *
 * @param responseData the response providing the body stream, charset and URL
 * @param resultData the result object that receives the serialized data and the encoding
 * @throws CrawlingAccessException if the body cannot be parsed or the field map cannot
 *         be serialized
 */
@Override
protected void storeData(final ResponseData responseData, final ResultData resultData) {
    final DOMParser parser = getDomParser();
    try (final BufferedInputStream bis = new BufferedInputStream(responseData.getResponseBody())) {
        // Peek at the leading bytes and rewind unless they form a complete UTF-8 BOM,
        // so that nothing but a BOM is ever skipped before parsing.
        final byte[] bomBytes = new byte[UTF8_BOM_SIZE];
        bis.mark(UTF8_BOM_SIZE);
        final int size = bis.read(bomBytes);
        // Compare against the same constant that sizes the buffer and the mark limit
        // instead of a separate literal.
        if (size < UTF8_BOM_SIZE || !isUtf8BomBytes(bomBytes)) {
            bis.reset();
        }
        final InputSource is = new InputSource(bis);
        if (responseData.getCharSet() != null) {
            is.setEncoding(responseData.getCharSet());
        }
        parser.parse(is);
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
    }

    final Document document = parser.getDocument();
    processMetaRobots(responseData, resultData, document);
    processXRobotsTag(responseData, resultData);

    final Map<String, Object> dataMap = new LinkedHashMap<>();
    for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
        final String key = entry.getKey();
        final String path = entry.getValue();
        try {
            final XObject xObj = getXPathAPI().eval(document, path);
            switch (xObj.getType()) {
            case XObject.CLASS_BOOLEAN: {
                putResultDataBody(dataMap, key, Boolean.toString(xObj.bool()));
                break;
            }
            case XObject.CLASS_NUMBER: {
                putResultDataBody(dataMap, key, Double.toString(xObj.num()));
                break;
            }
            case XObject.CLASS_STRING: {
                putResultDataBody(dataMap, key, xObj.str());
                break;
            }
            default: {
                // Every remaining result type (node sets, result tree fragments, null,
                // unknown, unresolved variables) is resolved through a single-node lookup.
                final Boolean isPruned = fieldPrunedRuleMap.get(key);
                Node value = getXPathAPI().selectSingleNode(document, path);
                if (value != null && isPruned != null && isPruned.booleanValue()) {
                    value = pruneNode(value);
                }
                putResultDataBody(dataMap, key, value != null ? value.getTextContent() : null);
                break;
            }
            }
        } catch (final TransformerException e) {
            // A failing rule is logged and skipped so the remaining fields are still processed.
            logger.warn("Could not parse a value of {}:{}", key, path, e);
        }
    }

    putAdditionalData(dataMap, responseData, document);
    normalizeData(responseData, dataMap);

    try {
        resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
    }
    resultData.setEncoding(charsetName);
}
Use of org.codelibs.nekohtml.parsers.DOMParser in the fess project by codelibs.
From the class FessXpathTransformerTest, method getDocument.
/**
 * Parses the given markup with a fresh {@code DOMParser} and returns the resulting DOM.
 *
 * @param data the markup to parse; it is encoded as UTF-8 bytes before parsing
 * @return the document produced by the parser
 * @throws Exception if encoding or parsing fails
 */
private Document getDocument(final String data) throws Exception {
    final DOMParser domParser = new DOMParser();
    final byte[] utf8Bytes = data.getBytes("UTF-8");
    final InputSource source = new InputSource(new ByteArrayInputStream(utf8Bytes));
    domParser.parse(source);
    final Document parsed = domParser.getDocument();
    return parsed;
}
Use of org.codelibs.nekohtml.parsers.DOMParser in the fess project by codelibs.
From the class FessPropTest, method matchesTag.
/**
 * Wraps {@code text} in an {@code <html><body>} skeleton, parses it, and asks
 * {@code tag} whether it matches the node found at: document's first child, then that
 * node's last child, then that node's first child.
 *
 * @param tag the pruned-tag rule to test
 * @param text the markup fragment placed inside the body element
 * @return the result of {@code tag.matches(node)} for the located node
 * @throws Exception if encoding or parsing fails
 */
private boolean matchesTag(final PrunedTag tag, final String text) throws Exception {
    final DOMParser domParser = new DOMParser();
    final String markup = "<html><body>" + text + "</body></html>";
    final byte[] utf8Bytes = markup.getBytes("UTF-8");
    domParser.parse(new InputSource(new ByteArrayInputStream(utf8Bytes)));
    // Walk down step by step: first child of the document, its last child, then that
    // node's first child (presumably html -> body -> first body node; not verifiable here).
    final Node topLevel = domParser.getDocument().getFirstChild();
    final Node container = topLevel.getLastChild();
    final Node candidate = container.getFirstChild();
    return tag.matches(candidate);
}
Aggregations