Use of org.apache.nutch.parse.Parser in the Apache Nutch project.
Class FeedParser, method addToMap.
/**
 * Parses a single feed entry and records the outcome in {@code parseResult},
 * keyed by the entry's normalized and filtered link.
 *
 * <p>The entry text is the description value or, when that is absent, the
 * concatenation of the non-null values of the entry's contents. The text is
 * handed to the first parser that {@code parserFactory} returns for the
 * Content-Type currently held in the content metadata. If that parser yields a
 * parse for the link, its text, outlinks and metadata are stored with
 * {@code STATUS_SUCCESS}; otherwise the raw text is stored with
 * {@code STATUS_FAILURE} and no outlinks. In both cases the Content-Type entry
 * is removed from the content metadata that gets stored.
 *
 * <p>Nothing is added when normalizing or filtering the link throws, or when
 * either step yields {@code null}.
 *
 * @param parseResult result map that receives the entry's parse
 * @param feed        feed the entry belongs to; passed on to {@code addFields}
 * @param feedLink    link of the feed, stored under the "feed" parse metadata
 *                    key when non-null
 * @param entry       feed entry to parse
 * @param content     fetched feed content; its metadata object is reused and
 *                    modified by this method
 */
private void addToMap(ParseResult parseResult, SyndFeed feed, String feedLink, SyndEntry entry, Content content) {
  Metadata parseMeta = new Metadata();
  Metadata contentMeta = content.getMetadata();
  SyndContent description = entry.getDescription();

  String link = entry.getLink();
  try {
    link = normalizers.normalize(link, URLNormalizers.SCOPE_OUTLINK);
    if (link != null) {
      link = filters.filter(link);
    }
  } catch (Exception e) {
    // Best effort: an entry whose link cannot be normalized/filtered is skipped.
    // NOTE(review): prefer the class logger over printStackTrace() if one exists.
    e.printStackTrace();
    return;
  }
  if (link == null) {
    // Link was rejected by the normalizers or filters.
    return;
  }

  String title = stripTags(entry.getTitleEx());
  if (feedLink != null) {
    parseMeta.set("feed", feedLink);
  }
  addFields(parseMeta, contentMeta, feed, entry);

  // some item descriptions contain markup text in them,
  // so we temporarily set their content-type to parse them
  // with another plugin
  // NOTE(review): the content type is only read here; presumably addFields
  // is what sets it on contentMeta -- confirm.
  String contentType = contentMeta.get(Response.CONTENT_TYPE);

  String text = null;
  if (description != null) {
    text = description.getValue();
  }
  if (text == null) {
    // No usable description: fall back to the concatenated entry contents.
    List<?> contents = entry.getContents();
    StringBuilder buf = new StringBuilder();
    for (Object syndContent : contents) {
      String value = ((SyndContent) syndContent).getValue();
      // Skip missing values; appending null would insert the literal "null".
      if (value != null) {
        buf.append(value);
      }
    }
    text = buf.toString();
  }

  Parse parse = null;
  try {
    Parser parser = parserFactory.getParsers(contentType, link)[0];
    // Encode with a fixed charset so the bytes handed to the delegate parser
    // do not depend on the JVM's platform default encoding.
    byte[] textBytes = text.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    parse = parser.getParse(new Content(link, link, textBytes, contentType, contentMeta, conf)).get(link);
  } catch (ParserNotFound e) {
    // No parser for this content type: parse stays null and the entry is
    // recorded below with STATUS_FAILURE and its raw text.
  }

  if (parse != null) {
    ParseData data = parse.getData();
    // Drop the Content-Type used for the delegate parse before storing.
    data.getContentMeta().remove(Response.CONTENT_TYPE);
    mergeMetadata(data.getParseMeta(), parseMeta);
    parseResult.put(link, new ParseText(parse.getText()), new ParseData(ParseStatus.STATUS_SUCCESS, title, data.getOutlinks(), data.getContentMeta(), data.getParseMeta()));
  } else {
    contentMeta.remove(Response.CONTENT_TYPE);
    parseResult.put(link, new ParseText(text), new ParseData(ParseStatus.STATUS_FAILURE, title, new Outlink[0], contentMeta, parseMeta));
  }
}
Aggregations