Use of org.apache.any23.writer.TripleHandlerException in the Apache Nutch project.
Shown below: the filter method of the class Any23ParseFilter.
/**
 * Runs the Any23 extractors over the fetched document and stores every
 * extracted triple in the parse metadata under {@code ANY23_TRIPLES}.
 *
 * <p>The document is skipped (and {@code parseResult} returned untouched) when
 * its Content-Type is not in the list configured by
 * {@code ANY_23_CONTENT_TYPES_CONF} (defaults: {@code text/html},
 * {@code application/xhtml+xml}), or when {@code parseResult} holds no parse
 * for the content URL. The extractors to run are read from
 * {@code ANY_23_EXTRACTORS_CONF} (default: {@code html-head-meta}).
 *
 * @param content the fetched content; its bytes are decoded as UTF-8
 * @param parseResult the parse result whose entry for the content URL is enriched
 * @param metaTags not used by this filter
 * @param doc not used by this filter
 * @return the same {@code parseResult} instance that was passed in
 * @throws RuntimeException if the Any23 parser fails with a
 *         {@link TripleHandlerException}; the original exception is the cause
 * @see org.apache.nutch.parse.HtmlParseFilter#filter(Content, ParseResult, HTMLMetaTags, DocumentFragment)
 */
@Override
public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
  String[] supportedContentTypes = conf.getStrings(ANY_23_CONTENT_TYPES_CONF, "text/html", "application/xhtml+xml");
  String contentType = content.getContentType();
  if (supportedContentTypes != null && !Arrays.asList(supportedContentTypes).contains(contentType)) {
    LOG.debug("Ignoring document at {} because it has an unsupported Content-Type {}", content.getUrl(), contentType);
    return parseResult;
  }

  String[] extractorNames = conf.getStrings(ANY_23_EXTRACTORS_CONF, "html-head-meta");
  Any23Parser parser;
  try {
    // NOTE(review): the bytes are always decoded as UTF-8, independent of the
    // encoding the document itself declares - confirm this is intended.
    String htmlContent = new String(content.getContent(), Charset.forName("UTF-8"));
    parser = new Any23Parser(content.getUrl(), htmlContent, contentType, extractorNames);
  } catch (TripleHandlerException e) {
    // Keep the original exception as the cause so its stack trace is not lost.
    throw new RuntimeException("Error running Any23 parser: " + e.getMessage(), e);
  }

  Parse parse = parseResult.get(content.getUrl());
  if (parse == null) {
    // Nothing to enrich: avoid a bare NullPointerException when no parse is
    // registered under this URL.
    LOG.debug("No parse found for {}, skipping Any23 triples", content.getUrl());
    return parseResult;
  }

  Metadata metadata = parse.getData().getParseMeta();
  Set<String> triples = parser.getTriples();
  for (String triple : triples) {
    metadata.add(ANY23_TRIPLES, triple);
  }
  return parseResult;
}
Aggregations