Use of org.apache.stanbol.enhancer.engines.metaxa.core.html.HtmlTextExtractUtil in project stanbol by apache.
From the class SimpleMailExtractor, method extractTextFromHtml.
/**
 * Extracts metadata and full text from an HTML string and returns them as one string.
 *
 * <p>The HTML is handed to a {@link HtmlTextExtractUtil}, which writes its findings into a
 * temporary RDF container created for the URI described by {@code rdf}. The title, author,
 * description, keywords and text read back from that container's model are then passed, in that
 * order, to {@code append(...)} to build the result.
 *
 * @param string the HTML markup to process
 * @param charset name of the character encoding passed to the extractor; when it is non-null and
 *        supported it is also used to encode {@code string} into the byte stream that is parsed,
 *        otherwise the platform default encoding is used
 * @param rdf container whose described URI identifies the resource being extracted
 * @return the collected metadata and text
 * @throws ExtractorException if the HTML extractor cannot be initialized (the extraction call
 *         itself may raise it as well)
 */
protected String extractTextFromHtml(String string, String charset, RDFContainer rdf) throws ExtractorException {
    // parse the HTML and extract full-text and metadata
    HtmlTextExtractUtil extractor;
    try {
        extractor = new HtmlTextExtractUtil();
    } catch (InitializationException e) {
        // keep the original exception as cause so its stack trace is not lost
        throw new ExtractorException("Could not initialize HtmlExtractor: " + e.getMessage(), e);
    }

    // The bytes must be encoded with the same charset that is announced to the extractor;
    // encoding with the platform default would corrupt non-ASCII text whenever the two differ.
    byte[] htmlBytes;
    if (charset == null) {
        htmlBytes = string.getBytes();
    } else {
        try {
            htmlBytes = string.getBytes(charset);
        } catch (java.io.UnsupportedEncodingException e) {
            // unknown charset name: fall back to the platform default encoding
            logger.debug("Unsupported charset '{}', using platform default encoding", charset);
            htmlBytes = string.getBytes();
        }
    }
    InputStream stream = new ByteArrayInputStream(htmlBytes);

    RDFContainerFactory containerFactory = new RDFContainerFactoryImpl();
    URI id = rdf.getDescribedUri();
    RDFContainer result = containerFactory.getRDFContainer(id);
    extractor.extract(id, charset, stream, result);
    Model meta = result.getModel();
    try {
        // append metadata and full-text to a string buffer
        StringBuilder buffer = new StringBuilder(32 * 1024);
        append(buffer, extractor.getTitle(meta), "\n");
        append(buffer, extractor.getAuthor(meta), "\n");
        append(buffer, extractor.getDescription(meta), "\n");
        List<String> keywords = extractor.getKeywords(meta);
        for (String kw : keywords) {
            append(buffer, kw, " ");
        }
        buffer.append("\n");
        append(buffer, extractor.getText(meta), " ");
        logger.debug("text extracted:\n{}", buffer);
        // return the buffer's content
        return buffer.toString();
    } finally {
        // release the temporary model even if reading the metadata fails
        meta.close();
    }
}
Aggregations