Use of org.apache.tika.parser.html.BoilerpipeContentHandler in project camel by apache.
In the class TikaProducer, method getContentHandler.
/**
 * Selects the SAX {@link ContentHandler} that writes parse output to {@code outputStream},
 * based on the output format held by {@code configuration}.
 *
 * <ul>
 * <li>{@code xml} - the handler returned by {@code getTransformerHandler(outputStream, "xml", true)}</li>
 * <li>{@code text} - a {@code BodyContentHandler} over a writer using {@code this.encoding}</li>
 * <li>{@code textMain} - a {@code BoilerpipeContentHandler} over a writer using {@code this.encoding}</li>
 * <li>{@code html} - an {@code ExpandedTitleContentHandler} wrapping
 * {@code getTransformerHandler(outputStream, "html", true)}</li>
 * </ul>
 *
 * @param configuration supplies the requested output format
 * @param outputStream destination of the handler's output
 * @return the handler matching the requested format
 * @throws TransformerConfigurationException declared for the xml/html branches; presumably
 *         raised by {@code getTransformerHandler} (not visible here - confirm)
 * @throws UnsupportedEncodingException if {@code this.encoding} is not a supported charset name
 * @throws IllegalArgumentException if the format is not one of the four handled above
 */
private ContentHandler getContentHandler(TikaConfiguration configuration, OutputStream outputStream) throws TransformerConfigurationException, UnsupportedEncodingException {
    TikaParseOutputFormat outputFormat = configuration.getTikaParseOutputFormat();
    switch (outputFormat) {
    case xml:
        return getTransformerHandler(outputStream, "xml", true);
    case text:
        return new BodyContentHandler(new OutputStreamWriter(outputStream, this.encoding));
    case textMain:
        return new BoilerpipeContentHandler(new OutputStreamWriter(outputStream, this.encoding));
    case html:
        return new ExpandedTitleContentHandler(getTransformerHandler(outputStream, "html", true));
    default:
        // Report the value that was actually switched on, taken from the configuration
        // passed to this method, so the message always matches the rejected format.
        throw new IllegalArgumentException(String.format("Unknown format %s", outputFormat));
    }
}
Use of org.apache.tika.parser.html.BoilerpipeContentHandler in project Xponents by OpenSextant.
In the class TikaHTMLConverter, method conversionImplementation.
/**
 * A barebones HTML parser: runs the configured parser over {@code input} and packages the
 * extracted text, title, encoding and extra metadata into a {@link ConvertedDocument}.
 *
 * <pre>
 * TODO: mis-encoded HTML entities are not decoded
 * properly. E.g., finding "–" (82xx range is dashes, quotes) for
 * example, does not decode correctly unless the page encoding is declared as UTF-8.
 * </pre>
 *
 * @param input stream of HTML content; always closed before this method returns or throws
 * @param doc source file, may be {@code null}; when present it is passed to
 *        {@code parseHTMLMetadata} to collect additional metadata
 * @return the converted document with text, title, encoding and properties set
 * @throws IOException if parsing or metadata extraction fails for any reason; the
 *         original exception is kept as the cause
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream input, File doc) throws IOException {
    Metadata metadata = new Metadata();
    HashMap<String, String> moreMetadata = new HashMap<>();

    // HTML conversion here is simply not resetting its internal buffers;
    // it just accumulates and errors out when it reaches the maximum size.
    ContentHandler handler = new BodyContentHandler(maxHTMLDocumentSize);
    BoilerpipeContentHandler scrubbingHandler = null;
    if (scrubHTMLArticle) {
        // When article scrubbing is on, the boilerpipe handler wraps the body handler
        // and is the one handed to the parser.
        scrubbingHandler = new BoilerpipeContentHandler(handler);
    }

    try {
        parser.parse(input, (scrubHTMLArticle ? scrubbingHandler : handler), metadata, new ParseContext());
        if (doc != null) {
            parseHTMLMetadata(doc, moreMetadata);
        }
    } catch (Exception xerr) {
        throw new IOException("Unable to parse content", xerr);
    } finally {
        input.close();
    }

    ConvertedDocument textdoc = new ConvertedDocument(doc);
    textdoc.addTitle(metadata.get(TikaCoreProperties.TITLE));

    String text;
    if (scrubHTMLArticle) {
        text = scrubbingHandler.getTextDocument().getText(true, false);
    } else {
        text = handler.toString();
    }
    textdoc.setText(TextUtils.reduce_line_breaks(text));

    // -- Improve CHAR SET encoding answer.
    // Encode explicitly as UTF-8: String.getBytes() uses the platform default charset,
    // which replaces unmappable characters with '?' and could make non-ASCII text
    // look like pure ASCII on some platforms.
    byte[] data = textdoc.buffer.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    if (TextUtils.isASCII(data)) {
        textdoc.setEncoding("ASCII");
    } else {
        // Okay, okay... let Tika name whatever encoding it found or guessed at.
        textdoc.setEncoding(metadata.get(Metadata.CONTENT_ENCODING));
    }

    // Indicate if we tried to filter the article at all.
    textdoc.addProperty("filtered", scrubHTMLArticle);
    textdoc.addProperty("converter", TikaHTMLConverter.class.getName());

    // Iterating an empty map is a no-op, so no emptiness guard is needed.
    for (String k : moreMetadata.keySet()) {
        textdoc.addUserProperty(k, moreMetadata.get(k));
    }
    return textdoc;
}
Use of org.apache.tika.parser.html.BoilerpipeContentHandler in project tika by apache.
In the class TikaResource, method produceTextMain.
/**
 * Prepares a parse of the request body and returns a {@link StreamingOutput} that, when
 * written, runs the parse through a {@code BoilerpipeContentHandler} emitting to the
 * response stream via a {@code UTF_8} writer.
 *
 * The parser, metadata and parse context are set up (and the request logged) eagerly when
 * this method is called; the parse itself only runs inside {@code write}.
 *
 * @param is request body to parse
 * @param httpHeaders request headers, passed to {@code fillMetadata} and {@code fillParseContext}
 * @param info request URI info; used for request logging and its path is passed to {@code parse}
 * @return a streaming output that performs the parse when written
 */
public StreamingOutput produceTextMain(final InputStream is, @Context MultivaluedMap<String, String> httpHeaders, @Context final UriInfo info) {
    final Parser documentParser = createParser();
    final Metadata documentMetadata = new Metadata();
    final ParseContext parseContext = new ParseContext();

    fillMetadata(documentParser, documentMetadata, parseContext, httpHeaders);
    fillParseContext(parseContext, httpHeaders, documentParser);
    logRequest(LOG, info, documentMetadata);

    return new StreamingOutput() {
        public void write(OutputStream outputStream) throws IOException, WebApplicationException {
            // The argument is a Writer, so the Writer-based constructor is selected.
            ContentHandler mainTextHandler = new BoilerpipeContentHandler(new OutputStreamWriter(outputStream, UTF_8));
            parse(documentParser, LOG, info.getPath(), is, mainTextHandler, documentMetadata, parseContext);
        }
    };
}
Aggregations