Use of org.apache.tika.sax.ContentHandlerDecorator in the lucene-solr project by Apache.
Class TikaEntityProcessor, method getHtmlHandler.
/**
 * Builds a content handler that serializes the SAX events it receives as
 * HTML to the given writer.
 * <p>
 * Elements reported in the XHTML namespace are forwarded with a null
 * namespace URI, the start and end events of elements whose local name is
 * "head" are not forwarded, and prefix mapping events are ignored.
 *
 * @param writer destination of the serialized HTML
 * @return content handler decorating the HTML serializer
 * @throws TransformerConfigurationException if the transformer handler cannot be created
 */
private static ContentHandler getHtmlHandler(Writer writer) throws TransformerConfigurationException {
    TransformerFactory transformerFactory = TransformerFactory.newInstance();
    TransformerHandler serializer = ((SAXTransformerFactory) transformerFactory).newTransformerHandler();
    // Keep this order: the output method is configured before the result is attached.
    serializer.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
    serializer.setResult(new StreamResult(writer));
    return new ContentHandlerDecorator(serializer) {
        @Override
        public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException {
            if ("head".equals(localName)) {
                return;
            }
            String forwardedUri = XHTMLContentHandler.XHTML.equals(uri) ? null : uri;
            super.startElement(forwardedUri, localName, name, atts);
        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            if ("head".equals(localName)) {
                return;
            }
            String forwardedUri = XHTMLContentHandler.XHTML.equals(uri) ? null : uri;
            super.endElement(forwardedUri, localName, name);
        }

        @Override
        public void startPrefixMapping(String prefix, String uri) {
            // Intentionally ignored: prefix mappings are not passed to the serializer.
        }

        @Override
        public void endPrefixMapping(String prefix) {
            // Intentionally ignored: prefix mappings are not passed to the serializer.
        }
    };
}
Use of org.apache.tika.sax.ContentHandlerDecorator in the tika project by Apache.
Class ContentHandlerExample, method parseToPlainTextChunks.
/**
 * Example of extracting the plain text in chunks, with each chunk
 * of no more than a certain maximum size.
 * <p>
 * Text from a {@code characters} event is appended to the current chunk
 * when the result still fits within {@code MAXIMUM_TEXT_CHUNK_SIZE};
 * otherwise a new chunk is started. An event that is itself longer than the
 * maximum is split across several chunks so that the size limit holds.
 * The returned list always starts with the initial (possibly empty) chunk.
 *
 * @return the extracted text, in document order, split into chunks
 * @throws IOException if the test2.doc resource cannot be found or read
 * @throws SAXException if the parser reports a SAX error
 * @throws TikaException if the parser reports a Tika error
 */
public List<String> parseToPlainTextChunks() throws IOException, SAXException, TikaException {
    final List<String> chunks = new ArrayList<>();
    chunks.add("");
    ContentHandlerDecorator handler = new ContentHandlerDecorator() {
        @Override
        public void characters(char[] ch, int start, int length) {
            int lastIndex = chunks.size() - 1;
            String lastChunk = chunks.get(lastIndex);
            if (lastChunk.length() + length <= MAXIMUM_TEXT_CHUNK_SIZE) {
                chunks.set(lastIndex, lastChunk + new String(ch, start, length));
                return;
            }
            // Does not fit into the current chunk: start new chunk(s), cutting the
            // text into pieces so that no single chunk exceeds the maximum size.
            // NOTE(review): assumes MAXIMUM_TEXT_CHUNK_SIZE is an int; the floor of 1
            // only guards against a non-positive limit causing an endless loop.
            // NOTE(review): a cut may fall between the two chars of a surrogate pair.
            int pieceSize = Math.max(1, MAXIMUM_TEXT_CHUNK_SIZE);
            for (int offset = 0; offset < length; offset += pieceSize) {
                int pieceLength = Math.min(pieceSize, length - offset);
                chunks.add(new String(ch, start + offset, pieceLength));
            }
        }
    };
    AutoDetectParser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test2.doc")) {
        // getResourceAsStream returns null when the resource is missing; fail with a
        // clear message instead of handing a null stream to the parser.
        if (stream == null) {
            throw new IOException("Resource test2.doc not found relative to ContentHandlerExample");
        }
        parser.parse(stream, handler, metadata);
        return chunks;
    }
}
Use of org.apache.tika.sax.ContentHandlerDecorator in the tika project by Apache.
Class TikaGUI, method getHtmlHandler.
/**
 * Creates and returns a content handler that turns XHTML input to
 * simplified HTML output that can be correctly parsed and displayed
 * by {@link JEditorPane}.
 * <p>
 * The returned content handler is set to output <code>html</code>
 * to the given writer. The XHTML namespace is removed from the output
 * to prevent the serializer from using the {@code <tag/>} empty element
 * syntax that causes extra "&gt;" characters to be displayed.
 * The {@code <head>} tags are dropped to prevent the serializer from
 * generating a {@code <META>} content type tag that makes
 * {@link JEditorPane} fail thinking that the document character set
 * is inconsistent.
 * <p>
 * Additionally, it will use ImageSavingParser to re-write embedded:(image)
 * image links to be file:///(temporary file) so that they can be loaded.
 * The rewrite is done on a copy of the attributes, so the attribute object
 * supplied by the event source is never modified.
 *
 * @param writer output writer
 * @return HTML content handler
 * @throws TransformerConfigurationException if an error occurs
 */
private ContentHandler getHtmlHandler(Writer writer) throws TransformerConfigurationException {
    SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance();
    TransformerHandler handler = factory.newTransformerHandler();
    handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
    handler.setResult(new StreamResult(writer));
    return new ContentHandlerDecorator(handler) {
        @Override
        public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException {
            if (XHTMLContentHandler.XHTML.equals(uri)) {
                uri = null;
            }
            if ("head".equals(localName)) {
                // Only the head start tag itself is skipped.
                return;
            }
            Attributes forwarded = "img".equals(localName) ? rewriteEmbeddedImageSources(atts) : atts;
            super.startElement(uri, localName, name, forwarded);
        }

        @Override
        public void endElement(String uri, String localName, String name) throws SAXException {
            if (XHTMLContentHandler.XHTML.equals(uri)) {
                uri = null;
            }
            if (!"head".equals(localName)) {
                super.endElement(uri, localName, name);
            }
        }

        @Override
        public void startPrefixMapping(String prefix, String uri) {
            // Intentionally ignored: prefix mappings are not passed to the serializer.
        }

        @Override
        public void endPrefixMapping(String prefix) {
            // Intentionally ignored: prefix mappings are not passed to the serializer.
        }
    };
}

/**
 * Returns a copy of the given attributes in which every {@code src} value
 * starting with {@code embedded:} is replaced by the URI of the file
 * returned by {@code imageParser.requestSave} for the embedded name.
 * If saving fails, the original value is kept and an error is printed.
 *
 * @param atts attributes of an {@code img} element; not modified
 * @return a new attribute set with embedded image sources rewritten
 */
private Attributes rewriteEmbeddedImageSources(Attributes atts) {
    // Always copy: the incoming object belongs to the event source and must not be altered.
    AttributesImpl rewritten = new AttributesImpl(atts);
    for (int i = 0; i < rewritten.getLength(); i++) {
        if (!"src".equals(rewritten.getLocalName(i))) {
            continue;
        }
        String src = rewritten.getValue(i);
        if (src == null || !src.startsWith("embedded:")) {
            continue;
        }
        String filename = src.substring(src.indexOf(':') + 1);
        try {
            File img = imageParser.requestSave(filename);
            rewritten.setValue(i, img.toURI().toString());
        } catch (IOException e) {
            // Best effort: keep the original src so the html viewer shows a
            // broken image too, and report why the temp file could not be created.
            System.err.println("Error creating temp image file " + filename + ": " + e);
        }
    }
    return rewritten;
}
Aggregations