Use of org.apache.tika.sax.xpath.XPathParser in the Apache Tika project.
Class ContentHandlerExample, method parseOnePartToHTML.
/**
 * Example of extracting just one part of the document's body as an
 * HTML string, excluding the rest of the document.
 *
 * <p>Parses the {@code test2.doc} resource located relative to
 * {@code ContentHandlerExample} and keeps only the SAX events accepted by
 * the XPath matcher built below.
 *
 * @return the string form of the content handler once parsing has finished
 * @throws IOException if {@code test2.doc} cannot be found as a resource,
 *         or if raised while parsing or closing the stream
 * @throws SAXException if raised by {@code parser.parse}
 * @throws TikaException if raised by {@code parser.parse}
 */
public String parseOnePartToHTML() throws IOException, SAXException, TikaException {
    // Keep every node nested below a div that sits directly under html -> body.
    // NOTE: the expression has no predicate on the div's class attribute, so it
    // is not limited to a particular class such as "header".
    XPathParser xhtmlParser = new XPathParser("xhtml", XHTMLContentHandler.XHTML);
    Matcher divContentMatcher = xhtmlParser.parse("/xhtml:html/xhtml:body/xhtml:div/descendant::node()");
    ContentHandler handler = new MatchingContentHandler(new ToXMLContentHandler(), divContentMatcher);

    AutoDetectParser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test2.doc")) {
        // getResourceAsStream returns null (rather than throwing) when the resource
        // is absent; fail here with a clear message instead of passing null on.
        if (stream == null) {
            throw new IOException("Resource test2.doc not found relative to ContentHandlerExample");
        }
        parser.parse(stream, handler, metadata);
        return handler.toString();
    }
}
Aggregations