use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class ContentHandlerExample method parseToPlainText.
/**
 * Shows how to pull the plain text out of a document.
 * Only the "body" part of the document is returned.
 *
 * @return the text collected by the body content handler
 * @throws IOException   if the document stream cannot be read
 * @throws SAXException  if the content handler fails while processing events
 * @throws TikaException if the parser cannot process the document
 */
public String parseToPlainText() throws IOException, SAXException, TikaException {
    BodyContentHandler bodyText = new BodyContentHandler();
    AutoDetectParser autoParser = new AutoDetectParser();
    Metadata docMetadata = new Metadata();

    // The sample document is looked up relative to ContentHandlerExample;
    // try-with-resources closes the stream once parsing is done.
    try (InputStream docStream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoParser.parse(docStream, bodyText, docMetadata);
        String plainText = bodyText.toString();
        return plainText;
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class ContentHandlerExample method parseOnePartToHTML.
/**
 * Shows how to extract a single part of the document's body,
 * returned as an HTML string, leaving out everything else.
 *
 * @return the string form of the handler after parsing
 * @throws IOException   if the document stream cannot be read
 * @throws SAXException  if the content handler fails while processing events
 * @throws TikaException if the parser cannot process the document
 */
public String parseOnePartToHTML() throws IOException, SAXException, TikaException {
    // Select every node nested below html -> body -> div.
    // NOTE(review): the original comment spoke of div (class=header), but the
    // expression itself carries no class predicate - confirm which is intended.
    XPathParser xpath = new XPathParser("xhtml", XHTMLContentHandler.XHTML);
    Matcher divDescendants = xpath.parse("/xhtml:html/xhtml:body/xhtml:div/descendant::node()");

    // Only events accepted by the matcher are forwarded to the XML serialising handler.
    ToXMLContentHandler xmlSink = new ToXMLContentHandler();
    ContentHandler handler = new MatchingContentHandler(xmlSink, divDescendants);

    AutoDetectParser autoParser = new AutoDetectParser();
    Metadata docMetadata = new Metadata();
    try (InputStream docStream = ContentHandlerExample.class.getResourceAsStream("test2.doc")) {
        autoParser.parse(docStream, handler, docMetadata);
        String html = handler.toString();
        return html;
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class ContentHandlerExample method parseToHTML.
/**
 * Shows how to obtain the document contents as HTML, returned as a string.
 *
 * @return the string form of the XML content handler after parsing
 * @throws IOException   if the document stream cannot be read
 * @throws SAXException  if the content handler fails while processing events
 * @throws TikaException if the parser cannot process the document
 */
public String parseToHTML() throws IOException, SAXException, TikaException {
    ContentHandler xmlSink = new ToXMLContentHandler();
    AutoDetectParser autoParser = new AutoDetectParser();
    Metadata docMetadata = new Metadata();

    // The stream is closed automatically when the try block exits.
    try (InputStream docStream = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoParser.parse(docStream, xmlSink, docMetadata);
        String html = xmlSink.toString();
        return html;
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class ContentHandlerExample method parseToPlainTextChunks.
/**
 * Example of extracting the plain text in chunks, with each chunk
 * of no more than {@code MAXIMUM_TEXT_CHUNK_SIZE} characters.
 * <p>
 * Text arrives through {@code characters} events. A run that still fits
 * into the current chunk is appended to it; a run that does not fit starts
 * a new chunk. A single run longer than the maximum is cut into several
 * pieces so that no chunk exceeds the limit.
 *
 * @return the list of text chunks, in document order
 * @throws IOException   if the document stream cannot be read
 * @throws SAXException  if the content handler fails while processing events
 * @throws TikaException if the parser cannot process the document
 */
public List<String> parseToPlainTextChunks() throws IOException, SAXException, TikaException {
    final List<String> chunks = new ArrayList<>();
    // Seed with an empty chunk so there is always a "current" chunk to extend.
    chunks.add("");
    ContentHandlerDecorator handler = new ContentHandlerDecorator() {
        @Override
        public void characters(char[] ch, int start, int length) {
            int offset = start;
            int remaining = length;
            while (remaining > 0) {
                int pieceLength = remaining;
                // A single event may carry more text than one chunk may hold;
                // cut it down. (A non-positive limit cannot be honoured, so the
                // run is then kept whole rather than looping forever.)
                if (MAXIMUM_TEXT_CHUNK_SIZE > 0 && pieceLength > MAXIMUM_TEXT_CHUNK_SIZE) {
                    pieceLength = MAXIMUM_TEXT_CHUNK_SIZE;
                    // Avoid separating a surrogate pair across two chunks.
                    if (pieceLength > 1 && Character.isHighSurrogate(ch[offset + pieceLength - 1])) {
                        pieceLength--;
                    }
                }
                appendPiece(new String(ch, offset, pieceLength));
                offset += pieceLength;
                remaining -= pieceLength;
            }
        }

        /** Adds one piece to the current chunk, or opens a new chunk when it would overflow. */
        private void appendPiece(String piece) {
            int lastIndex = chunks.size() - 1;
            String lastChunk = chunks.get(lastIndex);
            if (lastChunk.isEmpty()) {
                // Reuse the empty chunk instead of leaving it behind in the result.
                chunks.set(lastIndex, piece);
            } else if (lastChunk.length() + piece.length() > MAXIMUM_TEXT_CHUNK_SIZE) {
                chunks.add(piece);
            } else {
                chunks.set(lastIndex, lastChunk + piece);
            }
        }
    };
    AutoDetectParser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test2.doc")) {
        parser.parse(stream, handler, metadata);
        return chunks;
    }
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class RFC822ParserTest method testNormalZipAttachment.
/**
 * Test TIKA-1028 - Ensure we can get the contents of an
 * un-encrypted zip file
 */
@Test
public void testNormalZipAttachment() throws Exception {
    Parser parser = new RFC822Parser();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    // Presumably lets the mail parser hand attachments (such as the zip) to an
    // auto-detecting parser - confirm against RFC822Parser.
    context.set(Parser.class, new AutoDetectParser());
    ContentHandler handler = new BodyContentHandler();
    // Close the test document stream once parsing has finished.
    try (InputStream stream = getStream("test-documents/testRFC822_normal_zip")) {
        parser.parse(stream, handler, metadata, context);
    }

    // Check we got the metadata
    assertEquals("Juha Haaga <juha.haaga@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
    assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));

    // Render the extracted text once and reuse it for all content checks
    String content = handler.toString();

    // Check we got the message text, for both Plain Text and HTML
    assertContains("Includes a normal, unencrypted zip file", content);
    assertContains("This is the Plain Text part", content);
    assertContains("This is the HTML part", content);

    // We get both name and contents of the zip file's contents
    assertContains("text.txt", content);
    assertContains("TEST DATA FOR TIKA.", content);
    assertContains("This is text inside an unencrypted zip file", content);
    assertContains("TIKA-1028", content);

    assertEquals("<juha.haaga@gmail.com>", metadata.get("Message:Raw-Header:Return-Path"));
}
Aggregations