Search in sources :

Example 81 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

Class ContentHandlerExample, method parseToPlainText.

/**
 * Shows how to pull the plain text out of a document.
 * Only the "body" part of the document ends up in the result.
 *
 * @return whatever text the body handler holds once parsing is done
 */
public String parseToPlainText() throws IOException, SAXException, TikaException {
    final AutoDetectParser autoParser = new AutoDetectParser();
    final BodyContentHandler bodyText = new BodyContentHandler();
    // The sample document is looked up relative to ContentHandlerExample.
    try (InputStream in = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoParser.parse(in, bodyText, new Metadata());
    }
    return bodyText.toString();
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser)

Example 82 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

Class ContentHandlerExample, method parseOnePartToHTML.

/**
 * Shows how to pull out just one part of the document's body
 * and return it as an HTML string, leaving the rest out.
 *
 * @return the string form of the handler that received the matched part
 */
public String parseOnePartToHTML() throws IOException, SAXException, TikaException {
    // Keep only the nodes beneath html -> body -> div.
    // NOTE(review): the earlier comment said "div (class=header)", but the
    // expression below carries no class predicate - confirm the intent.
    final String divDescendants = "/xhtml:html/xhtml:body/xhtml:div/descendant::node()";
    Matcher divMatcher = new XPathParser("xhtml", XHTMLContentHandler.XHTML).parse(divDescendants);
    // Only events accepted by the matcher are forwarded to the XML serialiser.
    ContentHandler matchedXml = new MatchingContentHandler(new ToXMLContentHandler(), divMatcher);
    final AutoDetectParser autoParser = new AutoDetectParser();
    try (InputStream in = ContentHandlerExample.class.getResourceAsStream("test2.doc")) {
        autoParser.parse(in, matchedXml, new Metadata());
    }
    return matchedXml.toString();
}
Also used : ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) XPathParser(org.apache.tika.sax.xpath.XPathParser) Matcher(org.apache.tika.sax.xpath.Matcher) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Example 83 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

Class ContentHandlerExample, method parseToHTML.

/**
 * Shows how to obtain the document's contents as HTML, held in a string.
 *
 * @return the string form of the XML-producing handler after parsing
 */
public String parseToHTML() throws IOException, SAXException, TikaException {
    final AutoDetectParser autoParser = new AutoDetectParser();
    final ContentHandler xmlOutput = new ToXMLContentHandler();
    // The sample document is looked up relative to ContentHandlerExample.
    try (InputStream in = ContentHandlerExample.class.getResourceAsStream("test.doc")) {
        autoParser.parse(in, xmlOutput, new Metadata());
    }
    return xmlOutput.toString();
}
Also used : ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) MatchingContentHandler(org.apache.tika.sax.xpath.MatchingContentHandler) ToXMLContentHandler(org.apache.tika.sax.ToXMLContentHandler) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) ContentHandler(org.xml.sax.ContentHandler)

Example 84 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

Class ContentHandlerExample, method parseToPlainTextChunks.

/**
 * Example of extracting the plain text in chunks, with each chunk
 * of no more than a certain maximum size.
 * <p>
 * Each run of characters reported by the parser is appended to the current
 * chunk while the result still fits within {@code MAXIMUM_TEXT_CHUNK_SIZE};
 * a run that does not fit starts a new chunk. A single run that is itself
 * longer than the limit is split over several new chunks, so the limit holds
 * even when one {@code characters} call delivers a long run.
 * <p>
 * The list always starts with one chunk; it stays empty when the very first
 * run does not fit into it.
 *
 * @return the extracted text as a list of chunks, in the order the
 *         character runs were reported
 */
public List<String> parseToPlainTextChunks() throws IOException, SAXException, TikaException {
    final List<String> chunks = new ArrayList<>();
    chunks.add("");
    ContentHandlerDecorator handler = new ContentHandlerDecorator() {

        @Override
        public void characters(char[] ch, int start, int length) {
            int lastIndex = chunks.size() - 1;
            String lastChunk = chunks.get(lastIndex);
            if (lastChunk.length() + length <= MAXIMUM_TEXT_CHUNK_SIZE) {
                // Still fits: grow the current chunk.
                chunks.set(lastIndex, lastChunk + new String(ch, start, length));
                return;
            }
            if (MAXIMUM_TEXT_CHUNK_SIZE <= 0 || length <= MAXIMUM_TEXT_CHUNK_SIZE) {
                // The run fits into a chunk of its own. A non-positive limit
                // cannot be honoured by splitting, so the run is kept whole.
                chunks.add(new String(ch, start, length));
                return;
            }
            // The run alone exceeds the limit: cut it into pieces of at most
            // MAXIMUM_TEXT_CHUNK_SIZE characters, each becoming its own chunk.
            int end = start + length;
            int offset = start;
            while (offset < end) {
                int pieceLength = Math.min(MAXIMUM_TEXT_CHUNK_SIZE, end - offset);
                // Do not separate a high surrogate from the low surrogate after it.
                boolean cutsSurrogatePair = pieceLength > 1
                        && offset + pieceLength < end
                        && Character.isHighSurrogate(ch[offset + pieceLength - 1]);
                if (cutsSurrogatePair) {
                    pieceLength--;
                }
                chunks.add(new String(ch, offset, pieceLength));
                offset += pieceLength;
            }
        }
    };
    AutoDetectParser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    try (InputStream stream = ContentHandlerExample.class.getResourceAsStream("test2.doc")) {
        parser.parse(stream, handler, metadata);
        return chunks;
    }
}
Also used : InputStream(java.io.InputStream) ArrayList(java.util.ArrayList) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ContentHandlerDecorator(org.apache.tika.sax.ContentHandlerDecorator)

Example 85 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

Class RFC822ParserTest, method testNormalZipAttachment.

/**
 * Test TIKA-1028 - Ensure we can get the contents of an
 * un-encrypted zip file.
 * <p>
 * Parses the {@code testRFC822_normal_zip} test document with an
 * {@code RFC822Parser}, with an {@code AutoDetectParser} registered in the
 * parse context, then checks the message metadata, the message text and the
 * text coming from inside the attached zip.
 */
@Test
public void testNormalZipAttachment() throws Exception {
    Parser parser = new RFC822Parser();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    context.set(Parser.class, new AutoDetectParser());
    ContentHandler handler = new BodyContentHandler();
    // The stream is opened here, so it is closed here once parsing is done.
    try (InputStream stream = getStream("test-documents/testRFC822_normal_zip")) {
        parser.parse(stream, handler, metadata, context);
    }
    // Check we got the metadata
    assertEquals("Juha Haaga <juha.haaga@gmail.com>", metadata.get(Metadata.MESSAGE_FROM));
    assertEquals("Test mail for Tika", metadata.get(TikaCoreProperties.TITLE));
    // Render the collected text once and reuse it for every content check
    String content = handler.toString();
    // Check we got the message text, for both Plain Text and HTML
    assertContains("Includes a normal, unencrypted zip file", content);
    assertContains("This is the Plain Text part", content);
    assertContains("This is the HTML part", content);
    // We get both name and contents of the zip file's contents
    assertContains("text.txt", content);
    assertContains("TEST DATA FOR TIKA.", content);
    assertContains("This is text inside an unencrypted zip file", content);
    assertContains("TIKA-1028", content);
    assertEquals("<juha.haaga@gmail.com>", metadata.get("Message:Raw-Header:Return-Path"));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TesseractOCRParserTest(org.apache.tika.parser.ocr.TesseractOCRParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 167, Metadata (org.apache.tika.metadata.Metadata): 139, Test (org.junit.Test): 122, InputStream (java.io.InputStream): 117, Parser (org.apache.tika.parser.Parser): 112, ParseContext (org.apache.tika.parser.ParseContext): 104, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 97, ContentHandler (org.xml.sax.ContentHandler): 91, TikaTest (org.apache.tika.TikaTest): 82, TikaInputStream (org.apache.tika.io.TikaInputStream): 63, ByteArrayInputStream (java.io.ByteArrayInputStream): 34, CompositeParser (org.apache.tika.parser.CompositeParser): 28, TikaConfig (org.apache.tika.config.TikaConfig): 18, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 17, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 17, TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser): 15, TikaException (org.apache.tika.exception.TikaException): 13, EmptyParser (org.apache.tika.parser.EmptyParser): 13, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 13, DefaultHandler (org.xml.sax.helpers.DefaultHandler): 12