Search in sources :

Example 61 with ParseContext

Use of org.apache.tika.parser.ParseContext in the Apache Tika project.

From the class ExcelParserTest, method testJXL.

/**
 * Parses the {@code jxl.xls} test document with {@code OfficeParser}, with
 * {@code Locale.US} registered in the parse context, then checks the reported
 * content type and that the extracted text contains "Number Formats".
 */
@Test
public void testJXL() throws Exception {
    try (InputStream xls = ExcelParserTest.class.getResourceAsStream("/test-documents/jxl.xls")) {
        ParseContext parseContext = new ParseContext();
        parseContext.set(Locale.class, Locale.US);

        Metadata meta = new Metadata();
        ContentHandler bodyHandler = new BodyContentHandler(-1);
        OfficeParser officeParser = new OfficeParser();
        officeParser.parse(xls, bodyHandler, meta, parseContext);

        String reportedType = meta.get(Metadata.CONTENT_TYPE);
        assertEquals("application/vnd.ms-excel", reportedType);
        assertContains("Number Formats", bodyHandler.toString());
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 62 with ParseContext

Use of org.apache.tika.parser.ParseContext in the Apache Tika project.

From the class RFC822ParserTest, method testI18NHeaders.

/**
 * Checks that internationalized headers, both quoted-printable (Q) and
 * Base64 (B) encoded, are decoded correctly into the creator, title and
 * subject metadata.
 *
 * @throws Exception if parsing fails; propagated rather than converted into
 *         {@code fail(...)} so the test report keeps the original stack trace
 */
@Test
public void testI18NHeaders() throws Exception {
    Parser parser = new RFC822Parser();
    Metadata metadata = new Metadata();
    ContentHandler handler = mock(DefaultHandler.class);
    // try-with-resources closes the test document even if parsing throws.
    try (InputStream stream = getStream("test-documents/testRFC822_i18nheaders")) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    //tests correct decoding of internationalized headers, both
    //quoted-printable (Q) and Base64 (B).
    assertEquals("Keld Jørn Simonsen <keld@dkuug.dk>", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("If you can read this you understand the example.", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("If you can read this you understand the example.", metadata.get(Metadata.SUBJECT));
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) SAXException(org.xml.sax.SAXException) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TesseractOCRParserTest(org.apache.tika.parser.ocr.TesseractOCRParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 63 with ParseContext

Use of org.apache.tika.parser.ParseContext in the Apache Tika project.

From the class RFC822ParserTest, method getDate.

/**
 * Builds a two-header message ({@code From} plus the supplied {@code Date}
 * value), parses it with an {@code RFC822Parser}, and returns the
 * {@code TikaCoreProperties.CREATED} date read from the resulting metadata.
 *
 * @param dateString raw text placed after {@code "Date: "} in the message
 * @return the value of {@code getDate(TikaCoreProperties.CREATED)} on the parsed metadata
 * @throws Exception if parsing the message fails
 */
private Date getDate(String dateString) throws Exception {
    Metadata parsedMetadata = new Metadata();
    Parser rfc822 = new RFC822Parser();
    String rawMessage = "From: dev@tika.apache.org\n" + "Date: " + dateString + "\n";
    byte[] messageBytes = rawMessage.getBytes(StandardCharsets.UTF_8);
    try (InputStream in = TikaInputStream.get(messageBytes)) {
        rfc822.parse(in, new DefaultHandler(), parsedMetadata, new ParseContext());
    }
    return parsedMetadata.getDate(TikaCoreProperties.CREATED);
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) DefaultHandler(org.xml.sax.helpers.DefaultHandler)

Example 64 with ParseContext

Use of org.apache.tika.parser.ParseContext in the Apache Tika project.

From the class RFC822ParserTest, method testSimple.

/**
 * Parses the plain {@code testRFC822} message and verifies both the SAX
 * events sent to the (mocked) handler and the extracted creator, title and
 * subject metadata.
 *
 * @throws Exception if parsing or verification fails; propagated rather than
 *         converted into {@code fail(...)} so the original stack trace is kept
 */
@Test
public void testSimple() throws Exception {
    Parser parser = new RFC822Parser();
    Metadata metadata = new Metadata();
    ContentHandler handler = mock(DefaultHandler.class);
    ParseContext context = new ParseContext();
    context.set(Parser.class, new AutoDetectParser());
    // try-with-resources closes the test document even if parsing throws.
    try (InputStream stream = getStream("test-documents/testRFC822")) {
        parser.parse(stream, handler, metadata, context);
    }
    verify(handler).startDocument();
    //just one body
    verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
    verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
    //no multi-part body parts
    verify(handler, never()).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
    verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
    verify(handler).endDocument();
    //note no leading spaces, and no quotes
    assertEquals("Julien Nioche (JIRA) <jira@apache.org>", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", metadata.get(Metadata.SUBJECT));
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Attributes(org.xml.sax.Attributes) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) SAXException(org.xml.sax.SAXException) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TesseractOCRParserTest(org.apache.tika.parser.ocr.TesseractOCRParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 65 with ParseContext

Use of org.apache.tika.parser.ParseContext in the Apache Tika project.

From the class RFC822ParserTest, method testUnusualFromAddress.

/**
 * The from isn't in the usual form.
 * See TIKA-618
 *
 * @throws Exception if parsing the test message fails
 */
@Test
public void testUnusualFromAddress() throws Exception {
    Parser parser = new RFC822Parser();
    Metadata metadata = new Metadata();
    ContentHandler handler = mock(DefaultHandler.class);
    // try-with-resources closes the test document even if parsing throws.
    try (InputStream stream = getStream("test-documents/testRFC822_oddfrom")) {
        parser.parse(stream, handler, metadata, new ParseContext());
    }
    assertEquals("Saved by Windows Internet Explorer 7", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("Air Permit Programs | Air & Radiation | US EPA", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("Air Permit Programs | Air & Radiation | US EPA", metadata.get(Metadata.SUBJECT));
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) Parser(org.apache.tika.parser.Parser) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TesseractOCRParserTest(org.apache.tika.parser.ocr.TesseractOCRParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

ParseContext (org.apache.tika.parser.ParseContext)336 Metadata (org.apache.tika.metadata.Metadata)281 Test (org.junit.Test)260 InputStream (java.io.InputStream)195 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)195 TikaTest (org.apache.tika.TikaTest)186 ContentHandler (org.xml.sax.ContentHandler)163 AutoDetectParser (org.apache.tika.parser.AutoDetectParser)117 Parser (org.apache.tika.parser.Parser)107 ByteArrayInputStream (java.io.ByteArrayInputStream)91 TikaInputStream (org.apache.tika.io.TikaInputStream)77 DefaultHandler (org.xml.sax.helpers.DefaultHandler)52 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)31 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)31 TikaException (org.apache.tika.exception.TikaException)29 StringWriter (java.io.StringWriter)26 IOException (java.io.IOException)24 SAXException (org.xml.sax.SAXException)24 CompositeParser (org.apache.tika.parser.CompositeParser)22 FileInputStream (java.io.FileInputStream)19