Search in sources:

Example 66 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class ZlibParserTest method testZlibParsing.

/**
 * Runs the {@code testTXT.zlib} test resource through an {@link AutoDetectParser} and checks
 * both the content type recorded in the metadata and the text captured by the handler.
 */
@Test
public void testZlibParsing() throws Exception {
    Metadata meta = new Metadata();
    ContentHandler textHandler = new BodyContentHandler();
    // No format-specific parser is picked here: detection is left to AutoDetectParser.
    Parser detectingParser = new AutoDetectParser();
    try (InputStream in = ZipParserTest.class.getResourceAsStream("/test-documents/testTXT.zlib")) {
        detectingParser.parse(in, textHandler, meta, recursingContext);
    }
    String detectedType = meta.get(Metadata.CONTENT_TYPE);
    assertEquals("application/zlib", detectedType);
    // Both snippets must appear in the extracted body text.
    String extracted = textHandler.toString();
    assertContains("Test d'indexation de Txt", extracted);
    assertContains("http://www.apache.org", extracted);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test)

Example 67 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class MboxParserTest method setUp.

/**
 * Builds the fixtures shared by the tests: a type detector, an auto-detecting parser that
 * uses it, a parse context that exposes that parser under {@code Parser.class}, and an
 * mbox parser with tracking switched on.
 */
@Before
public void setUp() throws Exception {
    // Assemble everything in locals first, then publish to the fields in one place.
    TypeDetector detector = new TypeDetector();
    AutoDetectParser delegate = new AutoDetectParser(detector);

    ParseContext ctx = new ParseContext();
    ctx.set(Parser.class, delegate);

    MboxParser trackingMbox = new MboxParser();
    trackingMbox.setTracking(true);

    typeDetector = detector;
    autoDetectParser = delegate;
    recursingContext = ctx;
    mboxParser = trackingMbox;
}
Also used : ParseContext(org.apache.tika.parser.ParseContext) TypeDetector(org.apache.tika.detect.TypeDetector) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Before(org.junit.Before)

Example 68 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class OutlookPSTParserTest method testParse.

/**
 * Parses the {@code testPST.pst} test resource with an {@link AutoDetectParser}, rendering
 * to HTML, and checks the rendered markup as well as the metadata captured for the embedded
 * messages by the tracking extractor.
 *
 * @throws Exception if opening or parsing the resource fails
 */
@Test
public void testParse() throws Exception {
    Parser pstParser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    ContentHandler handler = new ToHTMLContentHandler();
    ParseContext context = new ParseContext();
    // The tracking extractor is registered in the context so embedded documents go through it.
    EmbeddedTrackingExtrator trackingExtrator = new EmbeddedTrackingExtrator(context);
    context.set(EmbeddedDocumentExtractor.class, trackingExtrator);
    context.set(Parser.class, new AutoDetectParser());
    // try-with-resources guarantees the resource stream is released even if parsing throws;
    // the previous inline call never closed it.
    try (java.io.InputStream stream = getResourceAsStream("/test-documents/testPST.pst")) {
        pstParser.parse(stream, handler, metadata, context);
    }
    String output = handler.toString();
    assertFalse(output.isEmpty());
    // Top-level metadata rendered as <meta> tags.
    assertTrue(output.contains("<meta name=\"Content-Length\" content=\"271360\">"));
    assertTrue(output.contains("<meta name=\"Content-Type\" content=\"application/vnd.ms-outlook-pst\">"));
    // Folder and embedded-message markup.
    assertTrue(output.contains("<body><div class=\"email-folder\"><h1>"));
    assertTrue(output.contains("<div class=\"embedded\" id=\"&lt;530D9CAC.5080901@gmail.com&gt;\"><h1>Re: Feature Generators</h1>"));
    assertTrue(output.contains("<div class=\"embedded\" id=\"&lt;1393363252.28814.YahooMailNeo@web140906.mail.bf1.yahoo.com&gt;\"><h1>Re: init tokenizer fails: \"Bad type in putfield/putstatic\"</h1>"));
    assertTrue(output.contains("Gary Murphy commented on TIKA-1250:"));
    assertTrue(output.contains("<div class=\"email-folder\"><h1>Racine (pour la recherche)</h1>"));
    // Metadata recorded by the tracking extractor, one entry per embedded document it saw.
    List<Metadata> metaList = trackingExtrator.trackingMetadata;
    assertEquals(6, metaList.size());
    Metadata firstMail = metaList.get(0);
    assertEquals("Jörn Kottmann", firstMail.get(TikaCoreProperties.CREATOR));
    assertEquals("Re: Feature Generators", firstMail.get(TikaCoreProperties.TITLE));
    assertEquals("kottmann@gmail.com", firstMail.get("senderEmailAddress"));
    assertEquals("users@opennlp.apache.org", firstMail.get("displayTo"));
    assertEquals("", firstMail.get("displayCC"));
    assertEquals("", firstMail.get("displayBCC"));
}
Also used : ToHTMLContentHandler(org.apache.tika.sax.ToHTMLContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 69 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class RFC822ParserTest method testSimple.

/**
 * Parses the {@code testRFC822} test resource with {@code RFC822Parser} against a mocked
 * handler, then verifies the SAX events that were emitted and the creator/title/subject
 * values stored in the metadata.
 *
 * @throws Exception if the parser fails; the exception is propagated unchanged so the test
 *         report keeps its full stack trace and cause
 */
@Test
public void testSimple() throws Exception {
    Parser parser = new RFC822Parser();
    Metadata metadata = new Metadata();
    ContentHandler handler = mock(DefaultHandler.class);
    ParseContext context = new ParseContext();
    context.set(Parser.class, new AutoDetectParser());
    // The stream is closed whether or not parsing succeeds. No catch-and-fail wrapper:
    // converting the exception to fail(e.getMessage()) would discard the stack trace.
    try (InputStream stream = getStream("test-documents/testRFC822")) {
        parser.parse(stream, handler, metadata, context);
    }
    verify(handler).startDocument();
    //just one body
    verify(handler).startElement(eq(XHTMLContentHandler.XHTML), eq("p"), eq("p"), any(Attributes.class));
    verify(handler).endElement(XHTMLContentHandler.XHTML, "p", "p");
    //no multi-part body parts
    verify(handler, never()).startElement(eq(XHTMLContentHandler.XHTML), eq("div"), eq("div"), any(Attributes.class));
    verify(handler, never()).endElement(XHTMLContentHandler.XHTML, "div", "div");
    verify(handler).endDocument();
    //note no leading spaces, and no quotes
    assertEquals("Julien Nioche (JIRA) <jira@apache.org>", metadata.get(TikaCoreProperties.CREATOR));
    assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", metadata.get(TikaCoreProperties.TITLE));
    assertEquals("[jira] Commented: (TIKA-461) RFC822 messages not parsed", metadata.get(Metadata.SUBJECT));
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) Attributes(org.xml.sax.Attributes) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ContentHandler(org.xml.sax.ContentHandler) XHTMLContentHandler(org.apache.tika.sax.XHTMLContentHandler) SAXException(org.xml.sax.SAXException) TikaException(org.apache.tika.exception.TikaException) IOException(java.io.IOException) Parser(org.apache.tika.parser.Parser) TesseractOCRParserTest(org.apache.tika.parser.ocr.TesseractOCRParserTest) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 70 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class SQLite3ParserTest method testBasic.

/**
 * Runs the same SQLite test file through {@link AutoDetectParser} from three kinds of
 * input (a plain resource stream, an in-memory byte array copy, and a file-backed
 * {@code TikaInputStream}) and applies identical content and metadata assertions to each.
 *
 * @throws Exception if reading or parsing any of the inputs fails
 */
@Test
public void testBasic() throws Exception {
    Parser p = new AutoDetectParser();
    //test different types of input streams
    //actual inputstream, memory buffered bytearray and literal file
    InputStream[] streams = new InputStream[3];
    streams[0] = getResourceAsStream(TEST_FILE1);
    ByteArrayOutputStream bos = new ByteArrayOutputStream();
    // The stream used only as the copy source is closed right after buffering;
    // previously it was opened inline and never released.
    try (InputStream copySource = getResourceAsStream(TEST_FILE1)) {
        IOUtils.copy(copySource, bos);
    }
    streams[1] = new ByteArrayInputStream(bos.toByteArray());
    streams[2] = TikaInputStream.get(getResourceAsFile(TEST_FILE1));
    int tests = 0;
    for (InputStream stream : streams) {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, TEST_FILE_NAME);
        //1) getXML closes the stream
        //2) getXML runs recursively on the contents, so the embedded docs should show up
        XMLResult result;
        try {
            result = getXML(stream, p, metadata);
        } finally {
            // Explicit close kept from the original, now also reached when getXML throws.
            stream.close();
        }
        String x = result.xml;
        //first table name
        assertContains("<table name=\"my_table1\"><thead><tr>\t<th>PK</th>", x);
        //non-ascii
        assertContains("<td>普林斯顿大学</td>", x);
        //boolean
        assertContains("<td>true</td>\t<td>2015-01-02</td>", x);
        //date test
        assertContains("2015-01-04", x);
        //timestamp test
        assertContains("2015-01-03 15:17:03", x);
        //first embedded doc's image tag
        assertContains("alt=\"image1.png\"", x);
        //second embedded doc's image tag
        assertContains("alt=\"A description...\"", x);
        //second table name
        assertContains("<table name=\"my_table2\"><thead><tr>\t<th>INT_COL2</th>", x);
        // Table names reported in the metadata returned alongside the XML.
        Metadata post = result.metadata;
        String[] tableNames = post.getValues(Database.TABLE_NAME);
        assertEquals(2, tableNames.length);
        assertEquals("my_table1", tableNames[0]);
        assertEquals("my_table2", tableNames[1]);
        tests++;
    }
    // Guards against the loop silently covering fewer inputs than intended.
    assertEquals(3, tests);
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Parser(org.apache.tika.parser.Parser) EmptyParser(org.apache.tika.parser.EmptyParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

AutoDetectParser (org.apache.tika.parser.AutoDetectParser)167 Metadata (org.apache.tika.metadata.Metadata)139 Test (org.junit.Test)122 InputStream (java.io.InputStream)117 Parser (org.apache.tika.parser.Parser)112 ParseContext (org.apache.tika.parser.ParseContext)104 BodyContentHandler (org.apache.tika.sax.BodyContentHandler)97 ContentHandler (org.xml.sax.ContentHandler)91 TikaTest (org.apache.tika.TikaTest)82 TikaInputStream (org.apache.tika.io.TikaInputStream)63 ByteArrayInputStream (java.io.ByteArrayInputStream)34 CompositeParser (org.apache.tika.parser.CompositeParser)28 TikaConfig (org.apache.tika.config.TikaConfig)18 ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest)17 WordParserTest (org.apache.tika.parser.microsoft.WordParserTest)17 TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser)15 TikaException (org.apache.tika.exception.TikaException)13 EmptyParser (org.apache.tika.parser.EmptyParser)13 XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler)13 DefaultHandler (org.xml.sax.helpers.DefaultHandler)12