Search in sources:

Example 86 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class HtmlParserTest method testXHTMLWithMisleading.

/**
 * Parses two XHTML documents that differ only in the charset named by their
 * {@code http-equiv} Content-Type meta tag, and checks both the content-type hint
 * and the final content type recorded in the metadata.
 */
@Test
public void testXHTMLWithMisleading() throws Exception {
    // Case 1: the meta tag names iso-8859-1.
    String plainCharsetDoc = "<?xml version=\"1.0\" ?>"
            + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
            + "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n"
            + "<head>\n"
            + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\" />\n"
            + "<title>title</title></head><body>body</body></html>";
    Metadata plainCharsetMeta = new Metadata();
    ByteArrayInputStream plainCharsetInput = new ByteArrayInputStream(plainCharsetDoc.getBytes(UTF_8));
    new AutoDetectParser().parse(plainCharsetInput, new BodyContentHandler(), plainCharsetMeta, new ParseContext());
    assertEquals("text/html; charset=iso-8859-1", plainCharsetMeta.get(TikaCoreProperties.CONTENT_TYPE_HINT));
    assertEquals("application/xhtml+xml; charset=ISO-8859-1", plainCharsetMeta.get(Metadata.CONTENT_TYPE));

    // Case 2: same document, but the meta tag names the misleading charset "iso-NUMBER_SEVEN".
    // The hint is expected to carry the raw meta value; the content type is expected to be
    // the same as in case 1.
    String misleadingCharsetDoc = "<?xml version=\"1.0\" ?>"
            + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
            + "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n"
            + "<head>\n"
            + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-NUMBER_SEVEN\" />\n"
            + "<title>title</title></head><body>body</body></html>";
    Metadata misleadingCharsetMeta = new Metadata();
    ByteArrayInputStream misleadingCharsetInput = new ByteArrayInputStream(misleadingCharsetDoc.getBytes(UTF_8));
    new AutoDetectParser().parse(misleadingCharsetInput, new BodyContentHandler(), misleadingCharsetMeta, new ParseContext());
    assertEquals("text/html; charset=iso-NUMBER_SEVEN", misleadingCharsetMeta.get(TikaCoreProperties.CONTENT_TYPE_HINT));
    assertEquals("application/xhtml+xml; charset=ISO-8859-1", misleadingCharsetMeta.get(Metadata.CONTENT_TYPE));
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 87 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class DBFParserTest method testSpecificTruncated.

/**
 * Parses {@code testDBF.dbf} after passing it through {@code truncate(..., 781)} and
 * checks that the last row ends cleanly instead of picking up data from an earlier row.
 */
@Test
public void testSpecificTruncated() throws Exception {
    XMLResult parsed = getXML(truncate("testDBF.dbf", 781), new AutoDetectParser(), new Metadata());
    // Replace every tab, CR and LF with a space so row markup can be matched on one line.
    String flattenedXml = parsed.xml.replaceAll("[\\t\\r\\n]", " ");

    // A parser that loses track of the bytes it has read could leak content from the previous row.
    assertNotContained("holt red hath in every", flattenedXml);
    assertNotContained("<td>holt</td> <td>18.0</td>", flattenedXml);

    // The final row must end with "holt" followed by two empty cells.
    assertContains("<td>holt</td> <td /> <td /></tr>", flattenedXml);
}
Also used : Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 88 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class FontParsersTest method testAdobeFontMetricParsing.

/**
 * Runs {@code /test-documents/testAFM.afm} through an {@link AutoDetectParser} and checks
 * the detected content type, the font metadata fields and the extracted comment text.
 */
@Test
public void testAdobeFontMetricParsing() throws Exception {
    // No explicit font parser is chosen: auto-detection has to pick the right one.
    Parser autoDetect = new AutoDetectParser();
    ContentHandler bodyText = new BodyContentHandler();
    Metadata fontMeta = new Metadata();
    try (TikaInputStream afm = TikaInputStream.get(FontParsersTest.class.getResource("/test-documents/testAFM.afm"))) {
        autoDetect.parse(afm, bodyText, fontMeta, new ParseContext());
    }

    // Detected type and general metadata.
    assertEquals("application/x-font-adobe-metric", fontMeta.get(Metadata.CONTENT_TYPE));
    assertEquals("TestFullName", fontMeta.get(TikaCoreProperties.TITLE));
    assertEquals("Fri Jul 15 17:50:51 2011", fontMeta.get(Metadata.CREATION_DATE));

    // Font-specific metadata keys.
    assertEquals("TestFontName", fontMeta.get(MET_FONT_NAME));
    assertEquals("TestFullName", fontMeta.get(MET_FONT_FULL_NAME));
    assertEquals("TestSymbol", fontMeta.get(MET_FONT_FAMILY_NAME));
    assertEquals("Medium", fontMeta.get(MET_FONT_WEIGHT));
    assertEquals("001.008", fontMeta.get(MET_FONT_VERSION));

    // The comments from the file must show up in the extracted body text.
    String extracted = bodyText.toString();
    assertContains("Comments", extracted);
    assertContains("This is a comment in a sample file", extracted);
    assertContains("UniqueID 12345", extracted);
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) Metadata(org.apache.tika.metadata.Metadata) ParseContext(org.apache.tika.parser.ParseContext) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) TikaInputStream(org.apache.tika.io.TikaInputStream) ContentHandler(org.xml.sax.ContentHandler) Parser(org.apache.tika.parser.Parser) AdobeFontMetricParser(org.apache.tika.parser.font.AdobeFontMetricParser) Test(org.junit.Test)

Example 89 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class HtmlParserTest method testSkippingCommentsInEncodingDetection.

/**
 * Checks that a charset declared inside an HTML comment is not used for decoding.
 *
 * <p>The document's head holds a commented-out meta tag naming ISO-8859-1, followed by a
 * live meta tag naming utf-8. The bytes are UTF-8 encoded, so the Chinese body text only
 * appears intact in the output if the live declaration is the one applied.
 */
@Test
public void testSkippingCommentsInEncodingDetection() throws Exception {
    // 10000 spaces placed between </head> and <body>.
    // NOTE(review): presumably this pushes the body past the prefix the encoding detector
    // inspects, so the result depends on the meta tags alone; confirm against the detector.
    StringBuilder padding = new StringBuilder(10000);
    for (int i = 0; i < 10000; i++) {
        padding.append(' ');
    }
    // Build the markup as a plain String; wrapping it in new String(...) would only copy it.
    String html = "<html><head>"
            + "<!--<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\"> -->\n"
            + "   <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
            + "</head>"
            + padding
            + "<body>"
            + "有什么需要我帮你的"
            + "</body></html>";
    byte[] bytes = html.getBytes(StandardCharsets.UTF_8);
    XMLResult r = getXML(new ByteArrayInputStream(bytes), new AutoDetectParser(), new Metadata());
    assertContains("有什么需要我帮你的", r.xml);
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Example 90 with AutoDetectParser

use of org.apache.tika.parser.AutoDetectParser in project tika by apache.

the class ODFParserTest method testODPMasterFooter.

/**
 * Parses {@code /test-documents/testMasterFooter.odp} with an {@link AutoDetectParser}
 * and checks that the text "Master footer is here" appears in the extracted body content.
 */
@Test
public void testODPMasterFooter() throws Exception {
    try (InputStream odp = ODFParserTest.class.getResourceAsStream("/test-documents/testMasterFooter.odp")) {
        Metadata odpMeta = new Metadata();
        ContentHandler bodyText = new BodyContentHandler();
        new AutoDetectParser().parse(odp, bodyText, odpMeta);
        // The handler's string form is the extracted text.
        assertContains("Master footer is here", bodyText.toString());
    }
}
Also used : BodyContentHandler(org.apache.tika.sax.BodyContentHandler) TikaInputStream(org.apache.tika.io.TikaInputStream) InputStream(java.io.InputStream) Metadata(org.apache.tika.metadata.Metadata) AutoDetectParser(org.apache.tika.parser.AutoDetectParser) ContentHandler(org.xml.sax.ContentHandler) Test(org.junit.Test) TikaTest(org.apache.tika.TikaTest)

Aggregations

AutoDetectParser (org.apache.tika.parser.AutoDetectParser): 164, Metadata (org.apache.tika.metadata.Metadata): 136, Test (org.junit.Test): 122, InputStream (java.io.InputStream): 117, Parser (org.apache.tika.parser.Parser): 111, ParseContext (org.apache.tika.parser.ParseContext): 103, BodyContentHandler (org.apache.tika.sax.BodyContentHandler): 96, ContentHandler (org.xml.sax.ContentHandler): 91, TikaTest (org.apache.tika.TikaTest): 82, TikaInputStream (org.apache.tika.io.TikaInputStream): 63, ByteArrayInputStream (java.io.ByteArrayInputStream): 34, CompositeParser (org.apache.tika.parser.CompositeParser): 28, TikaConfig (org.apache.tika.config.TikaConfig): 18, ExcelParserTest (org.apache.tika.parser.microsoft.ExcelParserTest): 17, WordParserTest (org.apache.tika.parser.microsoft.WordParserTest): 17, TesseractOCRParser (org.apache.tika.parser.ocr.TesseractOCRParser): 15, EmptyParser (org.apache.tika.parser.EmptyParser): 13, XHTMLContentHandler (org.apache.tika.sax.XHTMLContentHandler): 13, DefaultHandler (org.xml.sax.helpers.DefaultHandler): 12, TikaException (org.apache.tika.exception.TikaException): 11