use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class HtmlParserTest method testXHTMLWithMisleading.
/**
 * Parses two XHTML documents that differ only in the charset named by their
 * http-equiv Content-Type meta tag, and checks both the recorded content-type
 * hint and the content type reported in the metadata.
 */
@Test
public void testXHTMLWithMisleading() throws Exception {
    // Everything before and after the meta tag is shared by both documents.
    final String prolog = "<?xml version=\"1.0\" ?>"
            + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
            + "<html xmlns=\"http://www.w3.org/1999/xhtml\">\n"
            + "<head>\n";
    final String epilog = "<title>title</title></head><body>body</body></html>";

    // First: an acceptable XHTML header whose http-equiv tag names iso-8859-1.
    String acceptableDoc = prolog
            + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\" />\n"
            + epilog;
    Metadata acceptableMeta = new Metadata();
    new AutoDetectParser().parse(
            new ByteArrayInputStream(acceptableDoc.getBytes(UTF_8)),
            new BodyContentHandler(),
            acceptableMeta,
            new ParseContext());
    assertEquals("text/html; charset=iso-8859-1",
            acceptableMeta.get(TikaCoreProperties.CONTENT_TYPE_HINT));
    assertEquals("application/xhtml+xml; charset=ISO-8859-1",
            acceptableMeta.get(Metadata.CONTENT_TYPE));

    // Second: the same document, but the meta tag names the charset "iso-NUMBER_SEVEN".
    // The hint is expected verbatim, while the reported content type is still
    // expected to carry ISO-8859-1.
    String misleadingDoc = prolog
            + "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-NUMBER_SEVEN\" />\n"
            + epilog;
    Metadata misleadingMeta = new Metadata();
    new AutoDetectParser().parse(
            new ByteArrayInputStream(misleadingDoc.getBytes(UTF_8)),
            new BodyContentHandler(),
            misleadingMeta,
            new ParseContext());
    assertEquals("text/html; charset=iso-NUMBER_SEVEN",
            misleadingMeta.get(TikaCoreProperties.CONTENT_TYPE_HINT));
    assertEquals("application/xhtml+xml; charset=ISO-8859-1",
            misleadingMeta.get(Metadata.CONTENT_TYPE));
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class DBFParserTest method testSpecificTruncated.
/**
 * Parses the result of {@code truncate("testDBF.dbf", 781)} with an
 * {@link AutoDetectParser} and checks how the last row is rendered in the XML output.
 */
@Test
public void testSpecificTruncated() throws Exception {
    // Collapse tabs and line breaks to single spaces so the cell sequences
    // below can be matched on one line.
    String flattened = getXML(truncate("testDBF.dbf", 781), new AutoDetectParser(), new Metadata())
            .xml
            .replaceAll("[\\t\\r\\n]", " ");

    // If the parser does not keep track of bytes read, content from the
    // previous row could leak into the truncated one.
    assertNotContained("holt red hath in every", flattened);
    assertNotContained("<td>holt</td> <td>18.0</td>", flattened);

    // The last row must end with "holt" followed by empty, well-formed cells.
    assertContains("<td>holt</td> <td /> <td /></tr>", flattened);
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class FontParsersTest method testAdobeFontMetricParsing.
/**
 * Parses /test-documents/testAFM.afm through an {@link AutoDetectParser} and
 * checks the detected content type, the font metadata fields, and that the
 * file's comments appear in the extracted text.
 */
@Test
public void testAdobeFontMetricParsing() throws Exception {
    Metadata meta = new Metadata();
    ParseContext parseContext = new ParseContext();
    ContentHandler bodyHandler = new BodyContentHandler();
    // No explicit font parser is chosen: the type should be auto-detected.
    Parser autoDetect = new AutoDetectParser();

    try (TikaInputStream afm =
                 TikaInputStream.get(FontParsersTest.class.getResource("/test-documents/testAFM.afm"))) {
        autoDetect.parse(afm, bodyHandler, meta, parseContext);
    }

    // Detected type and general document properties.
    assertEquals("application/x-font-adobe-metric", meta.get(Metadata.CONTENT_TYPE));
    assertEquals("TestFullName", meta.get(TikaCoreProperties.TITLE));
    assertEquals("Fri Jul 15 17:50:51 2011", meta.get(Metadata.CREATION_DATE));

    // Font-specific metadata keys.
    assertEquals("TestFontName", meta.get(MET_FONT_NAME));
    assertEquals("TestFullName", meta.get(MET_FONT_FULL_NAME));
    assertEquals("TestSymbol", meta.get(MET_FONT_FAMILY_NAME));
    assertEquals("Medium", meta.get(MET_FONT_WEIGHT));
    assertEquals("001.008", meta.get(MET_FONT_VERSION));

    // The comments from the file must have been extracted into the text.
    String extractedText = bodyHandler.toString();
    assertContains("Comments", extractedText);
    assertContains("This is a comment in a sample file", extractedText);
    assertContains("UniqueID 12345", extractedText);
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class HtmlParserTest method testSkippingCommentsInEncodingDetection.
/**
 * Builds an HTML page whose head contains a commented-out meta tag declaring
 * ISO-8859-1 followed by a live meta tag declaring utf-8, encodes the page as
 * UTF-8, and checks that the Chinese body text appears intact in the parsed XML.
 */
@Test
public void testSkippingCommentsInEncodingDetection() throws Exception {
    final String bodyText = "有什么需要我帮你的";

    // Block of spaces inserted between </head> and <body>.
    // NOTE(review): presumably this pushes the body well past the region the
    // encoding detector inspects -- confirm against the detector's read limit.
    final int paddingLength = 10000;
    StringBuilder padding = new StringBuilder(paddingLength);
    for (int i = 0; i < paddingLength; i++) {
        padding.append(" ");
    }

    // Plain concatenation already yields a String; wrapping it in new String(...)
    // would only create a redundant copy.
    String html = "<html><head>"
            + "<!--<meta http-equiv=\"Content-Type\" content=\"text/html; charset=ISO-8859-1\"> -->\n"
            + " <meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"
            + "</head>"
            + padding
            + "<body>"
            + bodyText
            + "</body></html>";
    byte[] bytes = html.getBytes(StandardCharsets.UTF_8);

    XMLResult r = getXML(new ByteArrayInputStream(bytes), new AutoDetectParser(), new Metadata());
    assertContains(bodyText, r.xml);
}
use of org.apache.tika.parser.AutoDetectParser in project tika by apache.
the class ODFParserTest method testODPMasterFooter.
/**
 * Parses /test-documents/testMasterFooter.odp with an {@link AutoDetectParser}
 * and checks that the text "Master footer is here" is part of the extracted content.
 */
@Test
public void testODPMasterFooter() throws Exception {
    ContentHandler bodyHandler = new BodyContentHandler();
    Metadata meta = new Metadata();

    try (InputStream odp =
                 ODFParserTest.class.getResourceAsStream("/test-documents/testMasterFooter.odp")) {
        new AutoDetectParser().parse(odp, bodyHandler, meta);
    }

    // The handler keeps the collected text, so it can be checked once parsing is done.
    String extractedText = bodyHandler.toString();
    assertContains("Master footer is here", extractedText);
}
Aggregations