use of org.apache.tika.metadata.Metadata in project tika by apache.
the class HtmlParserTest method XtestParseUTF8.
/**
 * Runs the {@code testXHTML_utf8.html} test resource through the
 * {@link Tika} facade and verifies that each expected phrase, including
 * the ones with non-ASCII characters, shows up in the extracted text.
 * Currently disabled via {@code @Ignore} because the resource is missing.
 */
@Test
@Ignore("The file 'testXHTML_utf8.html' is not available for testing")
public void XtestParseUTF8() throws IOException, SAXException, TikaException {
    // NOTE(review): "Tilte" presumably mirrors the spelling in the fixture file;
    // confirm before "fixing" it.
    final String[] expectedPhrases = {
        "Title : Tilte with UTF-8 chars öäå",
        "Content with UTF-8 chars",
        "åäö"
    };

    final Metadata meta = new Metadata();
    final String extracted = new Tika().parseToString(
            HtmlParserTest.class.getResourceAsStream("/test-documents/testXHTML_utf8.html"),
            meta);

    // Checked in order; the first missing phrase fails the test.
    for (String phrase : expectedPhrases) {
        assertTrue("Did not contain expected text:" + phrase, extracted.contains(phrase));
    }
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class HtmlParserTest method testParseEmpty.
/**
 * Feeds a zero-length byte stream to {@link HtmlParser} and checks that
 * the body content handler ends up with an empty string.
 */
@Test
public void testParseEmpty() throws Exception {
    ContentHandler body = new BodyContentHandler();
    HtmlParser parser = new HtmlParser();
    ByteArrayInputStream emptyInput = new ByteArrayInputStream(new byte[0]);

    parser.parse(emptyInput, body, new Metadata(), new ParseContext());

    assertEquals("", body.toString());
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class HtmlParserTest method assertRelativeLink.
/**
 * Builds a minimal HTML document with a {@code <base href>} of {@code base}
 * and a single anchor whose {@code href} is {@code relative}, parses it with
 * {@link HtmlParser}, and asserts that exactly one anchor href was reported
 * to the handler and that it equals {@code url}.
 *
 * @param url      the href value expected to be reported for the anchor
 * @param base     value placed in the document's {@code <base href>}
 * @param relative value placed in the anchor's {@code href}
 * @throws Exception if parsing fails
 */
private void assertRelativeLink(String url, String base, String relative) throws Exception {
    StringBuilder html = new StringBuilder();
    html.append("<html><head><base href=\"").append(base).append("\"></head>");
    html.append("<body><a href=\"").append(relative).append("\">test</a></body></html>");

    final List<String> hrefs = new ArrayList<String>();

    // Collects the href of every <a> start tag that carries one.
    DefaultHandler anchorCollector = new DefaultHandler() {
        @Override
        public void startElement(String uri, String localName, String name, Attributes atts) {
            if (!name.equals("a")) {
                return;
            }
            String href = atts.getValue("", "href");
            if (href != null) {
                hrefs.add(href);
            }
        }
    };

    byte[] documentBytes = html.toString().getBytes(UTF_8);
    new HtmlParser().parse(
            new ByteArrayInputStream(documentBytes), anchorCollector, new Metadata(), new ParseContext());

    assertEquals(1, hrefs.size());
    assertEquals(url, hrefs.get(0));
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class HtmlParserTest method testBoilerplateWithMarkup.
/**
 * Test case for TIKA-564. Support returning markup from BoilerpipeContentHandler.
 * <p>
 * Parses {@code /test-documents/boilerplate.html} through a
 * {@link BoilerpipeContentHandler} with markup inclusion switched on, with
 * output serialized into a {@link StringWriter}, and then checks the
 * serialized markup: the expected table/anchor/paragraph fragments are
 * present, the document ends with the closing tags, and the words
 * "boilerplate" and "footer" do not appear.
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-564">TIKA-564</a>
 */
@Test
public void testBoilerplateWithMarkup() throws Exception {
    String path = "/test-documents/boilerplate.html";
    Metadata metadata = new Metadata();
    StringWriter sw = new StringWriter();
    ContentHandler ch = makeHtmlTransformer(sw);
    BoilerpipeContentHandler bpch = new BoilerpipeContentHandler(ch);
    bpch.setIncludeMarkup(true);

    // Parser.parse() consumes but does not close the stream, so the caller
    // must close it. The type is fully qualified to avoid relying on an
    // import that may not be present in this file.
    try (java.io.InputStream stream = HtmlParserTest.class.getResourceAsStream(path)) {
        new HtmlParser().parse(stream, bpch, metadata, new ParseContext());
    }

    String content = sw.toString();
    assertTrue("Has empty table elements", content.contains("<body><table><tr><td><table><tr><td>"));
    assertTrue("Has empty a element", content.contains("<a shape=\"rect\" href=\"Main.php\"/>"));
    assertTrue("Has real content", content.contains("<p>This is the real meat"));
    assertTrue("Ends with appropriate HTML", content.endsWith("</p></body></html>"));
    assertFalse(content.contains("boilerplate"));
    assertFalse(content.contains("footer"));
}
use of org.apache.tika.metadata.Metadata in project tika by apache.
the class HtmlParserTest method testUsingCharsetInContentTypeHeader.
/**
 * Test case for TIKA-341.
 * <p>
 * Parses the same HTML snippet twice and checks the
 * {@code Metadata.CONTENT_ENCODING} value reported after each parse:
 * <ol>
 *   <li>UTF-8 encoded bytes with empty metadata must report "UTF-8";</li>
 *   <li>ISO-8859-1 encoded bytes, with the metadata's content type preset
 *       to {@code text/html; charset=ISO-8859-1}, must report "ISO-8859-1".</li>
 * </ol>
 *
 * @see <a href="https://issues.apache.org/jira/browse/TIKA-341">TIKA-341</a>
 */
@Test
public void testUsingCharsetInContentTypeHeader() throws Exception {
    final String html = "<html><head><title>the name is ándre</title></head>" + "<body></body></html>";

    // Pass 1: no charset hint supplied in the metadata.
    Metadata unhinted = new Metadata();
    ByteArrayInputStream utf8Input = new ByteArrayInputStream(html.getBytes(UTF_8));
    new HtmlParser().parse(utf8Input, new BodyContentHandler(), unhinted, new ParseContext());
    assertEquals("UTF-8", unhinted.get(Metadata.CONTENT_ENCODING));

    // Pass 2: charset declared up front through the Content-Type metadata entry.
    Metadata hinted = new Metadata();
    hinted.set(Metadata.CONTENT_TYPE, "text/html; charset=ISO-8859-1");
    ByteArrayInputStream latin1Input = new ByteArrayInputStream(html.getBytes(ISO_8859_1));
    new HtmlParser().parse(latin1Input, new BodyContentHandler(), hinted, new ParseContext());
    assertEquals("ISO-8859-1", hinted.get(Metadata.CONTENT_ENCODING));
}
Aggregations