Use of org.ccil.cowan.tagsoup.HTMLSchema in the Apache Tika project.
From the class HtmlParserTest, method testCustomHtmlSchema.
/**
 * Checks that a custom TagSoup {@link Schema} registered in the
 * {@link ParseContext} changes the anchor text reported by
 * {@link LinkContentHandler} (TIKA-1193).
 *
 * <p>The same markup, a table nested inside an anchor, is parsed twice:
 * first with an empty parse context, where the link text is expected to be
 * empty, and then with a schema whose {@code a} element type has been
 * redefined, where the table cell text is expected to appear as link text.
 *
 * @throws Exception if either parse fails
 */
@Test
public void testCustomHtmlSchema() throws Exception {
    // The default schema does not allow a table inside an anchor.
    String html = "<html><body><a><table><tr><td>text</tr></tr></table></a></body></html>";
    byte[] htmlBytes = html.getBytes(ISO_8859_1);

    // The same Metadata instance is deliberately shared by both parses.
    Metadata metadata = new Metadata();

    // Pass 1: empty parse context, so no custom schema is supplied.
    ParseContext defaultContext = new ParseContext();
    LinkContentHandler defaultHandler = new LinkContentHandler();
    HtmlParser defaultParser = new HtmlParser();
    defaultParser.parse(new ByteArrayInputStream(htmlBytes), defaultHandler, metadata, defaultContext);
    // The anchor is reported, but without any text.
    assertEquals("", defaultHandler.getLinks().get(0).getText());

    // Pass 2: redefine the "a" element type so that tables may sit inside anchors.
    // NOTE(review): 65535 is presumably an "all bits set" mask for the third
    // argument of elementType - confirm against the TagSoup Schema API.
    Schema permissiveSchema = new HTMLSchema();
    permissiveSchema.elementType("a", HTMLSchema.M_ANY, 65535, 0);

    ParseContext customContext = new ParseContext();
    customContext.set(Schema.class, permissiveSchema);

    LinkContentHandler customHandler = new LinkContentHandler();
    HtmlParser customParser = new HtmlParser();
    customParser.parse(new ByteArrayInputStream(htmlBytes), customHandler, metadata, customContext);
    // The table cell content now shows up as the anchor text.
    assertEquals("\ttext\n\n", customHandler.getLinks().get(0).getText());
}
Aggregations