Use of edu.uci.ics.crawler4j.parser.HtmlContentHandler in the project crawler4j by yasserg.
From the class HtmlContentHandlerTest, method parseHtml.
/**
 * Runs the given HTML through {@code parser} and returns the content handler
 * that received the parse events.
 *
 * <p>Before parsing, an {@code AllTagMapper} is registered in
 * {@code parseContext} as the {@code HtmlMapper}. NOTE(review): {@code parser}
 * and {@code parseContext} are fields declared outside this method; presumably
 * a Tika HTML parser and its ParseContext — confirm against the class.
 *
 * @param html the HTML markup to parse
 * @return the handler instance that was passed to the parser
 * @throws Exception if the parser fails on the input
 */
private HtmlContentHandler parseHtml(String html) throws Exception {
    // Encode explicitly as UTF-8: the no-arg getBytes() uses the platform default
    // charset, which would make the bytes fed to the parser machine-dependent.
    // Fully qualified so that no new file-level import is required.
    byte[] htmlBytes = html.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    ByteArrayInputStream bais = new ByteArrayInputStream(htmlBytes);
    Metadata metadata = new Metadata();
    HtmlContentHandler contentHandler = new HtmlContentHandler();
    // Direct construction replaces the deprecated Class.newInstance(), which
    // also rethrows checked constructor exceptions without declaring them.
    parseContext.set(HtmlMapper.class, new AllTagMapper());
    parser.parse(bais, contentHandler, metadata, parseContext);
    return contentHandler;
}
Use of edu.uci.ics.crawler4j.parser.HtmlContentHandler in the project crawler4j by yasserg.
From the class HtmlContentHandlerTest, method testParaInBody.
/**
 * Checks that the text of a single paragraph inside the body is reported
 * as the handler's body text.
 */
@Test
public void testParaInBody() throws Exception {
    // The markup deliberately mirrors the original input, which has no closing body tag.
    String html = "<html><body><p>Hello there</p></html>";
    HtmlContentHandler handler = parseHtml(html);

    String bodyText = handler.getBodyText();
    assertEquals("Hello there", bodyText);
}
Use of edu.uci.ics.crawler4j.parser.HtmlContentHandler in the project crawler4j by yasserg.
From the class HtmlContentHandlerTest, method test2ParaInBody.
/**
 * Checks that the text of two consecutive paragraphs is reported as one
 * body text with the paragraph contents separated by a single space.
 */
@Test
public void test2ParaInBody() throws Exception {
    String html = "<html><body><p>Hello there</p><p>mr</p></html>";
    HtmlContentHandler handler = parseHtml(html);

    String bodyText = handler.getBodyText();
    assertEquals("Hello there mr", bodyText);
}
Use of edu.uci.ics.crawler4j.parser.HtmlContentHandler in the project crawler4j by yasserg.
From the class HtmlContentHandlerTest, method testEmpty.
/**
 * Checks that a document consisting only of an empty html element yields
 * an empty body text.
 */
@Test
public void testEmpty() throws Exception {
    String html = "<html></html>";
    HtmlContentHandler handler = parseHtml(html);

    String bodyText = handler.getBodyText();
    assertEquals("", bodyText);
}
Use of edu.uci.ics.crawler4j.parser.HtmlContentHandler in the project crawler4j by yasserg.
From the class HtmlContentHandlerTest, method testSciptInHead.
/**
 * Checks that the src attribute of a script element in the head is exposed
 * as the href of the first outgoing URL collected by the handler.
 */
@Test
public void testSciptInHead() throws Exception {
    String html = "<html><head>" + "<script src=\"/js/app.js\"></script>" + "</head></html>";
    HtmlContentHandler handler = parseHtml(html);

    // Only the first collected URL is inspected.
    ExtractedUrlAnchorPair firstUrl = handler.getOutgoingUrls().get(0);
    String href = firstUrl.getHref();
    assertEquals("/js/app.js", href);
}
Aggregations