use of us.codecraft.webmagic.selector.Html in project webmagic by code4craft.
the class ProcessorBenchmark method test.
/**
 * Rough timing harness for {@code ModelPageProcessor}: builds one in-memory page and runs
 * the processor over it 1000 times, in two consecutive rounds, printing the elapsed
 * wall-clock milliseconds after each round. Marked {@code @Ignore}, so it does not run
 * as part of the regular test suite.
 */
@Ignore
@Test
public void test() {
    final String blogUrl = "http://my.oschina.net/flashsword/blog";
    final int rounds = 2;
    final int iterationsPerRound = 1000;

    ModelPageProcessor processor = ModelPageProcessor.create(Site.me().addStartUrl(blogUrl), OschinaBlog.class);

    Page blogPage = new Page();
    blogPage.setRequest(new Request(blogUrl));
    blogPage.setUrl(new PlainText(blogUrl));
    // The markup is taken from `html`, which is defined outside this method.
    blogPage.setHtml(new Html(html));

    // Each round is timed separately and reports its own elapsed milliseconds.
    for (int round = 0; round < rounds; round++) {
        long roundStart = System.currentTimeMillis();
        int remaining = iterationsPerRound;
        while (remaining-- > 0) {
            processor.process(blogPage);
        }
        System.out.println(System.currentTimeMillis() - roundStart);
    }
}
use of us.codecraft.webmagic.selector.Html in project webmagic by code4craft.
the class AmanzonPageProcessor method process.
/**
 * Dumps the rows of the page's {@code dataGrid} table to standard output.
 *
 * <p>All {@code tr} fragments under the community-center table are collected; the first
 * one is skipped as the header row. Every remaining row is printed as-is, then wrapped in
 * a {@code <table>} element, parsed, and three values are printed in order: the text of
 * the link inside the {@code title} cell, the text of the {@code num} cell, and the text
 * of the third cell.
 *
 * @param page the page whose HTML is queried
 */
public void process(Page page) {
    List<String> rows = page.getHtml()
            .xpath("//table[@class='tgCustomerCommunityCenterColumn']//div[@class='content']//table[@class='dataGrid']//tr")
            .all();

    // Nothing to report unless there is at least one row besides the header.
    if (rows == null || rows.size() <= 1) {
        return;
    }

    // Index 0 holds the column names, so the data rows start at index 1.
    for (String row : rows.subList(1, rows.size())) {
        System.out.println(row);

        Html rowHtml = Html.create("<table>" + row + "</table>");
        System.out.println(rowHtml.xpath("//td[@class='title']//a/text()").toString());
        System.out.println(rowHtml.xpath("//td[@class='num']/text()").toString());
        System.out.println(rowHtml.xpath("//td[3]/text()").toString());
    }
}
use of us.codecraft.webmagic.selector.Html in project webmagic by code4craft.
the class HtmlTest method testNthNodesGet.
/**
 * Checks that the {@code href} of the first anchor can be read both directly from the
 * document and through the first node returned for {@code //a[1]}.
 */
@Test
public void testNthNodesGet() {
    final String expectedHref = "/xx/xx";
    Html document = new Html("<a data-tip=\"p$t$xxx\" href=\"/xx/xx\">xx</a>");

    // Direct lookup on the whole document.
    String directHref = document.xpath("//a[1]/@href").get();
    assertThat(directHref).isEqualTo(expectedHref);

    // Same lookup, but starting from the selected anchor node itself.
    Selectable firstAnchor = document.xpath("//a[1]").nodes().get(0);
    String nodeHref = firstAnchor.xpath("/a/@href").get();
    assertThat(nodeHref).isEqualTo(expectedHref);
}
use of us.codecraft.webmagic.selector.Html in project webmagic by code4craft.
the class HtmlTest method testAHrefExtract.
/**
 * Checks that {@code links()} on a document holding a single anchor includes that
 * anchor's {@code href} value.
 */
@Test
public void testAHrefExtract() {
    final String expectedLink = "/xx/xx";
    Html anchorDocument = new Html("<a data-tip=\"p$t$xxx\" href=\"/xx/xx\">xx</a>");

    assertThat(anchorDocument.links().all()).contains(expectedLink);
}
use of us.codecraft.webmagic.selector.Html in project webmagic by code4craft.
the class HtmlTest method testRegexSelector.
@Test
public void testRegexSelector() {
Html selectable = new Html("aaaaaaab");
assertThat(selectable.regex("(a+b)").replace("aa(a)", "$1bb").toString()).isEqualTo("abbabbab");
}
Aggregations