Usage of us.codecraft.webmagic.selector.PlainText in the webmagic project by code4craft.
Class DiaoyuwengProcessor, method process.
/**
 * Queues further pages of the diaoyuweng.com thread list (uid 88304) and, on thread pages,
 * extracts the thread's title, content, date and id.
 *
 * <p>Every processed page contributes its list-pagination links and its thread links to the
 * crawl queue. Fields are only written when the page URL itself is a thread URL.
 *
 * @param page the downloaded page to process
 */
@Override
public void process(Page page) {
    // Follow the pagination links of the thread list.
    List<String> listPageLinks = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/home\\.php\\?mod=space&uid=88304&do=thread&view=me&type=thread&order=dateline&from=space&page=\\d+)").all();
    page.addTargetRequests(listPageLinks);

    // Follow the links to individual threads; the dot before "html" is escaped so that it
    // only matches a literal '.'.
    List<String> threadLinks = page.getHtml().links().regex("(http://www\\.diaoyuweng\\.com/thread-\\d+-1-1\\.html)").all();
    page.addTargetRequests(threadLinks);

    // The list-page URL also contains the word "thread" (do=thread, type=thread), so a plain
    // substring test would treat list pages as thread pages and produce an id built from a
    // failed match. Decide by matching the thread URL pattern instead.
    // NOTE(review): relies on toString() of a non-matching selection being null - confirm
    // against the Selectable implementation in use.
    String threadId = page.getUrl().regex("http://www\\.diaoyuweng\\.com/thread-(\\d+)-1-1\\.html").toString();
    if (threadId == null) {
        return;
    }

    page.putField("title", page.getHtml().xpath("//a[@id='thread_subject']"));
    page.putField("content", page.getHtml().xpath("//div[@class='pcb']//tbody/tidyText()"));
    page.putField("date", page.getHtml().regex("发表于 (\\d{4}-\\d+-\\d+ \\d+:\\d+:\\d+)"));
    // The stored id is the numeric thread id prefixed with "1000".
    page.putField("id", new PlainText("1000" + threadId));
}
Usage of us.codecraft.webmagic.selector.PlainText in the webmagic project by code4craft.
Class ModelPageProcessorTest, method getMockPage.
/**
 * Builds a {@link Page} whose raw text is the content of the classpath resource
 * {@code html/mock-webmagic.html} and whose request and URL both point at
 * {@code http://webmagic.io/list/0}.
 *
 * @return the populated mock page
 * @throws IOException propagated from reading the resource
 */
private Page getMockPage() throws IOException {
    final String mockUrl = "http://webmagic.io/list/0";
    final String rawHtml = IOUtils.toString(getClass().getClassLoader().getResourceAsStream("html/mock-webmagic.html"));

    Page mockPage = new Page();
    mockPage.setRawText(rawHtml);
    mockPage.setRequest(new Request(mockUrl));
    mockPage.setUrl(new PlainText(mockUrl));
    return mockPage;
}
Usage of us.codecraft.webmagic.selector.PlainText in the webmagic project by code4craft.
Class MockGithubDownloader, method download.
/**
 * Returns a canned page instead of performing a real download.
 *
 * <p>The incoming request and task are not consulted: the page always wraps the
 * {@code html} field and is labelled with the fixed webmagic GitHub URL.
 *
 * @param request ignored
 * @param task ignored
 * @return a page built from the {@code html} field
 */
@Override
public Page download(Request request, Task task) {
    final String fixedUrl = "https://github.com/code4craft/webmagic";

    Page mockPage = new Page();
    mockPage.setHtml(new Html(html));
    mockPage.setRequest(new Request(fixedUrl));
    mockPage.setUrl(new PlainText(fixedUrl));
    return mockPage;
}
Usage of us.codecraft.webmagic.selector.PlainText in the webmagic project by code4craft.
Class SeleniumDownloader, method download.
/**
 * Loads the request URL in a WebDriver taken from the pool and builds a {@link Page} from the
 * resulting document's {@code outerHTML}.
 *
 * <p>The driver is always handed back to the pool, even when loading or reading the page
 * throws, so a failing download cannot permanently shrink the pool.
 *
 * @param request the request whose URL is loaded
 * @param task the task whose site supplies the cookies to add to the driver
 * @return the populated page, or {@code null} if the thread was interrupted while waiting
 *         for a driver from the pool
 */
@Override
public Page download(Request request, Task task) {
    checkInit();
    WebDriver webDriver;
    try {
        webDriver = webDriverPool.get();
    } catch (InterruptedException e) {
        // Keep the interrupt visible to callers instead of silently consuming it.
        Thread.currentThread().interrupt();
        logger.warn("interrupted", e);
        return null;
    }
    try {
        logger.info("downloading page " + request.getUrl());
        webDriver.get(request.getUrl());
        try {
            // Pause after loading; presumably gives the page time to finish rendering.
            Thread.sleep(sleepTime);
        } catch (InterruptedException e) {
            // Best effort: still read what has been loaded, but restore the interrupt flag
            // and report through the logger rather than stderr.
            Thread.currentThread().interrupt();
            logger.warn("interrupted while sleeping after page load", e);
        }
        WebDriver.Options manage = webDriver.manage();
        Site site = task.getSite();
        if (site.getCookies() != null) {
            for (Map.Entry<String, String> cookieEntry : site.getCookies().entrySet()) {
                Cookie cookie = new Cookie(cookieEntry.getKey(), cookieEntry.getValue());
                manage.addCookie(cookie);
            }
        }
        /*
         * TODO You can add mouse event or other processes
         *
         * @author: bob.li.0718@gmail.com
         */
        WebElement webElement = webDriver.findElement(By.xpath("/html"));
        String content = webElement.getAttribute("outerHTML");
        Page page = new Page();
        page.setRawText(content);
        page.setHtml(new Html(UrlUtils.fixAllRelativeHrefs(content, request.getUrl())));
        page.setUrl(new PlainText(request.getUrl()));
        page.setRequest(request);
        return page;
    } finally {
        // Return the driver on every exit path; otherwise an exception above would leak it.
        webDriverPool.returnToPool(webDriver);
    }
}
Usage of us.codecraft.webmagic.selector.PlainText in the webmagic project by code4craft.
Class ProcessorBenchmark, method test.
/**
 * Manual benchmark (ignored by default): runs the model page processor for
 * {@code OschinaBlog} 1000 times over the same in-memory page, twice in a row,
 * and prints the elapsed milliseconds of each round.
 */
@Ignore
@Test
public void test() {
    final String blogUrl = "http://my.oschina.net/flashsword/blog";
    ModelPageProcessor modelPageProcessor = ModelPageProcessor.create(Site.me().addStartUrl(blogUrl), OschinaBlog.class);

    Page page = new Page();
    page.setRequest(new Request(blogUrl));
    page.setUrl(new PlainText(blogUrl));
    page.setHtml(new Html(html));

    // Two identical timed rounds of 1000 invocations each.
    for (int round = 0; round < 2; round++) {
        long startedAt = System.currentTimeMillis();
        int remaining = 1000;
        while (remaining-- > 0) {
            modelPageProcessor.process(page);
        }
        System.out.println(System.currentTimeMillis() - startedAt);
    }
}
Aggregations