Use of us.codecraft.webmagic.Page in the project webmagic by code4craft.
From the class ModelPageProcessorTest, method getMockPage.
/**
 * Builds a {@link Page} fixture whose raw text is loaded from the classpath
 * resource {@code html/mock-webmagic.html} and whose request and URL both
 * point at {@code http://webmagic.io/list/0}.
 *
 * @return the populated mock page
 * @throws IOException if the resource is missing from the classpath or cannot be read
 */
private Page getMockPage() throws IOException {
    final String resourceName = "html/mock-webmagic.html";
    final String pageUrl = "http://webmagic.io/list/0";

    java.io.InputStream in = getClass().getClassLoader().getResourceAsStream(resourceName);
    if (in == null) {
        // getResourceAsStream reports "not found" by returning null; fail with a
        // descriptive error instead of an opaque NullPointerException further down.
        throw new IOException("Classpath resource not found: " + resourceName);
    }

    String html;
    try {
        // Decode with an explicit charset instead of the platform default.
        // NOTE(review): assumes the fixture file is UTF-8 encoded — confirm.
        html = IOUtils.toString(in, "UTF-8");
    } finally {
        // Always release the stream, even when reading fails.
        in.close();
    }

    Page page = new Page();
    page.setRawText(html);
    page.setRequest(new Request(pageUrl));
    page.setUrl(new PlainText(pageUrl));
    return page;
}
Use of us.codecraft.webmagic.Page in the project yyl_example by Relucent.
From the class SpiderTest, method main.
/**
 * Fetches a fixed set of Baidu Baike search URLs with a two-thread spider,
 * stores the "name" and "description" fields extracted from each page, and
 * prints the collected fields of every result to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setUseGzip(true);
    Spider spider = Spider.create(new PageProcessor() {
        @Override
        public void process(Page page) {
            page.putField("name", page.getHtml().css("dl.lemmaWgt-lemmaTitle h1", "text").toString());
            page.putField("description", page.getHtml().xpath("//div[@class='lemma-summary']/allText()"));
        }

        @Override
        public Site getSite() {
            return site;
        }
    }).thread(2);

    try {
        String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
        // Search terms substituted into the URL template, in crawl order.
        String[] keywords = {"石墨烯", "气凝胶", "液态金属", "生物塑料", "形状记忆合金", "纳米纤维"};

        List<String> list = new ArrayList<String>();
        for (String keyword : keywords) {
            list.add(String.format(urlTemplate, keyword));
        }

        List<ResultItems> resultItemses = spider.<ResultItems>getAll(list);
        for (ResultItems resultItemse : resultItemses) {
            System.out.println(resultItemse.getAll());
        }
    } finally {
        // Close the spider even when the crawl or printing throws, so it is
        // not left open on the failure path.
        spider.close();
    }
}
Use of us.codecraft.webmagic.Page in the project webmagic by code4craft.
From the class HttpClientDownloaderTest, method test_set_site_header.
/**
 * Checks that a header added on the {@code Site} is sent with the download:
 * the stub server answers "ok" only to a GET whose "header" header equals
 * "header-webmagic", and the downloaded page's raw text must be "ok".
 */
@Test
public void test_set_site_header() throws Exception {
    HttpServer stubServer = httpServer(13423);
    stubServer.get(eq(header("header"), "header-webmagic")).response("ok");

    Runner.running(stubServer, new Runnable() {
        @Override
        public void run() throws Exception {
            // The header is carried by the site configuration, not the request.
            Site siteWithHeader = Site.me().addHeader("header", "header-webmagic");

            Request target = new Request();
            target.setUrl("http://127.0.0.1:13423");

            HttpClientDownloader downloader = new HttpClientDownloader();
            Page downloaded = downloader.download(target, siteWithHeader.toTask());

            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}
Use of us.codecraft.webmagic.Page in the project webmagic by code4craft.
From the class HttpClientDownloaderTest, method test_download_auth_by_SimpleProxyProvider.
/**
 * Checks that proxy credentials supplied through {@code SimpleProxyProvider}
 * reach the proxy: the stub server on port 13423 answers "ok" only to a GET
 * whose "Proxy-Authorization" header equals
 * "Basic dXNlcm5hbWU6cGFzc3dvcmQ=", and the downloaded page's raw text must
 * be "ok".
 */
@Test
public void test_download_auth_by_SimpleProxyProvider() throws Exception {
    HttpServer stubProxy = httpServer(13423);
    stubProxy.get(eq(header("Proxy-Authorization"), "Basic dXNlcm5hbWU6cGFzc3dvcmQ=")).response("ok");

    Runner.running(stubProxy, new Runnable() {
        @Override
        public void run() throws Exception {
            // Point the downloader at the local stub, with credentials attached.
            Proxy authenticatedProxy = new Proxy("127.0.0.1", 13423, "username", "password");
            HttpClientDownloader downloader = new HttpClientDownloader();
            downloader.setProxyProvider(SimpleProxyProvider.from(authenticatedProxy));

            Request target = new Request();
            target.setUrl("http://www.baidu.com");

            Page downloaded = downloader.download(target, Site.me().toTask());

            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}
Use of us.codecraft.webmagic.Page in the project webmagic by code4craft.
From the class HttpClientDownloaderTest, method test_set_request_header.
/**
 * Checks that a header added on the {@code Request} is sent with the
 * download: the stub server answers "ok" only to a GET whose "header" header
 * equals "header-webmagic", and the downloaded page's raw text must be "ok".
 */
@Test
public void test_set_request_header() throws Exception {
    HttpServer stubServer = httpServer(13423);
    stubServer.get(eq(header("header"), "header-webmagic")).response("ok");

    Runner.running(stubServer, new Runnable() {
        @Override
        public void run() throws Exception {
            // The header is carried by the request itself; the site is left at defaults.
            Request target = new Request();
            target.setUrl("http://127.0.0.1:13423");
            target.addHeader("header", "header-webmagic");

            Site defaultSite = Site.me();
            HttpClientDownloader downloader = new HttpClientDownloader();
            Page downloaded = downloader.download(target, defaultSite.toTask());

            assertThat(downloaded.getRawText()).isEqualTo("ok");
        }
    });
}
Aggregations