Example use of us.codecraft.webmagic.Site in project yyl_example by Relucent.
Class SpiderTest, method main.
/**
 * Crawls a fixed list of baike.baidu.com search URLs with a two-thread spider and prints
 * the fields extracted from every page.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setUseGzip(true);
    Spider spider = Spider.create(new PageProcessor() {
        @Override
        public void process(Page page) {
            page.putField("name", page.getHtml().css("dl.lemmaWgt-lemmaTitle h1", "text").toString());
            page.putField("description", page.getHtml().xpath("//div[@class='lemma-summary']/allText()"));
        }

        @Override
        public Site getSite() {
            return site;
        }
    }).thread(2);

    String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
    // Search terms, in the order they are submitted to the spider.
    String[] keywords = {"石墨烯", "气凝胶", "液态金属", "生物塑料", "形状记忆合金", "纳米纤维"};
    List<String> urls = new ArrayList<String>();
    for (String keyword : keywords) {
        urls.add(String.format(urlTemplate, keyword));
    }

    try {
        List<ResultItems> results = spider.<ResultItems>getAll(urls);
        for (ResultItems result : results) {
            System.out.println(result.getAll());
        }
    } finally {
        // Always release the spider, even when crawling or printing fails.
        spider.close();
    }
}
Example use of us.codecraft.webmagic.Site in project webmagic by code4craft.
Class HttpClientDownloaderTest, method testGetHtmlCharset.
/**
 * Checks that the charset "gbk" is detected for three stubbed responses:
 * {@code /header} declares it in the Content-Type header, {@code /meta4} serves a
 * {@code <meta charset>} tag (and also sets it in the header), and {@code /meta5} declares it
 * only through a {@code <meta http-equiv>} tag.
 *
 * @throws Exception if the stub server or any request fails
 */
@Test
public void testGetHtmlCharset() throws Exception {
    HttpServer server = httpServer(13423);
    server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk"));
    server.get(by(uri("/meta4"))).response(with(text("<html>\n" + " <head>\n" + " <meta charset='gbk'/>\n" + " </head>\n" + " <body></body>\n" + "</html>")), header("Content-Type", "text/html; charset=gbk"));
    server.get(by(uri("/meta5"))).response(with(text("<html>\n" + " <head>\n" + " <meta http-equiv=\"Content-Type\" content=\"text/html; charset=gbk\" />\n" + " </head>\n" + " <body></body>\n" + "</html>")), header("Content-Type", "text/html"));
    Runner.running(server, new Runnable() {
        @Override
        public void run() throws Exception {
            // Expected value first, so a failure message reports the right roles.
            assertEquals("gbk", getCharsetByUrl("http://127.0.0.1:13423/header"));
            assertEquals("gbk", getCharsetByUrl("http://127.0.0.1:13423/meta4"));
            assertEquals("gbk", getCharsetByUrl("http://127.0.0.1:13423/meta5"));
        }

        /**
         * Fetches {@code url} and returns the charset detected from the response's
         * Content-Type value and body bytes.
         *
         * @param url the URL to request
         * @return the charset reported by {@code CharsetUtils.detectCharset}
         * @throws IOException if the request or reading the body fails
         */
        private String getCharsetByUrl(String url) throws IOException {
            Site site = Site.me();
            CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
            try {
                Request request = new Request(url);
                CloseableHttpResponse httpResponse = httpClient.execute(new HttpUriRequestConverter().convert(request, site, null).getHttpUriRequest());
                try {
                    byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
                    return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
                } finally {
                    httpResponse.close();
                }
            } finally {
                // Failures propagate to the test instead of being printed and ignored.
                httpClient.close();
            }
        }
    });
}
Example use of us.codecraft.webmagic.Site in project webmagic by code4craft.
Class HttpClientDownloaderTest, method test_selectRequestMethod.
/**
 * Checks that a request converted by {@code HttpUriRequestConverter} reaches the stub server
 * with the HTTP method set on the {@code Request}: GET, DELETE, HEAD and TRACE carry
 * {@code q=webmagic} in the query string, POST and PUT carry it as a form body.
 *
 * @throws Exception if the stub server or any request fails
 */
@Test
public void test_selectRequestMethod() throws Exception {
    final int port = 13423;
    HttpServer server = httpServer(port);
    server.get(eq(query("q"), "webmagic")).response("get");
    server.post(eq(form("q"), "webmagic")).response("post");
    server.put(eq(form("q"), "webmagic")).response("put");
    server.delete(eq(query("q"), "webmagic")).response("delete");
    server.request(and(by(method("HEAD")), eq(query("q"), "webmagic"))).response(header("method", "head"));
    server.request(and(by(method("TRACE")), eq(query("q"), "webmagic"))).response("trace");
    final HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
    final Site site = Site.me();
    Runner.running(server, new Runnable() {
        @Override
        public void run() throws Exception {
            // One client for all requests, closed in finally so no connections are leaked.
            CloseableHttpClient httpClient = HttpClients.custom().build();
            try {
                Request request = new Request();
                request.setUrl("http://127.0.0.1:" + port + "/search?q=webmagic");

                request.setMethod(HttpConstant.Method.GET);
                assertThat(fetchBody(httpClient, request)).isEqualTo("get");

                request.setMethod(HttpConstant.Method.DELETE);
                assertThat(fetchBody(httpClient, request)).isEqualTo("delete");

                // The HEAD stub answers with a header only, so the header is checked instead of a body.
                request.setMethod(HttpConstant.Method.HEAD);
                assertThat(fetchHeader(httpClient, request, "method")).isEqualTo("head");

                request.setMethod(HttpConstant.Method.TRACE);
                assertThat(fetchBody(httpClient, request)).isEqualTo("trace");

                // POST and PUT send the parameter as a form body instead of a query string.
                Map<String, Object> params = new HashedMap();
                params.put("q", "webmagic");
                request.setUrl("http://127.0.0.1:" + port + "/search");
                request.setMethod(HttpConstant.Method.POST);
                request.setRequestBody(HttpRequestBody.form(params, "utf-8"));
                assertThat(fetchBody(httpClient, request)).isEqualTo("post");

                request.setMethod(HttpConstant.Method.PUT);
                assertThat(fetchBody(httpClient, request)).isEqualTo("put");
            } finally {
                httpClient.close();
            }
        }

        /** Converts {@code request} and executes it; the caller must close the response. */
        private CloseableHttpResponse execute(CloseableHttpClient httpClient, Request request) throws Exception {
            return httpClient.execute(httpUriRequestConverter.convert(request, site, null).getHttpUriRequest());
        }

        /** Executes {@code request} and returns the response body as a string. */
        private String fetchBody(CloseableHttpClient httpClient, Request request) throws Exception {
            CloseableHttpResponse response = execute(httpClient, request);
            try {
                return EntityUtils.toString(response.getEntity());
            } finally {
                response.close();
            }
        }

        /** Executes {@code request} and returns the value of the first response header named {@code name}. */
        private String fetchHeader(CloseableHttpClient httpClient, Request request, String name) throws Exception {
            CloseableHttpResponse response = execute(httpClient, request);
            try {
                return response.getFirstHeader(name).getValue();
            } finally {
                response.close();
            }
        }
    });
}
Example use of us.codecraft.webmagic.Site in project webmagic by code4craft.
Class SeleniumDownloaderTest, method test.
/**
 * Downloads http://huaban.com/ one hundred times through a {@code SeleniumDownloader}, printing
 * the links under {@code #waterfall} that match {@code .*pins.*} on each pass, then prints the
 * total elapsed milliseconds. Ignored by default because it needs a chrome driver.
 */
@Ignore("need chrome driver")
@Test
public void test() {
    SeleniumDownloader downloader = new SeleniumDownloader(chromeDriverPath);
    long startMillis = System.currentTimeMillis();
    int round = 0;
    while (round < 100) {
        Request request = new Request("http://huaban.com/");
        // A fresh task per download, exactly as many instances as there are rounds.
        Task task = new Task() {
            @Override
            public String getUUID() {
                return "huaban.com";
            }

            @Override
            public Site getSite() {
                return Site.me();
            }
        };
        Page page = downloader.download(request, task);
        System.out.println(page.getHtml().$("#waterfall").links().regex(".*pins.*").all());
        round++;
    }
    long elapsedMillis = System.currentTimeMillis() - startMillis;
    System.out.println(elapsedMillis);
}
Example use of us.codecraft.webmagic.Site in project webmagic by code4craft.
Class SeleniumDownloaderTest, method testBaiduWenku.
/**
 * Downloads a wenku.baidu.com document page through a {@code SeleniumDownloader} (sleep time
 * set to 10000) and prints the content of every {@code div.inner} element with markup tags and
 * {@code &nbsp;} entities removed. Ignored by default.
 */
@Ignore
@Test
public void testBaiduWenku() {
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
    seleniumDownloader.setSleepTime(10000);
    Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {
        // NOTE(review): this UUID matches the huaban.com test and looks copy-pasted — confirm.
        @Override
        public String getUUID() {
            return "huaban.com";
        }

        @Override
        public Site getSite() {
            return Site.me();
        }
    });
    // Strip tags first, then the non-breaking-space entity (previously misspelled "&nsbp;",
    // which never matched anything).
    System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>", "").replace("&nbsp;", "").all());
}
Aggregations