Search in sources :

Example 1 with Site

use of us.codecraft.webmagic.Site in project yyl_example by Relucent.

the class SpiderTest method main.

/**
 * Crawls a fixed set of Baidu Baike search URLs and prints the fields extracted from each page.
 *
 * <p>For every page the processor stores a "name" field (text selected by the CSS query
 * {@code dl.lemmaWgt-lemmaTitle h1}) and a "description" field (all text under the
 * {@code div} whose class is {@code lemma-summary}).
 *
 * @param args command-line arguments; not used
 */
public static void main(String[] args) {
    final Site site = Site.me().setRetryTimes(3).setSleepTime(1000).setUseGzip(true);
    Spider spider = Spider.create(new PageProcessor() {

        @Override
        public void process(Page page) {
            page.putField("name", page.getHtml().css("dl.lemmaWgt-lemmaTitle h1", "text").toString());
            page.putField("description", page.getHtml().xpath("//div[@class='lemma-summary']/allText()"));
        }

        @Override
        public Site getSite() {
            return site;
        }
    }).thread(2);
    String urlTemplate = "http://baike.baidu.com/search/word?word=%s&pic=1&sug=1&enc=utf8";
    String[] keywords = { "石墨烯", "气凝胶", "液态金属", "生物塑料", "形状记忆合金", "纳米纤维" };
    List<String> urls = new ArrayList<String>(keywords.length);
    for (String keyword : keywords) {
        urls.add(String.format(urlTemplate, keyword));
    }
    try {
        List<ResultItems> results = spider.<ResultItems>getAll(urls);
        for (ResultItems result : results) {
            System.out.println(result.getAll());
        }
    } finally {
        // Close the spider even when the crawl or the printing fails.
        spider.close();
    }
}
Also used : Site(us.codecraft.webmagic.Site) PageProcessor(us.codecraft.webmagic.processor.PageProcessor) BaiduBaikePageProcessor(us.codecraft.webmagic.processor.example.BaiduBaikePageProcessor) ResultItems(us.codecraft.webmagic.ResultItems) Spider(us.codecraft.webmagic.Spider) ArrayList(java.util.ArrayList) Page(us.codecraft.webmagic.Page)

Example 2 with Site

use of us.codecraft.webmagic.Site in project webmagic by code4craft.

the class HttpClientDownloaderTest method testGetHtmlCharset.

/**
 * Checks that charset detection yields "gbk" for three stub endpoints:
 * "/header" (charset only in the Content-Type header), "/meta4" (a {@code <meta charset>}
 * tag plus the header) and "/meta5" (a {@code <meta http-equiv>} tag with a Content-Type
 * header that carries no charset).
 */
@Test
public void testGetHtmlCharset() throws Exception {
    HttpServer server = httpServer(13423);
    server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk"));
    server.get(by(uri("/meta4"))).response(with(text("<html>\n" + "  <head>\n" + "    <meta charset='gbk'/>\n" + "  </head>\n" + "  <body></body>\n" + "</html>")), header("Content-Type", "text/html; charset=gbk"));
    server.get(by(uri("/meta5"))).response(with(text("<html>\n" + "  <head>\n" + "    <meta http-equiv=\"Content-Type\" content=\"text/html; charset=gbk\" />\n" + "  </head>\n" + "  <body></body>\n" + "</html>")), header("Content-Type", "text/html"));
    Runner.running(server, new Runnable() {

        @Override
        public void run() throws Exception {
            // assertEquals takes (expected, actual); keeping that order makes failure messages read correctly.
            assertEquals("gbk", getCharsetByUrl("http://127.0.0.1:13423/header"));
            assertEquals("gbk", getCharsetByUrl("http://127.0.0.1:13423/meta4"));
            assertEquals("gbk", getCharsetByUrl("http://127.0.0.1:13423/meta5"));
        }

        /**
         * Fetches {@code url} and returns the charset detected from the response's
         * Content-Type header value and body bytes.
         *
         * @param url the URL to request
         * @return the charset reported by {@code CharsetUtils.detectCharset}
         * @throws IOException if the request or reading the body fails; propagated so the
         *         test fails with the real cause instead of a later NullPointerException
         */
        private String getCharsetByUrl(String url) throws IOException {
            Site site = Site.me();
            CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site);
            Request request = new Request(url);
            CloseableHttpResponse httpResponse = httpClient.execute(new HttpUriRequestConverter().convert(request, site, null).getHttpUriRequest());
            try {
                byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
                return CharsetUtils.detectCharset(httpResponse.getEntity().getContentType().getValue(), contentBytes);
            } finally {
                // Release the response whether or not detection succeeded.
                httpResponse.close();
            }
        }
    });
}
Also used : Site(us.codecraft.webmagic.Site) CloseableHttpClient(org.apache.http.impl.client.CloseableHttpClient) Runnable(com.github.dreamhead.moco.Runnable) HttpServer(com.github.dreamhead.moco.HttpServer) HttpUriRequest(org.apache.http.client.methods.HttpUriRequest) Request(us.codecraft.webmagic.Request) CloseableHttpResponse(org.apache.http.client.methods.CloseableHttpResponse) IOException(java.io.IOException) Test(org.junit.Test)

Example 3 with Site

use of us.codecraft.webmagic.Site in project webmagic by code4craft.

the class HttpClientDownloaderTest method test_selectRequestMethod.

/**
 * Verifies that {@code HttpUriRequestConverter} produces a request with the HTTP method set
 * on the {@code Request}: the stub server answers each of GET, DELETE, HEAD, TRACE, POST and
 * PUT with a distinct marker, and the test checks that the matching marker comes back.
 */
@Test
public void test_selectRequestMethod() throws Exception {
    final int port = 13423;
    HttpServer server = httpServer(port);
    server.get(eq(query("q"), "webmagic")).response("get");
    server.post(eq(form("q"), "webmagic")).response("post");
    server.put(eq(form("q"), "webmagic")).response("put");
    server.delete(eq(query("q"), "webmagic")).response("delete");
    server.request(and(by(method("HEAD")), eq(query("q"), "webmagic"))).response(header("method", "head"));
    server.request(and(by(method("TRACE")), eq(query("q"), "webmagic"))).response("trace");
    final HttpUriRequestConverter httpUriRequestConverter = new HttpUriRequestConverter();
    final Site site = Site.me();
    Runner.running(server, new Runnable() {

        @Override
        public void run() throws Exception {
            Request request = new Request();
            request.setUrl("http://127.0.0.1:" + port + "/search?q=webmagic");
            Map<String, Object> params = new HashedMap();
            params.put("q", "webmagic");
            // One client for all six requests, closed in finally, instead of building and
            // leaking a fresh client per request.
            CloseableHttpClient httpClient = HttpClients.custom().build();
            try {
                // Methods that carry the parameter in the query string.
                request.setMethod(HttpConstant.Method.GET);
                HttpUriRequest httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
                assertThat(EntityUtils.toString(httpClient.execute(httpUriRequest).getEntity())).isEqualTo("get");
                request.setMethod(HttpConstant.Method.DELETE);
                httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
                assertThat(EntityUtils.toString(httpClient.execute(httpUriRequest).getEntity())).isEqualTo("delete");
                // The HEAD stub is configured with a response header only, so the marker is read from that header.
                request.setMethod(HttpConstant.Method.HEAD);
                httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
                assertThat(httpClient.execute(httpUriRequest).getFirstHeader("method").getValue()).isEqualTo("head");
                request.setMethod(HttpConstant.Method.TRACE);
                httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
                assertThat(EntityUtils.toString(httpClient.execute(httpUriRequest).getEntity())).isEqualTo("trace");
                // Methods that carry the parameter as a form body.
                request.setUrl("http://127.0.0.1:" + port + "/search");
                request.setMethod(HttpConstant.Method.POST);
                request.setRequestBody(HttpRequestBody.form(params, "utf-8"));
                httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
                assertThat(EntityUtils.toString(httpClient.execute(httpUriRequest).getEntity())).isEqualTo("post");
                request.setMethod(HttpConstant.Method.PUT);
                httpUriRequest = httpUriRequestConverter.convert(request, site, null).getHttpUriRequest();
                assertThat(EntityUtils.toString(httpClient.execute(httpUriRequest).getEntity())).isEqualTo("put");
            } finally {
                httpClient.close();
            }
        }
    });
}
Also used : Site(us.codecraft.webmagic.Site) HttpUriRequest(org.apache.http.client.methods.HttpUriRequest) Runnable(com.github.dreamhead.moco.Runnable) HttpServer(com.github.dreamhead.moco.HttpServer) Request(us.codecraft.webmagic.Request) HashedMap(org.apache.commons.collections.map.HashedMap) Map(java.util.Map) IOException(java.io.IOException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Test(org.junit.Test)

Example 4 with Site

use of us.codecraft.webmagic.Site in project webmagic by code4craft.

the class SeleniumDownloaderTest method test.

/**
 * Downloads http://huaban.com/ 100 times through a {@code SeleniumDownloader}, printing on
 * each pass the links under {@code #waterfall} that match {@code .*pins.*}, and finally
 * prints the elapsed time in milliseconds. Ignored by default: it needs a chrome driver.
 */
@Ignore("need chrome driver")
@Test
public void test() {
    SeleniumDownloader downloader = new SeleniumDownloader(chromeDriverPath);
    final long startedAt = System.currentTimeMillis();
    int remaining = 100;
    while (remaining > 0) {
        Request request = new Request("http://huaban.com/");
        Task task = new Task() {

            @Override
            public String getUUID() {
                return "huaban.com";
            }

            @Override
            public Site getSite() {
                return Site.me();
            }
        };
        Page downloaded = downloader.download(request, task);
        System.out.println(downloaded.getHtml().$("#waterfall").links().regex(".*pins.*").all());
        remaining--;
    }
    long elapsedMillis = System.currentTimeMillis() - startedAt;
    System.out.println(elapsedMillis);
}
Also used : Site(us.codecraft.webmagic.Site) Task(us.codecraft.webmagic.Task) Request(us.codecraft.webmagic.Request) Page(us.codecraft.webmagic.Page) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 5 with Site

use of us.codecraft.webmagic.Site in project webmagic by code4craft.

the class SeleniumDownloaderTest method testBaiduWenku.

/**
 * Downloads a Baidu Wenku document page through a {@code SeleniumDownloader} (sleep time set
 * to 10000) and prints the content of every {@code div.inner} element with markup tags and
 * {@code &nbsp;} entities stripped out.
 */
@Ignore
@Test
public void testBaiduWenku() {
    SeleniumDownloader seleniumDownloader = new SeleniumDownloader(chromeDriverPath);
    seleniumDownloader.setSleepTime(10000);
    Page page = seleniumDownloader.download(new Request("http://wenku.baidu.com/view/462933ff04a1b0717fd5ddc2.html"), new Task() {

        @Override
        public String getUUID() {
            // NOTE(review): identifier looks copied from the huaban.com test — confirm whether it matters here.
            return "huaban.com";
        }

        @Override
        public Site getSite() {
            return Site.me();
        }
    });
    // The entity was previously misspelled as "&nsbp;", so non-breaking spaces were never removed.
    System.out.println(page.getHtml().$("div.inner").replace("<[^<>]+>", "").replace("&nbsp;", "").all());
}
Also used : Site(us.codecraft.webmagic.Site) Task(us.codecraft.webmagic.Task) Request(us.codecraft.webmagic.Request) Page(us.codecraft.webmagic.Page) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

Site (us.codecraft.webmagic.Site)8 Request (us.codecraft.webmagic.Request)6 Test (org.junit.Test)5 Page (us.codecraft.webmagic.Page)5 HttpServer (com.github.dreamhead.moco.HttpServer)3 Runnable (com.github.dreamhead.moco.Runnable)3 IOException (java.io.IOException)3 HttpUriRequest (org.apache.http.client.methods.HttpUriRequest)3 Task (us.codecraft.webmagic.Task)3 UnsupportedEncodingException (java.io.UnsupportedEncodingException)2 Map (java.util.Map)2 Ignore (org.junit.Ignore)2 ResultItems (us.codecraft.webmagic.ResultItems)2 ArrayList (java.util.ArrayList)1 HashedMap (org.apache.commons.collections.map.HashedMap)1 CloseableHttpResponse (org.apache.http.client.methods.CloseableHttpResponse)1 CloseableHttpClient (org.apache.http.impl.client.CloseableHttpClient)1 BeforeClass (org.junit.BeforeClass)1 Cookie (org.openqa.selenium.Cookie)1 WebDriver (org.openqa.selenium.WebDriver)1