Search in sources :

Example 11 with Request

use of us.codecraft.webmagic.Request in project webmagic by code4craft.

the class HttpClientDownloaderTest method test_selectRequestMethod.

/**
 * Checks that {@code HttpClientDownloader.selectRequestMethod} builds a request using the HTTP
 * method configured on the {@link Request}.
 *
 * <p>A stub server on port 12306 answers each HTTP method with a distinct marker (a body for
 * GET/POST/PUT/DELETE/TRACE, a {@code method} header for HEAD); the test sends one request per
 * method and asserts that the matching marker comes back.
 */
@Test
public void test_selectRequestMethod() throws Exception {
    HttpServer mockServer = httpserver(12306);
    // One stub per HTTP method, each identified by its own response marker.
    mockServer.get(eq(query("q"), "webmagic")).response("get");
    mockServer.post(eq(form("q"), "webmagic")).response("post");
    mockServer.put(eq(form("q"), "webmagic")).response("put");
    mockServer.delete(eq(query("q"), "webmagic")).response("delete");
    // HEAD is identified through a response header rather than a body.
    mockServer.request(and(by(method("HEAD")), eq(query("q"), "webmagic"))).response(header("method", "head"));
    mockServer.request(and(by(method("TRACE")), eq(query("q"), "webmagic"))).response("trace");
    Runner.running(mockServer, new Runnable() {

        @Override
        public void run() throws Exception {
            HttpClientDownloader downloader = new HttpClientDownloader();
            Request request = new Request();
            request.setUrl("http://127.0.0.1:12306/search");
            request.putParams("q", "webmagic");

            request.setMethod(HttpConstant.Method.GET);
            RequestBuilder builder = downloader.selectRequestMethod(request).setUri(request.getUrl());
            assertThat(bodyOf(builder)).isEqualTo("get");

            request.setMethod(HttpConstant.Method.POST);
            builder = downloader.selectRequestMethod(request).setUri(request.getUrl());
            assertThat(bodyOf(builder)).isEqualTo("post");

            request.setMethod(HttpConstant.Method.PUT);
            builder = downloader.selectRequestMethod(request).setUri(request.getUrl());
            assertThat(bodyOf(builder)).isEqualTo("put");

            request.setMethod(HttpConstant.Method.DELETE);
            builder = downloader.selectRequestMethod(request).setUri(request.getUrl());
            assertThat(bodyOf(builder)).isEqualTo("delete");

            // HEAD: verify the marker header instead of reading a body.
            request.setMethod(HttpConstant.Method.HEAD);
            builder = downloader.selectRequestMethod(request).setUri(request.getUrl());
            assertThat(HttpClients.custom().build().execute(builder.build()).getFirstHeader("method").getValue())
                    .isEqualTo("head");

            request.setMethod(HttpConstant.Method.TRACE);
            builder = downloader.selectRequestMethod(request).setUri(request.getUrl());
            assertThat(bodyOf(builder)).isEqualTo("trace");
        }

        /** Executes the built request on a fresh client and returns the response body as text. */
        private String bodyOf(RequestBuilder builder) throws Exception {
            return EntityUtils.toString(HttpClients.custom().build().execute(builder.build()).getEntity());
        }
    });
}
Also used : RequestBuilder(org.apache.http.client.methods.RequestBuilder) Runnable(com.github.dreamhead.moco.Runnable) HttpServer(com.github.dreamhead.moco.HttpServer) Request(us.codecraft.webmagic.Request) IOException(java.io.IOException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Test(org.junit.Test)

Example 12 with Request

use of us.codecraft.webmagic.Request in project webmagic by code4craft.

the class HttpClientDownloaderTest method testGetHtmlCharset.

/**
 * Checks that {@code HttpClientDownloader.getHtmlCharset} reports {@code "gbk"} for three ways of
 * declaring the charset: the {@code Content-Type} response header ({@code /header}), a
 * {@code <meta charset>} tag ({@code /meta4}) and a {@code <meta http-equiv="Content-Type">} tag
 * ({@code /meta5}). The two meta endpoints send an empty {@code Content-Type} header.
 */
@Test
public void testGetHtmlCharset() throws Exception {
    HttpServer server = httpserver(12306);
    server.get(by(uri("/header"))).response(header("Content-Type", "text/html; charset=gbk"));
    server.get(by(uri("/meta4"))).response(with(text("<html>\n" + "  <head>\n" + "    <meta charset='gbk'/>\n" + "  </head>\n" + "  <body></body>\n" + "</html>")), header("Content-Type", ""));
    server.get(by(uri("/meta5"))).response(with(text("<html>\n" + "  <head>\n" + "    <meta http-equiv=\"Content-Type\" content=\"text/html; charset=gbk\" />\n" + "  </head>\n" + "  <body></body>\n" + "</html>")), header("Content-Type", ""));
    Runner.running(server, new Runnable() {

        @Override
        public void run() throws Exception {
            // Expected value goes first so that failure messages read correctly.
            assertEquals("gbk", getCharsetByUrl("http://127.0.0.1:12306/header"));
            assertEquals("gbk", getCharsetByUrl("http://127.0.0.1:12306/meta4"));
            assertEquals("gbk", getCharsetByUrl("http://127.0.0.1:12306/meta5"));
        }

        /**
         * Fetches {@code url} and returns the charset the downloader detects for the response.
         *
         * @param url the address to request
         * @return the value returned by {@code getHtmlCharset} for the response
         * @throws IOException if the request or reading the body fails; propagated so the test
         *         fails with the real cause instead of a later NullPointerException
         */
        private String getCharsetByUrl(String url) throws IOException {
            HttpClientDownloader downloader = new HttpClientDownloader();
            Site site = Site.me();
            CloseableHttpClient httpClient = new HttpClientGenerator().getClient(site, null);
            try {
                CloseableHttpResponse httpResponse = httpClient.execute(downloader.getHttpUriRequest(new Request(url), site, null, null));
                try {
                    byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
                    return downloader.getHtmlCharset(httpResponse, contentBytes);
                } finally {
                    // Release the connection held by the response.
                    httpResponse.close();
                }
            } finally {
                httpClient.close();
            }
        }
    });
}
Also used : Site(us.codecraft.webmagic.Site) CloseableHttpClient(org.apache.http.impl.client.CloseableHttpClient) Runnable(com.github.dreamhead.moco.Runnable) HttpServer(com.github.dreamhead.moco.HttpServer) Request(us.codecraft.webmagic.Request) CloseableHttpResponse(org.apache.http.client.methods.CloseableHttpResponse) IOException(java.io.IOException) Test(org.junit.Test)

Example 13 with Request

use of us.codecraft.webmagic.Request in project webmagic by code4craft.

the class PriorityScheduler method poll.

/**
 * Returns the next request, consulting the queues in a fixed order: {@code priorityQueuePlus}
 * first, then {@code noPriorityQueue}, and finally {@code priorityQueueMinus}.
 *
 * @param task not read by this implementation
 * @return the first non-null request found, or whatever {@code priorityQueueMinus.poll()}
 *         returns when the first two queues yield {@code null}
 */
@Override
public synchronized Request poll(Task task) {
    Request next = priorityQueuePlus.poll();
    if (next == null) {
        next = noPriorityQueue.poll();
    }
    if (next == null) {
        next = priorityQueueMinus.poll();
    }
    return next;
}
Also used : Request(us.codecraft.webmagic.Request)

Example 14 with Request

use of us.codecraft.webmagic.Request in project webmagic by code4craft.

the class MockGithubDownloader method download.

/**
 * Builds a canned page instead of performing a download: the content is the {@code html} held
 * by this mock, and both the page's request and URL point at the webmagic GitHub repository.
 *
 * @param request ignored
 * @param task ignored
 * @return a newly created page populated with the fixed content and URL
 */
@Override
public Page download(Request request, Task task) {
    String repositoryUrl = "https://github.com/code4craft/webmagic";
    Page cannedPage = new Page();
    cannedPage.setHtml(new Html(html));
    cannedPage.setRequest(new Request(repositoryUrl));
    cannedPage.setUrl(new PlainText(repositoryUrl));
    return cannedPage;
}
Also used : PlainText(us.codecraft.webmagic.selector.PlainText) Request(us.codecraft.webmagic.Request) Html(us.codecraft.webmagic.selector.Html) Page(us.codecraft.webmagic.Page)

Example 15 with Request

use of us.codecraft.webmagic.Request in project webmagic by code4craft.

the class SeleniumDownloaderTest method test.

/**
 * Manual timing run: downloads the same page 100 times through {@code SeleniumDownloader},
 * prints the links under {@code #waterfall} that match {@code .*pins.*} for each download,
 * and finally prints the total elapsed milliseconds. Ignored by default (see the annotation).
 */
@Ignore("need chrome driver")
@Test
public void test() {
    SeleniumDownloader downloader = new SeleniumDownloader(chromeDriverPath);
    long startedAt = System.currentTimeMillis();
    int remaining = 100;
    while (remaining-- > 0) {
        Request request = new Request("http://huaban.com/");
        Task task = new Task() {

            @Override
            public String getUUID() {
                return "huaban.com";
            }

            @Override
            public Site getSite() {
                return Site.me();
            }
        };
        Page page = downloader.download(request, task);
        System.out.println(page.getHtml()
                .$("#waterfall")
                .links()
                .regex(".*pins.*")
                .all());
    }
    long elapsedMillis = System.currentTimeMillis() - startedAt;
    System.out.println(elapsedMillis);
}
Also used : Site(us.codecraft.webmagic.Site) Task(us.codecraft.webmagic.Task) Request(us.codecraft.webmagic.Request) Page(us.codecraft.webmagic.Page) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

Request (us.codecraft.webmagic.Request)27 Test (org.junit.Test)19 Page (us.codecraft.webmagic.Page)9 Ignore (org.junit.Ignore)8 Task (us.codecraft.webmagic.Task)8 Site (us.codecraft.webmagic.Site)4 DuplicateRemover (us.codecraft.webmagic.scheduler.component.DuplicateRemover)4 HttpServer (com.github.dreamhead.moco.HttpServer)3 Runnable (com.github.dreamhead.moco.Runnable)3 IOException (java.io.IOException)3 PlainText (us.codecraft.webmagic.selector.PlainText)3 UnsupportedEncodingException (java.io.UnsupportedEncodingException)2 HashSetDuplicateRemover (us.codecraft.webmagic.scheduler.component.HashSetDuplicateRemover)2 Html (us.codecraft.webmagic.selector.Html)2 Matcher (java.util.regex.Matcher)1 Pattern (java.util.regex.Pattern)1 CloseableHttpResponse (org.apache.http.client.methods.CloseableHttpResponse)1 RequestBuilder (org.apache.http.client.methods.RequestBuilder)1 CloseableHttpClient (org.apache.http.impl.client.CloseableHttpClient)1 BeforeClass (org.junit.BeforeClass)1