Search in sources :

Example 11 with Page

use of us.codecraft.webmagic.Page in project webmagic by code4craft.

the class HttpClientDownloader method handleResponse.

/**
 * Wraps a received HTTP response in a {@link Page}.
 *
 * <p>The page carries the decoded body as raw text, the request URL as a
 * {@link PlainText}, the originating request and the response status code.
 * The {@code task} argument is not read by this implementation.
 *
 * @param request      the request that produced the response
 * @param charset      charset handed to {@code getContent} for decoding the body
 * @param httpResponse the response to convert
 * @param task         the task the request belongs to (unused here)
 * @return the populated page
 * @throws IOException if reading the response content fails
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    // Read the body first so that a failure surfaces before any page is built.
    final String body = getContent(charset, httpResponse);

    final Page result = new Page();
    result.setRawText(body);
    result.setUrl(new PlainText(request.getUrl()));
    result.setRequest(request);

    final int status = httpResponse.getStatusLine().getStatusCode();
    result.setStatusCode(status);
    return result;
}
Also used : PlainText(us.codecraft.webmagic.selector.PlainText) Page(us.codecraft.webmagic.Page)

Example 12 with Page

use of us.codecraft.webmagic.Page in project webmagic by code4craft.

the class AbstractDownloader method addToCycleRetry.

/**
 * Re-queues a failed request for another download attempt.
 *
 * <p>The number of attempts so far is kept in the request extras under
 * {@link Request#CYCLE_TRIED_TIMES}. When no counter is present the request is
 * re-queued with the counter set to 1. Otherwise the counter is incremented
 * and, if it has reached {@code site.getCycleRetryTimes()}, the request is
 * dropped.
 *
 * @param request the request whose download failed
 * @param site    the site configuration supplying the retry limit
 * @return a page flagged for cycle retry that holds the re-queued request, or
 *         {@code null} when the retry limit has been reached
 */
protected Page addToCycleRetry(Request request, Site site) {
    Page retryPage = new Page();
    Object previousAttempts = request.getExtra(Request.CYCLE_TRIED_TIMES);

    int attempts;
    if (previousAttempts == null) {
        // First retry: the limit is not consulted on this path.
        attempts = 1;
    } else {
        attempts = ((Integer) previousAttempts) + 1;
        if (attempts >= site.getCycleRetryTimes()) {
            return null;
        }
    }

    retryPage.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, attempts));
    retryPage.setNeedCycleRetry(true);
    return retryPage;
}
Also used : Page(us.codecraft.webmagic.Page)

Example 13 with Page

use of us.codecraft.webmagic.Page in project webmagic by code4craft.

the class HttpClientDownloader method download.

/**
 * Downloads the page for {@code request} and wraps the response in a {@link Page}.
 *
 * <p>Site-level settings (accepted status codes, charset, headers, proxy) are
 * taken from {@code task.getSite()} when a task is supplied; without a site
 * only status code 200 is accepted.
 *
 * @param request the request to execute
 * @param task    the task supplying the site configuration; may be {@code null}
 * @return the downloaded page; a cycle-retry page when an {@link IOException}
 *         occurs and the site has cycle retries configured; {@code null} when
 *         the status code is not accepted or the download fails without retry
 */
@Override
public Page download(Request request, Task task) {
    // A null task means there is no site configuration to draw from.
    Site site = null;
    if (task != null) {
        site = task.getSite();
    }
    Set<Integer> acceptStatCode;
    String charset = null;
    Map<String, String> headers = null;
    if (site != null) {
        acceptStatCode = site.getAcceptStatCode();
        charset = site.getCharset();
        headers = site.getHeaders();
    } else {
        // Without a site, only a plain 200 counts as success.
        acceptStatCode = WMCollections.newHashSet(200);
    }
    logger.info("downloading page {}", request.getUrl());
    CloseableHttpResponse httpResponse = null;
    // Stays 0 if no response is received before an exception is thrown.
    int statusCode = 0;
    try {
        HttpHost proxyHost = null;
        //TODO
        Proxy proxy = null;
        // An enabled proxy pool takes precedence over the site's single fixed proxy.
        if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
            proxy = site.getHttpProxyFromPool();
            proxyHost = proxy.getHttpHost();
        } else if (site != null && site.getHttpProxy() != null) {
            proxyHost = site.getHttpProxy();
        }
        HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers, proxyHost);
        httpResponse = getHttpClient(site, proxy).execute(httpUriRequest);
        statusCode = httpResponse.getStatusLine().getStatusCode();
        // Record the status on the request itself so it is available via its extras.
        request.putExtra(Request.STATUS_CODE, statusCode);
        if (statusAccept(acceptStatCode, statusCode)) {
            Page page = handleResponse(request, charset, httpResponse, task);
            onSuccess(request);
            return page;
        } else {
            // Unaccepted status: logged and reported as null; onError is not invoked on this path.
            logger.warn("get page {} error, status code {} ", request.getUrl(), statusCode);
            return null;
        }
    } catch (IOException e) {
        logger.warn("download page {} error", request.getUrl(), e);
        // With cycle retries configured the request is re-queued instead of being reported as an error.
        if (site != null && site.getCycleRetryTimes() > 0) {
            return addToCycleRetry(request, site);
        }
        onError(request);
        return null;
    } finally {
        if (httpResponse != null) {
            //ensure the connection is released back to pool
            EntityUtils.consumeQuietly(httpResponse.getEntity());
        }
        // Store the status again so the extra is set even when the request failed
        // before a response arrived (statusCode is still 0 in that case).
        request.putExtra(Request.STATUS_CODE, statusCode);
        if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
            // NOTE(review): Request.PROXY is not set in this method; presumably
            // getHttpUriRequest stores the proxy host in the extras - confirm.
            site.returnHttpProxyToPool((HttpHost) request.getExtra(Request.PROXY), (Integer) request.getExtra(Request.STATUS_CODE));
        }
    }
}
Also used : Site(us.codecraft.webmagic.Site) HttpUriRequest(org.apache.http.client.methods.HttpUriRequest) Proxy(us.codecraft.webmagic.proxy.Proxy) HttpHost(org.apache.http.HttpHost) CloseableHttpResponse(org.apache.http.client.methods.CloseableHttpResponse) Page(us.codecraft.webmagic.Page) IOException(java.io.IOException)

Example 14 with Page

use of us.codecraft.webmagic.Page in project webmagic by code4craft.

the class PhantomJSDownloader method download.

/**
 * Fetches the page for {@code request} through {@code getPage}, retrying while
 * the returned content contains the text "HTTP request failed".
 *
 * <p>After the initial attempt, up to {@code getRetryNum()} further attempts
 * are made. If the content still carries the failure text, a page holding only
 * the request is returned. Otherwise the page holds the content, the request
 * URL, the request and a status code of 200. The {@code task} argument is not
 * read by this implementation.
 *
 * @param request the request to download
 * @param task    the task the request belongs to (unused here)
 * @return the resulting page; on failure it carries only the request
 */
@Override
public Page download(Request request, Task task) {
    if (logger.isInfoEnabled()) {
        logger.info("downloading page: " + request.getUrl());
    }

    final String failureMarker = "HTTP request failed";

    String body = getPage(request);
    int attempt = 1;
    // Retry only while the last attempt failed and retries remain.
    while (body.contains(failureMarker) && attempt <= getRetryNum()) {
        body = getPage(request);
        attempt++;
    }

    Page result = new Page();
    if (body.contains(failureMarker)) {
        //when failed
        result.setRequest(request);
    } else {
        result.setRawText(body);
        result.setUrl(new PlainText(request.getUrl()));
        result.setRequest(request);
        result.setStatusCode(200);
    }
    return result;
}
Also used : PlainText(us.codecraft.webmagic.selector.PlainText) Page(us.codecraft.webmagic.Page)

Example 15 with Page

use of us.codecraft.webmagic.Page in project webmagic by code4craft.

the class HttpClientDownloaderTest method testCycleTriedTimes.

/**
 * Verifies that a failed download is re-queued with an increasing
 * {@link Request#CYCLE_TRIED_TIMES} counter when cycle retries are enabled.
 *
 * <p>The first download of the {@code PAGE_ALWAYS_NOT_EXISTS} request must
 * yield a page holding a target request with the counter at 1; downloading
 * that target request must yield the counter at 2.
 */
@Test
public void testCycleTriedTimes() {
    HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
    Task task = Site.me().setDomain("localhost").setCycleRetryTimes(5).toTask();
    Request request = new Request(PAGE_ALWAYS_NOT_EXISTS);
    Page page = httpClientDownloader.download(request, task);
    // A bare assertThat(...) verifies nothing; it must end in an assertion
    // method to actually check that the request was re-queued.
    assertThat(page.getTargetRequests().size() > 0).isTrue();
    assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(1);
    page = httpClientDownloader.download(page.getTargetRequests().get(0), task);
    assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(2);
}
Also used : Task(us.codecraft.webmagic.Task) Request(us.codecraft.webmagic.Request) Page(us.codecraft.webmagic.Page) Test(org.junit.Test)

Aggregations

Page (us.codecraft.webmagic.Page)15 Request (us.codecraft.webmagic.Request)9 Test (org.junit.Test)7 PlainText (us.codecraft.webmagic.selector.PlainText)6 Site (us.codecraft.webmagic.Site)5 Ignore (org.junit.Ignore)3 Task (us.codecraft.webmagic.Task)3 Html (us.codecraft.webmagic.selector.Html)3 IOException (java.io.IOException)2 HttpServer (com.github.dreamhead.moco.HttpServer)1 Runnable (com.github.dreamhead.moco.Runnable)1 UnsupportedEncodingException (java.io.UnsupportedEncodingException)1 ArrayList (java.util.ArrayList)1 Map (java.util.Map)1 HttpHost (org.apache.http.HttpHost)1 CloseableHttpResponse (org.apache.http.client.methods.CloseableHttpResponse)1 HttpUriRequest (org.apache.http.client.methods.HttpUriRequest)1 Cookie (org.openqa.selenium.Cookie)1 WebDriver (org.openqa.selenium.WebDriver)1 WebElement (org.openqa.selenium.WebElement)1