use of us.codecraft.webmagic.Page in project webmagic by code4craft.
the class HttpClientDownloader method handleResponse.
/**
 * Wraps an HTTP response into a {@link Page}.
 *
 * <p>The page receives the decoded body obtained from
 * {@code getContent(charset, httpResponse)}, the request URL wrapped in a
 * {@link PlainText}, the originating request and the response status code.
 *
 * @param request      the request that produced the response
 * @param charset      charset handed to {@code getContent}; may be {@code null}
 *                     when the caller has no site configuration
 * @param httpResponse the response to convert
 * @param task         not used by this implementation
 * @return a newly populated page
 * @throws IOException declared by the signature; presumably raised while the
 *                     body is read in {@code getContent}
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    final String body = getContent(charset, httpResponse);

    final Page result = new Page();
    result.setRawText(body);

    final PlainText pageUrl = new PlainText(request.getUrl());
    result.setUrl(pageUrl);
    result.setRequest(request);

    final int status = httpResponse.getStatusLine().getStatusCode();
    result.setStatusCode(status);
    return result;
}
use of us.codecraft.webmagic.Page in project webmagic by code4craft.
the class AbstractDownloader method addToCycleRetry.
/**
 * Schedules {@code request} for another download attempt.
 *
 * <p>The attempt counter is kept in the request's extras under
 * {@code Request.CYCLE_TRIED_TIMES}. A request without a counter is scheduled
 * with the counter set to 1. Otherwise the counter is incremented, and once the
 * incremented value reaches {@code site.getCycleRetryTimes()} no retry is
 * scheduled.
 *
 * @param request the request to retry; its priority is reset to 0 and its
 *                counter extra is updated when a retry is scheduled
 * @param site    supplies the retry limit
 * @return a page flagged as needing a cycle retry with {@code request} as its
 *         target request, or {@code null} when the retry limit is reached
 */
protected Page addToCycleRetry(Request request, Site site) {
    final Object storedTries = request.getExtra(Request.CYCLE_TRIED_TIMES);

    final int nextTries;
    if (storedTries != null) {
        nextTries = (Integer) storedTries + 1;
        if (nextTries >= site.getCycleRetryTimes()) {
            // Retry budget exhausted: give up on this request.
            return null;
        }
    } else {
        // First retry: the limit is not consulted in this case.
        nextTries = 1;
    }

    final Page retryPage = new Page();
    retryPage.addTargetRequest(request.setPriority(0).putExtra(Request.CYCLE_TRIED_TIMES, nextTries));
    retryPage.setNeedCycleRetry(true);
    return retryPage;
}
use of us.codecraft.webmagic.Page in project webmagic by code4craft.
the class HttpClientDownloader method download.
/**
 * Downloads the page for {@code request} over HTTP.
 *
 * <p>When {@code task} provides a site, its accepted status codes, charset,
 * headers and proxy settings are used; otherwise only status 200 is accepted.
 * The status code is stored in the request's extras under
 * {@code Request.STATUS_CODE} (0 if no response was received).
 *
 * @param request the request to execute
 * @param task    the owning task; may be {@code null}
 * @return the downloaded page; a cycle-retry page from
 *         {@code addToCycleRetry} when an {@link IOException} occurs and the
 *         site has cycle retries enabled; {@code null} when the status code is
 *         not accepted or the download failed without cycle retry
 */
@Override
public Page download(Request request, Task task) {
    final Site site = (task == null) ? null : task.getSite();

    final Set<Integer> acceptedCodes;
    String pageCharset = null;
    Map<String, String> requestHeaders = null;
    if (site == null) {
        acceptedCodes = WMCollections.newHashSet(200);
    } else {
        acceptedCodes = site.getAcceptStatCode();
        pageCharset = site.getCharset();
        requestHeaders = site.getHeaders();
    }

    logger.info("downloading page {}", request.getUrl());

    CloseableHttpResponse response = null;
    int status = 0;
    try {
        // TODO carried over from the original source (left unspecified there).
        Proxy pooledProxy = null;
        HttpHost proxyHost = null;
        if (site != null) {
            if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
                // An enabled proxy pool takes precedence over the fixed proxy.
                pooledProxy = site.getHttpProxyFromPool();
                proxyHost = pooledProxy.getHttpHost();
            } else if (site.getHttpProxy() != null) {
                proxyHost = site.getHttpProxy();
            }
        }

        final HttpUriRequest outgoing = getHttpUriRequest(request, site, requestHeaders, proxyHost);
        response = getHttpClient(site, pooledProxy).execute(outgoing);
        status = response.getStatusLine().getStatusCode();
        request.putExtra(Request.STATUS_CODE, status);

        if (!statusAccept(acceptedCodes, status)) {
            logger.warn("get page {} error, status code {} ", request.getUrl(), status);
            return null;
        }

        final Page downloaded = handleResponse(request, pageCharset, response, task);
        onSuccess(request);
        return downloaded;
    } catch (IOException e) {
        logger.warn("download page {} error", request.getUrl(), e);
        final boolean cycleRetryEnabled = site != null && site.getCycleRetryTimes() > 0;
        if (!cycleRetryEnabled) {
            onError(request);
            return null;
        }
        // With cycle retry enabled the request is rescheduled and onError is not invoked.
        return addToCycleRetry(request, site);
    } finally {
        if (response != null) {
            // Ensure the connection is released back to the pool.
            EntityUtils.consumeQuietly(response.getEntity());
        }
        request.putExtra(Request.STATUS_CODE, status);
        // The pool condition is evaluated again here rather than cached from the try block.
        if (site != null && site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
            final HttpHost usedProxy = (HttpHost) request.getExtra(Request.PROXY);
            final Integer reportedStatus = (Integer) request.getExtra(Request.STATUS_CODE);
            site.returnHttpProxyToPool(usedProxy, reportedStatus);
        }
    }
}
use of us.codecraft.webmagic.Page in project webmagic by code4craft.
the class PhantomJSDownloader method download.
/**
 * Downloads a page through {@code getPage(request)}, retrying on failure.
 *
 * <p>Output containing the text {@code "HTTP request failed"} is treated as a
 * failed attempt. After the initial attempt, up to {@code getRetryNum()}
 * further attempts are made, stopping at the first one that succeeds.
 *
 * @param request the request to download
 * @param task    not used by this implementation
 * @return a page with raw text, URL, request and status code 200 on success;
 *         a page carrying only the request when every attempt failed
 */
@Override
public Page download(Request request, Task task) {
    if (logger.isInfoEnabled()) {
        logger.info("downloading page: " + request.getUrl());
    }

    final String failureMarker = "HTTP request failed";

    String html = getPage(request);
    int retriesUsed = 0;
    while (html.contains(failureMarker) && retriesUsed < getRetryNum()) {
        html = getPage(request);
        retriesUsed++;
    }

    if (html.contains(failureMarker)) {
        // Every attempt failed: hand back a page that only knows its request.
        final Page failed = new Page();
        failed.setRequest(request);
        return failed;
    }

    final Page result = new Page();
    result.setRawText(html);
    final PlainText pageUrl = new PlainText(request.getUrl());
    result.setUrl(pageUrl);
    result.setRequest(request);
    result.setStatusCode(200);
    return result;
}
use of us.codecraft.webmagic.Page in project webmagic by code4craft.
the class HttpClientDownloaderTest method testCycleTriedTimes.
/**
 * Checks that a failed download with cycle retry enabled re-queues the request
 * and counts the attempts in {@code Request.CYCLE_TRIED_TIMES}.
 *
 * <p>The site is configured with 5 cycle retries. The first download must
 * yield a target request whose counter is 1, and downloading that target
 * request again must yield one whose counter is 2.
 * NOTE(review): this relies on {@code PAGE_ALWAYS_NOT_EXISTS} pointing at a
 * URL whose download fails with an I/O error - confirm against the constant.
 */
@Test
public void testCycleTriedTimes() {
    HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
    Task task = Site.me().setDomain("localhost").setCycleRetryTimes(5).toTask();
    Request request = new Request(PAGE_ALWAYS_NOT_EXISTS);

    Page page = httpClientDownloader.download(request, task);
    // Fail with a clear assertion instead of a NullPointerException below.
    assertThat(page).isNotNull();
    // The original called assertThat(boolean) without any assertion method,
    // which verifies nothing; assert on the size explicitly.
    assertThat(page.getTargetRequests().size()).isGreaterThan(0);
    assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(1);

    page = httpClientDownloader.download(page.getTargetRequests().get(0), task);
    assertThat(page).isNotNull();
    assertThat((Integer) page.getTargetRequests().get(0).getExtra(Request.CYCLE_TRIED_TIMES)).isEqualTo(2);
}
Aggregations