Use of com.cv4j.netdiscovery.core.parser.selector.Html in project NetDiscovery by fengzhizi715.
The run method of the class Spider.
/**
 * Runs the spider's main crawl loop.
 *
 * <p>After {@code checkRunningStat()} and {@code initialDelay()}, requests are polled from
 * {@code queue} under this spider's {@code name} until either the spider status becomes
 * {@code SPIDER_STATUS_STOPPED} or the queue returns {@code null}. Each request is
 * downloaded, wrapped into a {@code Page}, passed to the parser and the pipelines (when
 * configured), and finally to the request's after-request hook.
 * {@code stopSpider(downloader)} is always invoked when the loop exits, including when it
 * exits through an exception.
 */
public void run() {
    checkRunningStat();
    initialDelay();

    try {
        while (getSpiderStatus() != SPIDER_STATUS_STOPPED) {

            // Crawling is paused: block until the pause latch is released.
            if (pause) {
                try {
                    this.pauseCountDown.await();
                } catch (InterruptedException e) {
                    // NOTE(review): the interrupt flag is not restored here; the loop
                    // deliberately keeps running. Confirm this is the intended contract.
                    log.error("can't pause : ", e);
                }
                initialDelay();
            }

            final Request request = queue.poll(name);
            if (request == null) {
                // Nothing left to crawl.
                break;
            }

            // Per-request throttling.
            if (request.getSleepTime() > 0) {
                try {
                    Thread.sleep(request.getSleepTime());
                } catch (InterruptedException e) {
                    // Report through the logger (as the pause branch does) instead of
                    // printing to stderr. NOTE(review): interrupt flag is not restored.
                    log.error("sleep interrupted : ", e);
                }
            }

            // Pick a proxy from the pool only when the request does not carry its own.
            if (autoProxy && request.getProxy() == null) {
                Proxy proxy = ProxyPool.getProxy();
                if (proxy != null && Utils.checkProxy(proxy)) {
                    request.proxy(proxy);
                }
            }

            if (request.getBeforeRequest() != null) {
                request.getBeforeRequest().process(request);
            }

            downloader.download(request)
                    .map(response -> {
                        // Wrap the raw response into a Page according to its content type.
                        Page page = new Page();
                        page.setRequest(request);
                        page.setUrl(request.getUrl());
                        page.setStatusCode(response.getStatusCode());

                        if (Utils.isTextType(response.getContentType())) {
                            // text/html
                            page.setHtml(new Html(response.getContent()));
                        } else if (Utils.isApplicationJSONType(response.getContentType())) {
                            // application/json
                            // Convert the JSON string into a Json object and store it under
                            // the RESPONSE_JSON field. A Json object is used because it
                            // offers toObject(), which can convert to a concrete class.
                            // NOTE(review): new String(...) has no explicit charset here —
                            // confirm the type of getContent() and pass UTF-8 if it is byte[].
                            page.putField(Constant.RESPONSE_JSON, new Json(new String(response.getContent())));
                        } else {
                            // Any other content type: keep the InputStream.
                            page.putField(Constant.RESPONSE_RAW, response.getIs());
                        }
                        return page;
                    })
                    .map(page -> {
                        if (parser != null) {
                            parser.process(page);
                        }
                        return page;
                    })
                    .map(page -> {
                        if (Preconditions.isNotBlank(pipelines)) {
                            pipelines.stream().forEach(pipeline -> pipeline.process(page.getResultItems()));
                        }
                        return page;
                    })
                    .observeOn(Schedulers.io())
                    .subscribe(page -> {
                        log.info(page.getUrl());
                        if (request.getAfterRequest() != null) {
                            request.getAfterRequest().process(page);
                        }
                    }, throwable -> {
                        // Pass the throwable itself so the stack trace is not lost.
                        log.error(throwable.getMessage(), throwable);
                    });
        }
    } finally {
        // The spider stops.
        stopSpider(downloader);
    }
}
Aggregations