Search in sources :

Example 1 with Html

Use of com.cv4j.netdiscovery.core.parser.selector.Html in the NetDiscovery project by fengzhizi715.

The run method of the Spider class.

/**
 * Main crawl loop: polls requests for this spider from the queue, downloads each one,
 * wraps the response in a {@link Page}, and passes that page to the parser, the
 * pipelines and finally the request's after-request hook.
 *
 * <p>The loop ends when the spider status becomes {@code SPIDER_STATUS_STOPPED}, when the
 * queue returns no request, or when the running thread has been interrupted. In every
 * case {@code stopSpider(downloader)} is invoked on the way out.
 */
public void run() {
    checkRunningStat();
    initialDelay();
    try {
        while (getSpiderStatus() != SPIDER_STATUS_STOPPED && !Thread.currentThread().isInterrupted()) {
            // Crawling is paused: wait on the latch, then apply the initial delay again.
            if (pause) {
                try {
                    this.pauseCountDown.await();
                } catch (InterruptedException e) {
                    log.error("can't pause : ", e);
                    // Restore the interrupt flag for the caller and stop crawling; nothing
                    // has been dequeued yet, so no request is lost.
                    Thread.currentThread().interrupt();
                    break;
                }
                initialDelay();
            }

            final Request request = queue.poll(name);
            if (request == null) {
                // The queue returned nothing for this spider: leave the loop.
                break;
            }

            if (request.getSleepTime() > 0) {
                try {
                    Thread.sleep(request.getSleepTime());
                } catch (InterruptedException e) {
                    log.error("interrupted while sleeping before request : " + request.getUrl(), e);
                    // Restore the interrupt flag. The request already taken from the queue is
                    // still processed below; the loop condition then ends the loop.
                    Thread.currentThread().interrupt();
                }
            }

            // Attach a proxy from the pool only when the request does not carry its own.
            if (autoProxy && request.getProxy() == null) {
                Proxy proxy = ProxyPool.getProxy();
                if (proxy != null && Utils.checkProxy(proxy)) {
                    request.proxy(proxy);
                }
            }

            if (request.getBeforeRequest() != null) {
                request.getBeforeRequest().process(request);
            }

            downloader.download(request)
                    .map(response -> {
                        // Stage 1: build the Page from the raw response.
                        Page page = new Page();
                        page.setRequest(request);
                        page.setUrl(request.getUrl());
                        page.setStatusCode(response.getStatusCode());

                        if (Utils.isTextType(response.getContentType())) {
                            // text/html
                            page.setHtml(new Html(response.getContent()));
                        } else if (Utils.isApplicationJSONType(response.getContentType())) {
                            // application/json
                            // Convert the JSON string into a Json object and store it in the
                            // Page's RESPONSE_JSON field. A Json object is used because Json
                            // provides toObject(), which can convert it to a concrete class.
                            // NOTE(review): new String(...) without a charset decodes with the
                            // platform default; presumably getContent() is a byte[] - confirm
                            // and pass an explicit charset.
                            page.putField(Constant.RESPONSE_JSON, new Json(new String(response.getContent())));
                        } else {
                            // Any other content type: keep the InputStream.
                            page.putField(Constant.RESPONSE_RAW, response.getIs());
                        }
                        return page;
                    })
                    .map(page -> {
                        // Stage 2: let the parser (if any) process the page.
                        if (parser != null) {
                            parser.process(page);
                        }
                        return page;
                    })
                    .map(page -> {
                        // Stage 3: hand the page's result items to every pipeline.
                        if (Preconditions.isNotBlank(pipelines)) {
                            pipelines.forEach(pipeline -> pipeline.process(page.getResultItems()));
                        }
                        return page;
                    })
                    .observeOn(Schedulers.io())
                    .subscribe(page -> {
                        log.info(page.getUrl());
                        if (request.getAfterRequest() != null) {
                            request.getAfterRequest().process(page);
                        }
                    }, throwable ->
                        // Pass the throwable itself so the stack trace is not lost.
                        log.error(throwable.getMessage(), throwable));
        }
    } finally {
        // The spider stops.
        stopSpider(downloader);
    }
}
Also used : Arrays(java.util.Arrays) Queue(com.cv4j.netdiscovery.core.queue.Queue) Utils(com.cv4j.netdiscovery.core.utils.Utils) Getter(lombok.Getter) VertxDownloader(com.cv4j.netdiscovery.core.downloader.vertx.VertxDownloader) Constant(com.cv4j.netdiscovery.core.config.Constant) Flowable(io.reactivex.Flowable) Pipeline(com.cv4j.netdiscovery.core.pipeline.Pipeline) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Parser(com.cv4j.netdiscovery.core.parser.Parser) Response(com.cv4j.netdiscovery.core.domain.Response) Preconditions(com.safframework.tony.common.utils.Preconditions) ProxyPool(com.cv4j.proxy.ProxyPool) Schedulers(io.reactivex.schedulers.Schedulers) Proxy(com.cv4j.proxy.domain.Proxy) IOUtils(com.safframework.tony.common.utils.IOUtils) Page(com.cv4j.netdiscovery.core.domain.Page) Html(com.cv4j.netdiscovery.core.parser.selector.Html) Json(com.cv4j.netdiscovery.core.parser.selector.Json) LinkedHashSet(java.util.LinkedHashSet) Downloader(com.cv4j.netdiscovery.core.downloader.Downloader) DefaultQueue(com.cv4j.netdiscovery.core.queue.DefaultQueue) Set(java.util.Set) Consumer(io.reactivex.functions.Consumer) TimeUnit(java.util.concurrent.TimeUnit) CountDownLatch(java.util.concurrent.CountDownLatch) CompositeDisposable(io.reactivex.disposables.CompositeDisposable) Slf4j(lombok.extern.slf4j.Slf4j) Function(io.reactivex.functions.Function) Request(com.cv4j.netdiscovery.core.domain.Request) Request(com.cv4j.netdiscovery.core.domain.Request) Html(com.cv4j.netdiscovery.core.parser.selector.Html) Page(com.cv4j.netdiscovery.core.domain.Page) Json(com.cv4j.netdiscovery.core.parser.selector.Json) Response(com.cv4j.netdiscovery.core.domain.Response) Function(io.reactivex.functions.Function) Proxy(com.cv4j.proxy.domain.Proxy)

Aggregations

Constant (com.cv4j.netdiscovery.core.config.Constant)1 Page (com.cv4j.netdiscovery.core.domain.Page)1 Request (com.cv4j.netdiscovery.core.domain.Request)1 Response (com.cv4j.netdiscovery.core.domain.Response)1 Downloader (com.cv4j.netdiscovery.core.downloader.Downloader)1 VertxDownloader (com.cv4j.netdiscovery.core.downloader.vertx.VertxDownloader)1 Parser (com.cv4j.netdiscovery.core.parser.Parser)1 Html (com.cv4j.netdiscovery.core.parser.selector.Html)1 Json (com.cv4j.netdiscovery.core.parser.selector.Json)1 Pipeline (com.cv4j.netdiscovery.core.pipeline.Pipeline)1 DefaultQueue (com.cv4j.netdiscovery.core.queue.DefaultQueue)1 Queue (com.cv4j.netdiscovery.core.queue.Queue)1 Utils (com.cv4j.netdiscovery.core.utils.Utils)1 ProxyPool (com.cv4j.proxy.ProxyPool)1 Proxy (com.cv4j.proxy.domain.Proxy)1 IOUtils (com.safframework.tony.common.utils.IOUtils)1 Preconditions (com.safframework.tony.common.utils.Preconditions)1 Flowable (io.reactivex.Flowable)1 CompositeDisposable (io.reactivex.disposables.CompositeDisposable)1 Consumer (io.reactivex.functions.Consumer)1