Search in sources :

Example 1 with CrawlerSession

use of com.virjar.vscrawler.core.net.session.CrawlerSession in project vscrawler by virjar.

the class VSCrawler method grabSync.

/**
 * Executes a grab task synchronously; intended for booking-style scenarios.
 * The task is not persisted, the grab result is not sent through the pipeline,
 * and session creation does not wait.
 *
 * <p>The {@code grabID} MDC entry set at the start of the call is always removed
 * before the method returns, including when no session could be borrowed or
 * when processing throws.
 *
 * @param seed the task seed
 * @return the grab result; when no session can be borrowed the result is marked
 *         as failed and carries an error message
 */
public GrabResult grabSync(Seed seed) {
    GrabResult crawlResult = new GrabResult();
    try {
        MDC.put("grabID", LogIdGenerator.genGrabTransactionID(vsCrawlerContext.getCrawlerName()));
        VSCrawlerCommonUtil.setGrabStartTimeStampThreadLocal(System.currentTimeMillis());
        // start component
        if (!hasComponentInit) {
            initComponentWithOutMainThread();
        }
        // set vsCrawlerContext into ThreadLocal, to support the event loop
        VSCrawlerCommonUtil.setVSCrawlerContext(vsCrawlerContext);
        // Bounded wait for a session so this thread cannot block indefinitely
        // (original note: 30-second resource request timeout).
        CrawlerSession session = crawlerSessionPool.borrowOne(VSCrawlerCommonUtil.grabTaskLessTime(), true);
        if (session == null) {
            VSCrawlerMonitor.recordOne(vsCrawlerContext.getCrawlerName() + "_borrowSession_failed");
            crawlResult.setGrabSuccess(false);
            crawlResult.setErrorMessage("can not allocate session resource from session pop");
            return crawlResult;
        }
        try {
            seed.setStatus(Seed.STATUS_RUNNING);
            VSCrawlerCommonUtil.setCrawlerSession(session);
            seedProcessor.process(seed, session, crawlResult);
            return crawlResult;
        } catch (Exception e) {
            log.error("error when grab seed:{}", JSONViewWrapper.wrap(seed), e);
            throw e;
        } finally {
            // Give the session back. Sessions are concurrency-controlled and can only be
            // reused by other tasks once returned, so recycle even if clearing the
            // thread-bound session fails.
            try {
                VSCrawlerCommonUtil.clearCrawlerSession();
            } finally {
                crawlerSessionPool.recycle(session);
            }
        }
    } finally {
        // Remove grabID here (not only after a session was borrowed) so the early
        // "no session" return and failures during start-up do not leave a stale
        // grabID on this thread.
        try {
            MDC.remove("grabID");
        } catch (Exception e) {
            // this exception is unimportant
            log.error("failed to remove MDC variable", e);
        }
        Long grabStartTimeStampThreadLocal = VSCrawlerCommonUtil.getGrabStartTimeStampThreadLocal();
        VSCrawlerMonitor.recordOne(vsCrawlerContext.getCrawlerName() + "_grab", grabStartTimeStampThreadLocal);
        VSCrawlerMonitor.recordOne(vsCrawlerContext.getCrawlerName() + "_grab_result_" + (crawlResult.isGrabSuccess() ? "success" : "failed"));
        VSCrawlerCommonUtil.clearGrabTimeOutControl();
    }
}
Also used : GrabResult(com.virjar.vscrawler.core.processor.GrabResult) CrawlerSession(com.virjar.vscrawler.core.net.session.CrawlerSession)

Example 2 with CrawlerSession

use of com.virjar.vscrawler.core.net.session.CrawlerSession in project vscrawler by virjar.

the class BeautyCrawler method main.

/**
 * Demo entry point: crawls a few gallery pages, follows "next page" links and
 * saves every linked {@code .jpg} picture to disk.
 *
 * @param args unused
 * @throws IOException declared by the original signature
 */
public static void main(String[] args) throws IOException {
    VSCrawler vsCrawler = VSCrawlerBuilder.create().setCrawlerName("beautyCrawler").setProcessor(new SeedProcessor() {

        /** Downloads one picture, sending the referer stored in the seed's ext map, and writes it to disk. */
        private void handlePic(Seed seed, CrawlerSession crawlerSession) {
            Header[] headers = HeaderBuilder.create().withRefer(seed.getExt().get("refer")).defaultCommonHeader().buildArray();
            byte[] entity = crawlerSession.getCrawlerHttpClient().getEntity(seed.getData(), headers);
            if (entity == null) {
                // download failed, let the framework retry this seed
                seed.retry();
                return;
            }
            try {
                // the target file is derived automatically from the base directory and the source URL
                Files.write(entity, new File(PathResolver.sourceToUnderLine("~/Desktop/testpic", seed.getData())));
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

        @Override
        public void process(final Seed seed, CrawlerSession crawlerSession, GrabResult crawlResult) {
            if (StringUtils.endsWithIgnoreCase(seed.getData(), ".jpg")) {
                handlePic(seed, crawlerSession);
            } else {
                String s = crawlerSession.getCrawlerHttpClient().get(seed.getData());
                if (s == null) {
                    seed.retry();
                    return;
                }
                // extract the "next page" link and the picture links
                crawlResult.addSeeds(Lists.newArrayList(Iterables.transform(XpathParser.compileNoError("/css('#pages a')::self()[contains(text(),'下一页')]/absUrl('href') | /css('.content')::center/img/@src").evaluateToString(Jsoup.parse(s, seed.getData())), new Function<String, Seed>() {

                    @Override
                    public Seed apply(String input) {
                        Seed ret = new Seed(input);
                        // Must use the same case-insensitive test as process(): otherwise a
                        // ".JPG" link is handled as a picture but has no referer attached.
                        if (StringUtils.endsWithIgnoreCase(input, ".jpg")) {
                            ret.getExt().put("refer", seed.getData());
                        }
                        return ret;
                    }
                })));
            }
        }
    }).setWorkerThreadNumber(15).setSessionPoolCoreSize(20).setSessionPoolMaxSize(25).build();
    // clear historical crawl data; otherwise the crawl resumes from where it stopped
    vsCrawler.clearTask();
    vsCrawler.pushSeed("https://www.meitulu.com/item/2125.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/6892.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2124.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2120.html");
    vsCrawler.pushSeed("https://www.meitulu.com/item/2086.html");
    // start the crawler
    vsCrawler.start();
}
Also used : VSCrawler(com.virjar.vscrawler.core.VSCrawler) Function(com.google.common.base.Function) Seed(com.virjar.vscrawler.core.seed.Seed) GrabResult(com.virjar.vscrawler.core.processor.GrabResult) IOException(java.io.IOException) SeedProcessor(com.virjar.vscrawler.core.processor.SeedProcessor) CrawlerSession(com.virjar.vscrawler.core.net.session.CrawlerSession) File(java.io.File)

Example 3 with CrawlerSession

use of com.virjar.vscrawler.core.net.session.CrawlerSession in project vscrawler by virjar.

the class FutureCrawler method main.

/**
 * Demo entry point: shows seeds that become active in the future. Every processed
 * seed is copied and re-submitted with an activation time one minute ahead.
 *
 * @param args unused
 */
public static void main(String[] args) {
    VSCrawler vsCrawler = VSCrawlerBuilder.create().setStopWhileTaskEmptyDuration(2000).setSegmentResolver(new SegmentResolver() {

        @Override
        public long resolveSegmentKey(long activeTime) {
            // Segment by minute, so links are re-crawled every minute. This is for testing only;
            // a real crawler should not use such a short segment (per-day is recommended).
            // Both seconds and milliseconds are zeroed so that every timestamp inside the
            // same minute maps to the same key.
            return new DateTime(activeTime).withSecondOfMinute(0).withMillisOfSecond(0).getMillis();
        }
    }).setProcessor(new SeedProcessor() {

        @Override
        public void process(Seed seed, CrawlerSession crawlerSession, GrabResult crawlResult) {
            // make a copy of the seed
            Seed copy = seed.copy();
            // make the copy become active one minute from now
            copy.setActiveTimeStamp(DateTime.now().plusMinutes(1).getMillis());
            // hand the new seed back
            crawlResult.addSeed(copy);
        }
    }).build();
    // every demo currently clears the tasks, otherwise data of different crawlers may get mixed up
    vsCrawler.clearTask();
    vsCrawler.pushSeed("https://www.meitulu.com/item/6892.htm");
    vsCrawler.start();
}
Also used : VSCrawler(com.virjar.vscrawler.core.VSCrawler) SegmentResolver(com.virjar.vscrawler.core.seed.SegmentResolver) Seed(com.virjar.vscrawler.core.seed.Seed) GrabResult(com.virjar.vscrawler.core.processor.GrabResult) SeedProcessor(com.virjar.vscrawler.core.processor.SeedProcessor) CrawlerSession(com.virjar.vscrawler.core.net.session.CrawlerSession) DateTime(org.joda.time.DateTime)

Example 4 with CrawlerSession

use of com.virjar.vscrawler.core.net.session.CrawlerSession in project vscrawler by virjar.

the class AnnotationSeedProcessor method judgeDownloader.

/**
 * Chooses the downloader for the given model class.
 *
 * <p>The first public method annotated with {@link DownLoadMethod} is used: it is
 * invoked reflectively on the model with {@code (seed, crawlerSession)} and must
 * return a {@code String}. When no such method exists, the page is fetched with
 * the session's HTTP client using the seed's data as the URL.
 *
 * @param aClass the model class to inspect
 * @throws IllegalArgumentException if the annotated method does not return
 *         {@code String} or does not take exactly {@code (Seed, CrawlerSession)}
 */
private void judgeDownloader(Class<? extends AbstractAutoProcessModel> aClass) {
    Method[] methods = aClass.getMethods();
    for (final Method method : methods) {
        if (method.getAnnotation(DownLoadMethod.class) == null) {
            continue;
        }
        Class<?>[] parameterTypes = method.getParameterTypes();
        Preconditions.checkArgument(String.class.isAssignableFrom(method.getReturnType()), "download method %s must return String", method.toGenericString());
        // The method is always invoked with exactly (seed, crawlerSession), so any other
        // arity can never succeed at download time; reject it up front.
        Preconditions.checkArgument(parameterTypes.length == 2, "download method %s must take exactly two parameters", method.toGenericString());
        Preconditions.checkArgument(parameterTypes[0].isAssignableFrom(Seed.class), "first parameter of download method %s must accept a Seed", method.toGenericString());
        Preconditions.checkArgument(parameterTypes[1].isAssignableFrom(CrawlerSession.class), "second parameter of download method %s must accept a CrawlerSession", method.toGenericString());
        downloader = new Downloader() {

            @Override
            public String download(Seed seed, AbstractAutoProcessModel model, CrawlerSession crawlerSession) {
                try {
                    return (String) method.invoke(model, seed, crawlerSession);
                } catch (Exception e) {
                    throw new RuntimeException("invoke download method :" + method.toGenericString() + " failed", e);
                }
            }
        };
        return;
    }
    // no annotated method: fall back to a plain GET of the seed's URL
    downloader = new Downloader() {

        @Override
        public String download(Seed seed, AbstractAutoProcessModel model, CrawlerSession crawlerSession) {
            return crawlerSession.getCrawlerHttpClient().get(seed.getData());
        }
    };
}
Also used : Seed(com.virjar.vscrawler.core.seed.Seed) DownLoadMethod(com.virjar.vscrawler.core.processor.configurableprocessor.annotiondriven.annotation.DownLoadMethod) Method(java.lang.reflect.Method) CrawlerSession(com.virjar.vscrawler.core.net.session.CrawlerSession) DownLoadMethod(com.virjar.vscrawler.core.processor.configurableprocessor.annotiondriven.annotation.DownLoadMethod)

Aggregations

CrawlerSession (com.virjar.vscrawler.core.net.session.CrawlerSession): 4, GrabResult (com.virjar.vscrawler.core.processor.GrabResult): 3, Seed (com.virjar.vscrawler.core.seed.Seed): 3, VSCrawler (com.virjar.vscrawler.core.VSCrawler): 2, SeedProcessor (com.virjar.vscrawler.core.processor.SeedProcessor): 2, Function (com.google.common.base.Function): 1, DownLoadMethod (com.virjar.vscrawler.core.processor.configurableprocessor.annotiondriven.annotation.DownLoadMethod): 1, SegmentResolver (com.virjar.vscrawler.core.seed.SegmentResolver): 1, File (java.io.File): 1, IOException (java.io.IOException): 1, Method (java.lang.reflect.Method): 1, DateTime (org.joda.time.DateTime): 1