Search in sources :

Example 1 with ResourceQueue

use of com.virjar.vscrawler.core.resourcemanager.ResourceQueue in project vscrawler by virjar.

the class VSCrawlerBuilder method build.

/**
 * Assembles a {@link VSCrawler} from the values configured on this builder.
 * <p>
 * Every collaborator that was left unset is replaced by a default before it is used
 * (HTTP client generator, proxy strategy, seed source, key/segment resolvers, processor,
 * pipeline, resource manager, queue store planner, resource setting). When a user resource
 * facade is in effect, a user resource queue is registered (or extended) and an
 * {@link AutoLoginPlugin} observer is added. When {@code stopWhileTaskEmptyDuration} is
 * positive, observers that stop the crawler after it has been idle are added as well.
 *
 * @return the assembled crawler
 * @throws IllegalStateException if the proxy strategy is {@code CUSTOM} without a proxy planner,
 *                               if both a processor and seed routers were configured, or if a
 *                               user resource facade is in effect but no login handler was set
 */
public VSCrawler build() {
    final VSCrawlerContext vsCrawlerContext = VSCrawlerContext.create(crawlerName);

    // ---- network / session layer ----
    if (crawlerHttpClientGenerator == null) {
        crawlerHttpClientGenerator = new DefaultHttpClientGenerator();
    }
    if (proxyStrategy == null) {
        proxyStrategy = ProxyStrategy.NONE;
    }
    if (proxyPlanner == null && proxyStrategy == ProxyStrategy.CUSTOM) {
        throw new IllegalStateException("proxyPlanner must exist if proxyStrategy is custom");
    }
    CrawlerSessionPool sessionPool = new CrawlerSessionPool(
            vsCrawlerContext,
            crawlerHttpClientGenerator,
            proxyStrategy,
            ipPool,
            proxyPlanner,
            sessionPoolMaxSize,
            sessionPoolCoreSize,
            sessionPoolInitialSize,
            sessionPoolReuseDuration,
            sessionPoolMaxOnlineDuration,
            autoCreateSession);

    // ---- seed management ----
    if (initSeedSource == null) {
        initSeedSource = new LocalFileSeedSource();
    }
    if (seedKeyResolver == null) {
        seedKeyResolver = new DefaultSeedKeyResolver();
    }
    if (segmentResolver == null) {
        segmentResolver = new DefaultSegmentResolver();
    }
    BerkeleyDBSeedManager seedManager = new BerkeleyDBSeedManager(
            vsCrawlerContext, initSeedSource, seedKeyResolver, segmentResolver, seedManagerCacheSize);

    // ---- processor: either one explicit processor, or a router-backed one, never both ----
    if (seedRouters.isEmpty()) {
        if (processor == null) {
            processor = new PageDownLoadProcessor();
        }
    } else {
        if (processor != null) {
            throw new IllegalStateException(" seedProcessor and routeProcessor conflict");
        }
        RouteProcessor routerBackedProcessor = new RouteProcessor();
        routerBackedProcessor.addRouters(seedRouters);
        processor = routerBackedProcessor;
    }

    if (pipelineList.isEmpty()) {
        pipelineList.add(ConsolePipeline.instance);
    }

    // Declared final so the anonymous observers below can capture it directly.
    final VSCrawler crawler = new VSCrawler(
            vsCrawlerContext,
            sessionPool,
            seedManager,
            processor,
            pipelineList,
            workerThreadNumber,
            slowStart,
            slowStartDuration);

    // ---- resource management ----
    if (loginOnSessionCreate && userResourceFacade == null) {
        userResourceFacade = new DefaultUserResource();
    }
    if (resourceManager == null) {
        resourceManager = ResourceManagerFactory.create().build();
    }
    vsCrawlerContext.setResourceManager(resourceManager);
    if (queueStorePlanner == null) {
        queueStorePlanner = new RamQueueStorePlanner();
    }
    vsCrawlerContext.setQueueStorePlanner(queueStorePlanner);
    if (defaultResourceSetting == null) {
        defaultResourceSetting = ResourceSetting.create().setLock(true);
    }
    vsCrawlerContext.setResourceSetting(defaultResourceSetting);

    // ---- login support: only when a user resource facade is present ----
    if (userResourceFacade != null) {
        if (loginHandler == null) {
            throw new IllegalStateException("login handler is null ,but open login switch");
        }
        ResourceQueue userQueue = resourceManager.getResourceQueue(vsCrawlerContext.makeUserResourceTag());
        if (userQueue == null) {
            // No queue under the user tag yet: register a fresh one backed by the user facade.
            resourceManager.registry(new ResourceQueue(
                    vsCrawlerContext.makeUserResourceTag(),
                    queueStorePlanner,
                    defaultResourceSetting,
                    new UserManager2ResourceLoader(userResourceFacade)));
        } else {
            // A queue already exists: just attach an additional loader to it.
            userQueue.addResourceLoader(new UserManager2ResourceLoader(userResourceFacade));
        }
        addEventObserver(new AutoLoginPlugin(loginHandler, new UserManager2(resourceManager, vsCrawlerContext)));
    }

    // ---- automatic shutdown once the crawler has run dry ----
    if (stopWhileTaskEmptyDuration > 0) {
        addEventObserver(new ShutDownChecker() {

            @Override
            public void checkShutDown(VSCrawlerContext eventContext) {
                // Stop only when no worker is active and the last activity is more than 10s old.
                if (crawler.activeWorker() != 0) {
                    return;
                }
                long idleMillis = System.currentTimeMillis() - crawler.getLastActiveTime();
                if (idleMillis <= 10000) {
                    return;
                }
                log.info((stopWhileTaskEmptyDuration / 1000) + "秒没收到爬虫任务,自动爬虫关闭器,尝试停止爬虫");
                crawler.stopCrawler();
            }
        });
        addEventObserver(new SeedEmptyEvent() {

            @Override
            public void onSeedEmpty(VSCrawlerContext eventContext) {
                // Schedule a delayed shutdown check through the crawler's event registry.
                crawler.getVsCrawlerContext()
                        .getAutoEventRegistry()
                        .createDelayEventSender(ShutDownChecker.class, stopWhileTaskEmptyDuration)
                        .delegate()
                        .checkShutDown(vsCrawlerContext);
            }
        });
    }

    // ---- wire observers into the crawler's start-up sequence ----
    if (eventObservers.size() > 0) {
        crawler.addCrawlerStartCallBack(new VSCrawler.CrawlerStartCallBack() {

            @Override
            public void onCrawlerStart(VSCrawler startedCrawler) {
                AutoEventRegistry registry = startedCrawler.getVsCrawlerContext().getAutoEventRegistry();
                for (Object eventObserver : eventObservers) {
                    registry.registerObserver(eventObserver);
                }
            }
        });
        // Observers that are themselves start callbacks are also hooked in directly.
        for (Object observer : eventObservers) {
            if (!(observer instanceof VSCrawler.CrawlerStartCallBack)) {
                continue;
            }
            crawler.addCrawlerStartCallBack((VSCrawler.CrawlerStartCallBack) observer);
        }
    }
    return crawler;
}
Also used : ShutDownChecker(com.virjar.vscrawler.core.event.systemevent.ShutDownChecker) PageDownLoadProcessor(com.virjar.vscrawler.core.processor.PageDownLoadProcessor) RamQueueStorePlanner(com.virjar.vscrawler.core.resourcemanager.storage.ram.RamQueueStorePlanner) SeedEmptyEvent(com.virjar.vscrawler.core.event.systemevent.SeedEmptyEvent) DefaultHttpClientGenerator(com.virjar.vscrawler.core.net.DefaultHttpClientGenerator) CrawlerSessionPool(com.virjar.vscrawler.core.net.session.CrawlerSessionPool) RouteProcessor(com.virjar.vscrawler.core.processor.RouteProcessor) BindRouteProcessor(com.virjar.vscrawler.core.processor.BindRouteProcessor) ResourceQueue(com.virjar.vscrawler.core.resourcemanager.ResourceQueue) AutoEventRegistry(com.virjar.vscrawler.core.event.support.AutoEventRegistry)

Example 2 with ResourceQueue

use of com.virjar.vscrawler.core.resourcemanager.ResourceQueue in project vscrawler by virjar.

the class ResourceManagerTest method main.

/**
 * Manual exercise of a lock-enabled, RAM-backed resource queue.
 * <p>
 * Registers a queue under {@code tag} whose loader supplies 100 items keyed
 * {@code key_0 .. key_99}, then performs one million allocations. Each successfully
 * allocated item is handed to {@code feedBackRandom}; the queue status is printed every
 * 100 iterations and once more at the end.
 *
 * @param args unused
 */
public static void main(String[] args) {
    ResourceLoader fixedItemLoader = new ResourceLoader() {

        @Override
        public boolean loadResource(Collection<ResourceItem> resourceItems) {
            for (int i = 0; i < 100; i++) {
                ResourceItem resourceItem = new ResourceItem();
                resourceItem.setKey("key_" + i);
                // The payload is simply the key itself.
                resourceItem.setData(resourceItem.getKey());
                resourceItems.add(resourceItem);
            }
            // NOTE(review): false presumably tells the queue there is nothing more to load;
            // confirm against the ResourceLoader contract.
            return false;
        }
    };
    ResourceQueue resourceQueue = new ResourceQueue(
            tag,
            new RamQueueStorePlanner(),
            ResourceSetting.create().setLock(true).setLockForceLeaseDuration(100),
            fixedItemLoader);
    ResourceManager resourceManager = ResourceManagerFactory.create().registryResourceQueue(resourceQueue).build();

    for (int i = 0; i < 1000000; i++) {
        ResourceItem resourceItem = resourceManager.allocate(tag);
        // allocate may return null; only allocated items get feedback.
        if (resourceItem != null) {
            feedBackRandom(tag, resourceManager, resourceItem);
            // feedBackAlways(tag, resourceManager, resourceItem);
        }
        if (i % 100 == 0) {
            printQueueStatus(resourceManager, tag);
        }
    }
    printQueueStatus(resourceManager, tag);
}
Also used : ResourceLoader(com.virjar.vscrawler.core.resourcemanager.service.ResourceLoader) RamQueueStorePlanner(com.virjar.vscrawler.core.resourcemanager.storage.ram.RamQueueStorePlanner) ResourceManager(com.virjar.vscrawler.core.resourcemanager.ResourceManager) ResourceItem(com.virjar.vscrawler.core.resourcemanager.model.ResourceItem) ResourceQueue(com.virjar.vscrawler.core.resourcemanager.ResourceQueue)

Example 3 with ResourceQueue

use of com.virjar.vscrawler.core.resourcemanager.ResourceQueue in project vscrawler by virjar.

the class ImeiTest method main.

/**
 * Manual exercise of a resource queue fed from the classpath resource {@code /imei.txt}.
 * <p>
 * The loader reads the file in batches of at most {@code BATCH_SIZE} lines per call; each
 * line becomes one item whose key is the trimmed text before the first comma and whose
 * data is the whole line. The main loop then allocates 1000 times: on iterations 0-49 an
 * allocated item is fed back with {@code true}, on 50-99 with {@code false}, on 100-149 it
 * is forbidden, and afterwards it is left untouched. Each item's data (or {@code "none"}
 * when nothing was allocated) is printed, followed by the totals.
 *
 * @param args unused
 */
public static void main(String[] args) {
    final String imeiTag = "android_imei";
    ResourceQueue resourceQueue = new ResourceQueue(imeiTag, new RamQueueStorePlanner(), ResourceSetting.create().setLock(true), new ResourceLoader() {

        private static final int BATCH_SIZE = 100;

        private BufferedReader reader = new BufferedReader(new InputStreamReader(ImeiTest.class.getResourceAsStream("/imei.txt")));

        // Set once the reader has been exhausted or has failed; later calls load nothing.
        private boolean closed = false;

        @Override
        public boolean loadResource(Collection<ResourceItem> resourceItems) {
            if (closed) {
                return false;
            }
            try {
                String line;
                int readSize = 0;
                while ((line = reader.readLine()) != null) {
                    ResourceItem resourceItem = new ResourceItem();
                    resourceItem.setData(line);
                    resourceItem.setKey(line.split(",")[0].trim());
                    resourceItems.add(resourceItem);
                    readSize++;
                    // ">=" so that a batch holds exactly BATCH_SIZE lines (">" let 101 through).
                    if (readSize >= BATCH_SIZE) {
                        return true;
                    }
                }
            } catch (IOException ioe) {
                ioe.printStackTrace();
            }
            // Reached on end-of-file and on read failure alike: release the reader and
            // remember that there is nothing further to load.
            IOUtils.closeQuietly(reader);
            closed = true;
            return false;
        }
    });
    ResourceManager resourceManager = ResourceManagerFactory.create().registryResourceQueue(resourceQueue).build();
    int allocatedTimes = 0;
    int notAllocatedTimes = 0;
    for (int i = 0; i < 1000; i++) {
        ResourceItem resourceItem = resourceManager.allocate(imeiTag);
        if (resourceItem == null) {
            notAllocatedTimes++;
            System.out.println("none");
            continue;
        }
        // The kind of feedback depends on the iteration index, not on how many items were allocated.
        if (i < 50) {
            resourceManager.feedBack(imeiTag, resourceItem.getKey(), true);
        } else if (i < 100) {
            resourceManager.feedBack(imeiTag, resourceItem.getKey(), false);
        } else if (i < 150) {
            resourceManager.forbidden(imeiTag, resourceItem.getKey());
        }
        allocatedTimes++;
        System.out.println(resourceItem.getData());
    }
    System.out.println("allocatedTimes: " + allocatedTimes + "  notAllocatedTimes: " + notAllocatedTimes);
}
Also used : ResourceLoader(com.virjar.vscrawler.core.resourcemanager.service.ResourceLoader) InputStreamReader(java.io.InputStreamReader) IOException(java.io.IOException) ResourceManager(com.virjar.vscrawler.core.resourcemanager.ResourceManager) RamQueueStorePlanner(com.virjar.vscrawler.core.resourcemanager.storage.ram.RamQueueStorePlanner) BufferedReader(java.io.BufferedReader) ResourceItem(com.virjar.vscrawler.core.resourcemanager.model.ResourceItem) ResourceQueue(com.virjar.vscrawler.core.resourcemanager.ResourceQueue)

Aggregations

ResourceQueue (com.virjar.vscrawler.core.resourcemanager.ResourceQueue)3 RamQueueStorePlanner (com.virjar.vscrawler.core.resourcemanager.storage.ram.RamQueueStorePlanner)3 ResourceManager (com.virjar.vscrawler.core.resourcemanager.ResourceManager)2 ResourceItem (com.virjar.vscrawler.core.resourcemanager.model.ResourceItem)2 ResourceLoader (com.virjar.vscrawler.core.resourcemanager.service.ResourceLoader)2 AutoEventRegistry (com.virjar.vscrawler.core.event.support.AutoEventRegistry)1 SeedEmptyEvent (com.virjar.vscrawler.core.event.systemevent.SeedEmptyEvent)1 ShutDownChecker (com.virjar.vscrawler.core.event.systemevent.ShutDownChecker)1 DefaultHttpClientGenerator (com.virjar.vscrawler.core.net.DefaultHttpClientGenerator)1 CrawlerSessionPool (com.virjar.vscrawler.core.net.session.CrawlerSessionPool)1 BindRouteProcessor (com.virjar.vscrawler.core.processor.BindRouteProcessor)1 PageDownLoadProcessor (com.virjar.vscrawler.core.processor.PageDownLoadProcessor)1 RouteProcessor (com.virjar.vscrawler.core.processor.RouteProcessor)1 BufferedReader (java.io.BufferedReader)1 IOException (java.io.IOException)1 InputStreamReader (java.io.InputStreamReader)1