Use of com.virjar.vscrawler.core.net.session.CrawlerSessionPool in project vscrawler by virjar.
The build method of the class VSCrawlerBuilder.
/**
 * Assembles a {@link VSCrawler} from the options collected on this builder.
 *
 * <p>Every optional collaborator that is still {@code null} is replaced by a default
 * before it is used: HTTP client generator, proxy strategy, seed source, seed key
 * resolver, segment resolver, processor, pipeline, resource manager, queue store
 * planner and resource setting. The defaults are written back to the builder's own
 * fields, and any observers created here are added through {@code addEventObserver}.
 *
 * @return the newly created crawler
 * @throws IllegalStateException if the proxy strategy is {@code CUSTOM} but no proxy
 *         planner is set, if both a processor and seed routers are configured, or if a
 *         user resource facade is present while no login handler is set
 */
public VSCrawler build() {
    final VSCrawlerContext context = VSCrawlerContext.create(crawlerName);

    // ---- session pool ----
    if (crawlerHttpClientGenerator == null) {
        crawlerHttpClientGenerator = new DefaultHttpClientGenerator();
    }
    if (proxyStrategy == null) {
        proxyStrategy = ProxyStrategy.NONE;
    }
    if (proxyPlanner == null && proxyStrategy == ProxyStrategy.CUSTOM) {
        throw new IllegalStateException("proxyPlanner must exist if proxyStrategy is custom");
    }
    CrawlerSessionPool sessionPool = new CrawlerSessionPool(
            context,
            crawlerHttpClientGenerator,
            proxyStrategy,
            ipPool,
            proxyPlanner,
            sessionPoolMaxSize,
            sessionPoolCoreSize,
            sessionPoolInitialSize,
            sessionPoolReuseDuration,
            sessionPoolMaxOnlineDuration,
            autoCreateSession);

    // ---- seed manager ----
    if (initSeedSource == null) {
        initSeedSource = new LocalFileSeedSource();
    }
    if (seedKeyResolver == null) {
        seedKeyResolver = new DefaultSeedKeyResolver();
    }
    if (segmentResolver == null) {
        segmentResolver = new DefaultSegmentResolver();
    }
    BerkeleyDBSeedManager seedManager = new BerkeleyDBSeedManager(
            context, initSeedSource, seedKeyResolver, segmentResolver, seedManagerCacheSize);

    // ---- processor: either an explicit processor or a router-backed one, never both ----
    if (seedRouters.isEmpty()) {
        if (processor == null) {
            processor = new PageDownLoadProcessor();
        }
    } else {
        if (processor != null) {
            throw new IllegalStateException(" seedProcessor and routeProcessor conflict");
        }
        RouteProcessor routerBackedProcessor = new RouteProcessor();
        routerBackedProcessor.addRouters(seedRouters);
        processor = routerBackedProcessor;
    }

    // ---- pipeline ----
    if (pipelineList.isEmpty()) {
        pipelineList.add(ConsolePipeline.instance);
    }

    // The crawler is created here, before the context receives its resource manager,
    // queue store planner and resource setting below; that ordering is kept as-is.
    final VSCrawler crawler = new VSCrawler(
            context,
            sessionPool,
            seedManager,
            processor,
            pipelineList,
            workerThreadNumber,
            slowStart,
            slowStartDuration);

    // ---- login / user resources ----
    if (loginOnSessionCreate && userResourceFacade == null) {
        userResourceFacade = new DefaultUserResource();
    }

    if (resourceManager == null) {
        resourceManager = ResourceManagerFactory.create().build();
    }
    context.setResourceManager(resourceManager);

    if (queueStorePlanner == null) {
        queueStorePlanner = new RamQueueStorePlanner();
    }
    context.setQueueStorePlanner(queueStorePlanner);

    if (defaultResourceSetting == null) {
        defaultResourceSetting = ResourceSetting.create().setLock(true);
    }
    context.setResourceSetting(defaultResourceSetting);

    if (userResourceFacade != null) {
        if (loginHandler == null) {
            throw new IllegalStateException("login handler is null ,but open login switch");
        }
        // Attach the user loader to the existing queue for the user tag, or register a
        // new queue for that tag when the resource manager has none.
        ResourceQueue userQueue = resourceManager.getResourceQueue(context.makeUserResourceTag());
        if (userQueue == null) {
            resourceManager.registry(new ResourceQueue(
                    context.makeUserResourceTag(),
                    queueStorePlanner,
                    defaultResourceSetting,
                    new UserManager2ResourceLoader(userResourceFacade)));
        } else {
            userQueue.addResourceLoader(new UserManager2ResourceLoader(userResourceFacade));
        }
        addEventObserver(new AutoLoginPlugin(loginHandler, new UserManager2(resourceManager, context)));
    }

    // ---- automatic shutdown when no tasks arrive ----
    if (stopWhileTaskEmptyDuration > 0) {
        addEventObserver(new ShutDownChecker() {
            @Override
            public void checkShutDown(VSCrawlerContext eventContext) {
                // Original note (translated): check the active thread count after 15s; a
                // count of 0 shows that no task has run for 10 consecutive seconds.
                // NOTE(review): the idle threshold is a fixed 10000 ms while the log line
                // reports stopWhileTaskEmptyDuration - confirm whether that is intended.
                if (crawler.activeWorker() != 0) {
                    return;
                }
                if ((System.currentTimeMillis() - crawler.getLastActiveTime()) <= 10000) {
                    return;
                }
                log.info((stopWhileTaskEmptyDuration / 1000) + "秒没收到爬虫任务,自动爬虫关闭器,尝试停止爬虫");
                crawler.stopCrawler();
            }
        });
        addEventObserver(new SeedEmptyEvent() {
            @Override
            public void onSeedEmpty(VSCrawlerContext eventContext) {
                // When seeds run out, send a ShutDownChecker event through a delay sender
                // created with stopWhileTaskEmptyDuration.
                AutoEventRegistry registry = crawler.getVsCrawlerContext().getAutoEventRegistry();
                registry.createDelayEventSender(ShutDownChecker.class, stopWhileTaskEmptyDuration)
                        .delegate()
                        .checkShutDown(context);
            }
        });
    }

    // ---- observer registration ----
    if (!eventObservers.isEmpty()) {
        // Observers are registered with the event registry only once the crawler starts.
        crawler.addCrawlerStartCallBack(new VSCrawler.CrawlerStartCallBack() {
            @Override
            public void onCrawlerStart(VSCrawler startedCrawler) {
                AutoEventRegistry registry = startedCrawler.getVsCrawlerContext().getAutoEventRegistry();
                for (Object observer : eventObservers) {
                    registry.registerObserver(observer);
                }
            }
        });
        // Observers that are themselves start callbacks are also added as such.
        for (Object candidate : eventObservers) {
            if (candidate instanceof VSCrawler.CrawlerStartCallBack) {
                crawler.addCrawlerStartCallBack((VSCrawler.CrawlerStartCallBack) candidate);
            }
        }
    }

    return crawler;
}
Aggregations