Use of com.virjar.vscrawler.core.resourcemanager.storage.ram.RamQueueStorePlanner in project vscrawler by virjar.
From the class VSCrawlerBuilder, method build.
/**
 * Assembles a {@link VSCrawler} from the options collected on this builder.
 *
 * <p>Every option that was left unset is replaced by a default, and the default is written
 * back to the corresponding builder field. All configuration checks run first, before the
 * crawler context, session pool, seed manager or crawler are constructed, so an invalid
 * configuration is rejected without creating any of them and without mutating the builder
 * (other than defaulting {@code proxyStrategy}).
 *
 * @return the configured crawler
 * @throws IllegalStateException if {@code proxyStrategy} is {@code CUSTOM} but no
 *         {@code proxyPlanner} was supplied; if both a processor and seed routers were
 *         supplied; or if login is enabled (via {@code loginOnSessionCreate} or an explicit
 *         {@code userResourceFacade}) but no {@code loginHandler} was supplied
 */
public VSCrawler build() {
    // The proxy check below needs the defaulted strategy.
    if (proxyStrategy == null) {
        proxyStrategy = ProxyStrategy.NONE;
    }

    // ---- Fail fast: validate the whole configuration before constructing anything. ----
    if (proxyStrategy == ProxyStrategy.CUSTOM && proxyPlanner == null) {
        throw new IllegalStateException("proxyPlanner must exist if proxyStrategy is custom");
    }
    if (processor != null && !seedRouters.isEmpty()) {
        throw new IllegalStateException(" seedProcessor and routeProcessor conflict");
    }
    // Login is active when explicitly switched on, or when a user resource facade was given.
    boolean loginEnabled = loginOnSessionCreate || userResourceFacade != null;
    if (loginEnabled && loginHandler == null) {
        throw new IllegalStateException("login handler is null ,but open login switch");
    }

    // ---- Context and session pool ----
    final VSCrawlerContext vsCrawlerContext = VSCrawlerContext.create(crawlerName);
    if (crawlerHttpClientGenerator == null) {
        crawlerHttpClientGenerator = new DefaultHttpClientGenerator();
    }
    CrawlerSessionPool crawlerSessionPool = new CrawlerSessionPool(vsCrawlerContext, crawlerHttpClientGenerator, proxyStrategy, ipPool, proxyPlanner, sessionPoolMaxSize, sessionPoolCoreSize, sessionPoolInitialSize, sessionPoolReuseDuration, sessionPoolMaxOnlineDuration, autoCreateSession);

    // ---- Seed manager ----
    if (initSeedSource == null) {
        initSeedSource = new LocalFileSeedSource();
    }
    if (seedKeyResolver == null) {
        seedKeyResolver = new DefaultSeedKeyResolver();
    }
    if (segmentResolver == null) {
        segmentResolver = new DefaultSegmentResolver();
    }
    BerkeleyDBSeedManager berkeleyDBSeedManager = new BerkeleyDBSeedManager(vsCrawlerContext, initSeedSource, seedKeyResolver, segmentResolver, seedManagerCacheSize);

    // ---- Processor: routers win when present (a conflicting explicit processor was rejected above). ----
    if (!seedRouters.isEmpty()) {
        RouteProcessor routeProcessor = new RouteProcessor();
        routeProcessor.addRouters(seedRouters);
        processor = routeProcessor;
    } else if (processor == null) {
        processor = new PageDownLoadProcessor();
    }

    if (pipelineList.isEmpty()) {
        pipelineList.add(ConsolePipeline.instance);
    }

    final VSCrawler vsCrawler = new VSCrawler(vsCrawlerContext, crawlerSessionPool, berkeleyDBSeedManager, processor, pipelineList, workerThreadNumber, slowStart, slowStartDuration);

    // ---- Resource management defaults, published on the context ----
    if (loginOnSessionCreate && userResourceFacade == null) {
        userResourceFacade = new DefaultUserResource();
    }
    if (resourceManager == null) {
        resourceManager = ResourceManagerFactory.create().build();
    }
    vsCrawlerContext.setResourceManager(resourceManager);
    if (queueStorePlanner == null) {
        queueStorePlanner = new RamQueueStorePlanner();
    }
    vsCrawlerContext.setQueueStorePlanner(queueStorePlanner);
    if (defaultResourceSetting == null) {
        defaultResourceSetting = ResourceSetting.create().setLock(true);
    }
    vsCrawlerContext.setResourceSetting(defaultResourceSetting);

    // ---- Auto login: feed the user resource queue and register the login plugin ----
    if (userResourceFacade != null) {
        UserManager2ResourceLoader userLoader = new UserManager2ResourceLoader(userResourceFacade);
        ResourceQueue resourceQueue = resourceManager.getResourceQueue(vsCrawlerContext.makeUserResourceTag());
        if (resourceQueue != null) {
            // A queue for the user tag already exists: just attach the loader to it.
            resourceQueue.addResourceLoader(userLoader);
        } else {
            resourceManager.registry(new ResourceQueue(vsCrawlerContext.makeUserResourceTag(), queueStorePlanner, defaultResourceSetting, userLoader));
        }
        addEventObserver(new AutoLoginPlugin(loginHandler, new UserManager2(resourceManager, vsCrawlerContext)));
    }

    // ---- Auto shutdown when the crawler stays idle ----
    if (stopWhileTaskEmptyDuration > 0) {
        addEventObserver(new ShutDownChecker() {
            @Override
            public void checkShutDown(VSCrawlerContext vsCrawlerContext1) {
                // Runs after the delay; if there are no active workers and the last activity was
                // more than 10s ago, no task has been executed for 10 consecutive seconds.
                // NOTE(review): the idle threshold is a fixed 10000 ms while the log message
                // reports stopWhileTaskEmptyDuration - confirm this is intended.
                if (vsCrawler.activeWorker() == 0 && (System.currentTimeMillis() - vsCrawler.getLastActiveTime()) > 10000) {
                    log.info((stopWhileTaskEmptyDuration / 1000) + "秒没收到爬虫任务,自动爬虫关闭器,尝试停止爬虫");
                    vsCrawler.stopCrawler();
                }
            }
        });
        addEventObserver(new SeedEmptyEvent() {
            @Override
            public void onSeedEmpty(VSCrawlerContext vsCrawlerContext1) {
                // Schedule a delayed shutdown check instead of stopping immediately.
                vsCrawler.getVsCrawlerContext().getAutoEventRegistry().createDelayEventSender(ShutDownChecker.class, stopWhileTaskEmptyDuration).delegate().checkShutDown(vsCrawlerContext);
            }
        });
    }

    // ---- Observers are registered with the event registry once the crawler starts ----
    if (!eventObservers.isEmpty()) {
        vsCrawler.addCrawlerStartCallBack(new VSCrawler.CrawlerStartCallBack() {
            @Override
            public void onCrawlerStart(VSCrawler vsCrawler) {
                AutoEventRegistry autoEventRegistry = vsCrawler.getVsCrawlerContext().getAutoEventRegistry();
                for (Object eventObserver : eventObservers) {
                    autoEventRegistry.registerObserver(eventObserver);
                }
            }
        });
        // Observers that are themselves start callbacks are additionally hooked in directly.
        for (Object observer : eventObservers) {
            if (observer instanceof VSCrawler.CrawlerStartCallBack) {
                vsCrawler.addCrawlerStartCallBack((VSCrawler.CrawlerStartCallBack) observer);
            }
        }
    }
    return vsCrawler;
}
Use of com.virjar.vscrawler.core.resourcemanager.storage.ram.RamQueueStorePlanner in project vscrawler by virjar.
From the class ResourceManagerTest, method main.
/**
 * Manual driver for a RAM-backed {@link ResourceQueue}: registers a queue whose loader
 * supplies 100 items keyed {@code key_0} .. {@code key_99}, then performs one million
 * allocate/feedback rounds, printing the queue status every 100 rounds and once at the end.
 *
 * @param args unused
 */
public static void main(String[] args) {
    ResourceLoader seedLoader = new ResourceLoader() {
        @Override
        public boolean loadResource(Collection<ResourceItem> resourceItems) {
            for (int i = 0; i < 100; i++) {
                ResourceItem resourceItem = new ResourceItem();
                resourceItem.setKey("key_" + i);
                resourceItem.setData(resourceItem.getKey());
                resourceItems.add(resourceItem);
            }
            // NOTE(review): presumably false tells the queue there is nothing more to load;
            // confirm against the ResourceLoader contract.
            return false;
        }
    };
    ResourceQueue resourceQueue = new ResourceQueue(tag, new RamQueueStorePlanner(), ResourceSetting.create().setLock(true).setLockForceLeaseDuration(100), seedLoader);
    ResourceManager resourceManager = ResourceManagerFactory.create().registryResourceQueue(resourceQueue).build();

    for (int i = 0; i < 1000000; i++) {
        ResourceItem resourceItem = resourceManager.allocate(tag);
        if (resourceItem != null) {
            feedBackRandom(tag, resourceManager, resourceItem);
            // Alternative feedback strategy:
            // feedBackAlways(tag, resourceManager, resourceItem);
        }
        if (i % 100 == 0) {
            printQueueStatus(resourceManager, tag);
        }
    }
    printQueueStatus(resourceManager, tag);
}
Use of com.virjar.vscrawler.core.resourcemanager.storage.ram.RamQueueStorePlanner in project vscrawler by virjar.
From the class ImeiTest, method main.
/**
 * Manual driver that feeds a RAM-backed {@link ResourceQueue} from the classpath file
 * {@code /imei.txt} and then performs 1000 allocations against it.
 *
 * <p>Each line of the file becomes one {@link ResourceItem}: the whole line is the data and
 * the text before the first comma (trimmed) is the key. Lines are loaded in batches of at
 * most 100. Of the 1000 allocation rounds, successful allocations in rounds 0-49 are fed
 * back as success, 50-99 as failure, 100-149 are forbidden, and the rest get no feedback.
 * Allocation counts are printed at the end.
 *
 * @param args unused
 * @throws IllegalStateException if {@code /imei.txt} is not on the classpath
 */
public static void main(String[] args) {
    final String imeiTag = "android_imei";

    // getResourceAsStream returns null for a missing resource; fail with a clear message
    // instead of a NullPointerException from the reader constructor.
    final java.io.InputStream imeiStream = ImeiTest.class.getResourceAsStream("/imei.txt");
    if (imeiStream == null) {
        throw new IllegalStateException("classpath resource /imei.txt not found");
    }

    ResourceQueue resourceQueue = new ResourceQueue(imeiTag, new RamQueueStorePlanner(), ResourceSetting.create().setLock(true), new ResourceLoader() {
        private static final int batchSize = 100;
        // NOTE(review): decodes with the platform default charset, as before - confirm the
        // file's encoding if it can contain non-ASCII text.
        private final BufferedReader reader = new BufferedReader(new InputStreamReader(imeiStream));
        private boolean closed = false;

        /**
         * Appends up to {@code batchSize} items read from the file.
         *
         * @return {@code true} when a full batch was delivered (more lines may remain),
         *         {@code false} once the file is exhausted, unreadable, or already closed
         */
        @Override
        public boolean loadResource(Collection<ResourceItem> resourceItems) {
            if (closed) {
                return false;
            }
            int readSize = 0;
            try {
                String line;
                while ((line = reader.readLine()) != null) {
                    ResourceItem resourceItem = new ResourceItem();
                    resourceItem.setData(line);
                    resourceItem.setKey(keyOf(line));
                    resourceItems.add(resourceItem);
                    readSize++;
                    // ">=" so that a batch holds exactly batchSize items, not batchSize + 1.
                    if (readSize >= batchSize) {
                        return true;
                    }
                }
            } catch (IOException ioe) {
                ioe.printStackTrace();
            }
            // End of file or read failure: release the reader and stop serving batches.
            closeReader();
            return false;
        }

        /**
         * Returns the trimmed text before the first comma, or the whole trimmed line when
         * there is no comma. Unlike {@code split(",")[0]}, this cannot fail on a line that
         * consists only of commas.
         */
        private String keyOf(String line) {
            int comma = line.indexOf(',');
            String head = comma < 0 ? line : line.substring(0, comma);
            return head.trim();
        }

        /** Closes the reader and marks this loader as finished. */
        private void closeReader() {
            IOUtils.closeQuietly(reader);
            closed = true;
        }
    });

    ResourceManager resourceManager = ResourceManagerFactory.create().registryResourceQueue(resourceQueue).build();

    int allocatedTimes = 0;
    int notAllocatedTimes = 0;
    for (int i = 0; i < 1000; i++) {
        ResourceItem resourceItem = resourceManager.allocate(imeiTag);
        if (resourceItem == null) {
            notAllocatedTimes++;
            System.out.println("none");
            continue;
        }
        // Exercise each feedback path on a different slice of the rounds.
        if (i < 50) {
            resourceManager.feedBack(imeiTag, resourceItem.getKey(), true);
        } else if (i < 100) {
            resourceManager.feedBack(imeiTag, resourceItem.getKey(), false);
        } else if (i < 150) {
            resourceManager.forbidden(imeiTag, resourceItem.getKey());
        }
        allocatedTimes++;
        System.out.println(resourceItem.getData());
    }
    System.out.println("allocatedTimes: " + allocatedTimes + " notAllocatedTimes: " + notAllocatedTimes);
}
Aggregations