Use of edu.uci.ics.crawler4j.crawler.CrawlController in project Zpider by zeroized.
Class CrawlerController, method setup:
// @RequestMapping(method = RequestMethod.POST, value = "/start")
// public String setup(@RequestParam CrawlerOptions crawlerOptions,
//                     @RequestParam String[] seeds) throws Exception {
//     CrawlControllerOptions options = CrawlControllerFactory.defaultOptions();
//     CrawlController crawlController = crawlControllerFactory.newController(options);
//     CrawlerFactory crawlerFactory = new CrawlerFactory(crawlerOptions, mongoRepo);
//     for (String seed : seeds) {
//         crawlController.addSeed(seed);
//     }
//     crawlController.startNonBlocking(crawlerFactory, options.getWorkers());
//     return "";
// }
//
// @RequestMapping(method = RequestMethod.POST, value = "/start")
// public String setup(@RequestParam List<String> seeds,
//                     @RequestParam List<String> allowDomains,
//                     @RequestParam List<String> crawlUrlPrefixes,
//                     @RequestParam List<Column> columns) throws Exception {
//     System.out.println("/crawl/start visited");
//     System.out.println(seeds);
//     System.out.println(allowDomains);
//     System.out.println(crawlUrlPrefixes);
//     System.out.println(columns);
//     CrawlControllerOptions options = CrawlControllerOptions.defaultOptions();
//     CrawlController crawlController = crawlControllerFactory.newController(options);
//     CrawlerOptions crawlerOptions = new CrawlerOptions(allowDomains, crawlUrlPrefixes, columns);
//     CrawlerFactory crawlerFactory = new CrawlerFactory(crawlerOptions, mongoRepo);
//     for (String seed : seeds) {
//         crawlController.addSeed(seed);
//     }
//     crawlController.startNonBlocking(crawlerFactory, options.getWorkers());
//     return "";
// }
@RequestMapping(method = RequestMethod.POST, value = "/start")
public String setup(@RequestBody CrawlRequest crawlRequest) throws Exception {
    System.out.println("/crawl/start visited");
    List<String> seeds = crawlRequest.getSeeds();
    // Normalize domains and URL prefixes: prepend "http://" when no scheme is given
    List<String> allowDomains = crawlRequest.getAllowDomains().stream()
            .map(x -> x.startsWith("http://") ? x : "http://" + x)
            .collect(Collectors.toList());
    List<String> crawlUrlPrefixes = crawlRequest.getCrawlUrlPrefixes().stream()
            .map(x -> x.startsWith("http://") ? x : "http://" + x)
            .collect(Collectors.toList());
    List<Column> columns = crawlRequest.getColumns();
    CrawlControllerOptions options = CrawlControllerOptions.defaultOptions();
    options.setWorkers(crawlRequest.getAdvancedOpt().getWorkers());
    options.setDelay(crawlRequest.getAdvancedOpt().getPoliteWait());
    options.setDepth(crawlRequest.getAdvancedOpt().getMaxDepth());
    options.setPage(crawlRequest.getAdvancedOpt().getMaxPage());
    options.setDir(crawlRequest.getName() + "\\");
    CrawlController crawlController = crawlControllerFactory.newController(options);
    // Buffer crawled documents and flush them to Elasticsearch as bulk index requests,
    // either every 60 seconds or every 20 items, whichever comes first
    PublishSubject<Map<String, ?>> crawlSubject = PublishSubject.create();
    crawlSubject.buffer(60, TimeUnit.SECONDS, Schedulers.computation(), 20,
            () -> Collections.synchronizedList(new LinkedList<>()), true)
            .subscribe(elasticRepo::generateBulkIndex);
    CrawlerOptions crawlerOptions = new CrawlerOptions(allowDomains, crawlUrlPrefixes, columns);
    System.out.println(crawlerOptions.toString());
    CrawlerFactory crawlerFactory = new CrawlerFactory(crawlerOptions, crawlSubject);
    for (String seed : seeds) {
        crawlController.addSeed(seed);
    }
    crawlController.startNonBlocking(crawlerFactory, options.getWorkers());
    return "";
}
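The endpoint above binds the whole request body to a CrawlRequest object, which is not shown on this page. The following is a hypothetical sketch inferred only from the accessors the controller calls (getName, getSeeds, getAllowDomains, getCrawlUrlPrefixes, getColumns, getAdvancedOpt); the field names, types and the nested AdvancedOpt shape are assumptions, not the actual Zpider classes. Column is the project's own extraction-rule type referenced above.

import java.util.List;

// Hypothetical request payload; setters (or Jackson field access) omitted for brevity.
public class CrawlRequest {
    private String name;                   // used as the crawl storage sub-directory name
    private List<String> seeds;            // start URLs
    private List<String> allowDomains;     // domains the crawler may stay within
    private List<String> crawlUrlPrefixes; // URL prefixes that are followed
    private List<Column> columns;          // extraction rules for the result documents
    private AdvancedOpt advancedOpt;       // worker count, politeness delay, depth, page limit

    public String getName() { return name; }
    public List<String> getSeeds() { return seeds; }
    public List<String> getAllowDomains() { return allowDomains; }
    public List<String> getCrawlUrlPrefixes() { return crawlUrlPrefixes; }
    public List<Column> getColumns() { return columns; }
    public AdvancedOpt getAdvancedOpt() { return advancedOpt; }

    public static class AdvancedOpt {
        private int workers;
        private int politeWait;
        private int maxDepth;
        private int maxPage;

        public int getWorkers() { return workers; }
        public int getPoliteWait() { return politeWait; }
        public int getMaxDepth() { return maxDepth; }
        public int getMaxPage() { return maxPage; }
    }
}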
Use of edu.uci.ics.crawler4j.crawler.CrawlController in project Zpider by zeroized.
Class CrawlControllerFactory, method newController:
public CrawlController newController(CrawlControllerOptions options) throws Exception {
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(baseDir + options.getDir());
    config.setMaxDepthOfCrawling(options.getDepth());
    config.setPolitenessDelay(options.getDelay());
    config.setResumableCrawling(options.isResumeable());
    config.setDefaultHeaders(options.getHeaders());
    // A page value of -1 means "no page limit": keep crawler4j's default (unlimited)
    if (options.getPage() != -1) {
        config.setMaxPagesToFetch(options.getPage());
    }
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    return new CrawlController(config, pageFetcher, robotstxtServer);
}
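CrawlControllerOptions is likewise a Zpider-specific holder that does not appear on this page. The sketch below reconstructs it from the accessors used in the two snippets above (defaultOptions, getDir/setDir, getWorkers/setWorkers, getDelay/setDelay, getDepth/setDepth, getPage/setPage, isResumeable, getHeaders); the field types and default values are assumptions, not the project's real definitions.

import java.util.ArrayList;
import java.util.Collection;

import org.apache.http.message.BasicHeader;

// Hypothetical sketch; not the actual Zpider class.
public class CrawlControllerOptions {

    private String dir = "default\\";   // storage sub-directory, appended to baseDir
    private int workers = 4;            // number of crawler threads (assumed default)
    private int delay = 200;            // politeness delay in ms (assumed default)
    private int depth = -1;             // -1: unlimited crawl depth
    private int page = -1;              // -1: no page limit (see newController above)
    private boolean resumeable = false;
    private Collection<BasicHeader> headers = new ArrayList<>(); // passed to CrawlConfig.setDefaultHeaders

    public static CrawlControllerOptions defaultOptions() {
        return new CrawlControllerOptions();
    }

    public String getDir() { return dir; }
    public void setDir(String dir) { this.dir = dir; }
    public int getWorkers() { return workers; }
    public void setWorkers(int workers) { this.workers = workers; }
    public int getDelay() { return delay; }
    public void setDelay(int delay) { this.delay = delay; }
    public int getDepth() { return depth; }
    public void setDepth(int depth) { this.depth = depth; }
    public int getPage() { return page; }
    public void setPage(int page) { this.page = page; }
    public boolean isResumeable() { return resumeable; }
    public Collection<BasicHeader> getHeaders() { return headers; }
}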
Use of edu.uci.ics.crawler4j.crawler.CrawlController in project yyl_example by Relucent.
Class MyControllerTest, method main:
public static void main(String[] args) throws Exception {
    // Crawler configuration
    CrawlConfig config = new CrawlConfig();
    // Maximum crawl depth
    config.setMaxDepthOfCrawling(5);
    // Maximum number of pages to fetch
    config.setMaxPagesToFetch(Integer.MAX_VALUE);
    // Wait 200 ms before each request (politeness delay)
    config.setPolitenessDelay(200);
    // Folder where intermediate crawl data is stored
    config.setCrawlStorageFolder(System.getProperty("user.dir") + "/temp/crawl");
    // Proxy settings (optional)
    // config.setProxyHost("proxyserver.example.com");
    // config.setProxyPort(8080);
    // config.setProxyUsername(username);
    // config.setProxyPassword(password);
    // Resume a stopped or crashed crawl (optional)
    // config.setResumableCrawling(true);
    // Instantiate the controller for this crawl
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    // Seed URLs
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/");
    // Number of concurrent crawler threads
    int numberOfCrawlers = 7;
    // Start crawling (blocking operation)
    controller.start(MyCrawler.class, numberOfCrawlers);
}
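MyCrawler itself is not shown in this snippet. Below is a minimal sketch of what such a crawler could look like using crawler4j's standard WebCrawler extension points (shouldVisit and visit); the ics.uci.edu filter and the visit body are illustrative assumptions, not the yyl_example project's actual implementation.

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    // Illustrative filter: only follow links that stay on the ics.uci.edu site
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        return url.getURL().toLowerCase().startsWith("http://www.ics.uci.edu/");
    }

    // Called for every successfully fetched page
    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            System.out.println(url + " -> " + htmlParseData.getOutgoingUrls().size() + " outgoing links");
        }
    }
}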
Use of edu.uci.ics.crawler4j.crawler.CrawlController in project crawler4j by yasserg.
Class MultipleCrawlerController, method main:
public static void main(String[] args) throws Exception {
    // The folder where intermediate crawl data is stored (e.g. list of urls that are extracted from previously
    // fetched pages and need to be crawled later).
    String crawlStorageFolder = "/tmp/crawler4j/";
    CrawlConfig config1 = new CrawlConfig();
    CrawlConfig config2 = new CrawlConfig();
    // The two crawlers should have different storage folders for their intermediate data.
    config1.setCrawlStorageFolder(crawlStorageFolder + "/crawler1");
    config2.setCrawlStorageFolder(crawlStorageFolder + "/crawler2");
    config1.setPolitenessDelay(1000);
    config2.setPolitenessDelay(2000);
    config1.setMaxPagesToFetch(50);
    config2.setMaxPagesToFetch(100);
    // We will use different PageFetchers for the two crawlers.
    PageFetcher pageFetcher1 = new PageFetcher(config1);
    PageFetcher pageFetcher2 = new PageFetcher(config2);
    // We will use the same RobotstxtServer for both of the crawlers.
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher1);
    CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
    CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);
    List<String> crawler1Domains = ImmutableList.of("https://www.ics.uci.edu/", "https://www.cnn.com/");
    List<String> crawler2Domains = ImmutableList.of("https://en.wikipedia.org/");
    controller1.addSeed("https://www.ics.uci.edu/");
    controller1.addSeed("https://www.cnn.com/");
    controller1.addSeed("https://www.ics.uci.edu/~lopes/");
    controller1.addSeed("https://www.cnn.com/POLITICS/");
    controller2.addSeed("https://en.wikipedia.org/wiki/Main_Page");
    controller2.addSeed("https://en.wikipedia.org/wiki/Obama");
    controller2.addSeed("https://en.wikipedia.org/wiki/Bing");
    CrawlController.WebCrawlerFactory<BasicCrawler> factory1 = () -> new BasicCrawler(crawler1Domains);
    CrawlController.WebCrawlerFactory<BasicCrawler> factory2 = () -> new BasicCrawler(crawler2Domains);
    // The first crawler will have 5 concurrent threads and the second crawler will have 7 threads.
    controller1.startNonBlocking(factory1, 5);
    controller2.startNonBlocking(factory2, 7);
    controller1.waitUntilFinish();
    logger.info("Crawler 1 is finished.");
    controller2.waitUntilFinish();
    logger.info("Crawler 2 is finished.");
}
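The two lambdas above are CrawlController.WebCrawlerFactory instances, which let each controller construct its BasicCrawler instances with that controller's own domain list. A minimal sketch of a crawler written for that pattern follows; the constructor and the prefix-based shouldVisit check are assumptions modeled on the usage above, not necessarily the crawler4j example's actual BasicCrawler.

import java.util.List;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class BasicCrawler extends WebCrawler {

    private final List<String> myDomains;

    // Each controller has its own factory, so every crawler instance is
    // constructed with the domain list of "its" controller.
    public BasicCrawler(List<String> myDomains) {
        this.myDomains = myDomains;
    }

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return myDomains.stream().anyMatch(href::startsWith);
    }

    @Override
    public void visit(Page page) {
        System.out.println("Visited: " + page.getWebURL().getURL());
    }
}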
Use of edu.uci.ics.crawler4j.crawler.CrawlController in project crawler4j by yasserg.
Class ControllerWithShutdown, method main:
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }
    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];
    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    config.setPolitenessDelay(1000);
    // Unlimited number of pages can be crawled.
    config.setMaxPagesToFetch(-1);
    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    /*
     * For each crawl, you need to add some seed urls. These are the first
     * URLs that are fetched and then the crawler starts following links
     * which are found in these pages.
     */
    controller.addSeed("https://www.ics.uci.edu/~welling/");
    controller.addSeed("https://www.ics.uci.edu/~lopes/");
    controller.addSeed("https://www.ics.uci.edu/");
    /*
     * Start the crawl in non-blocking mode, so this thread keeps running
     * and can send a shutdown request later.
     */
    controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);
    // Let the crawl run for 30 seconds
    Thread.sleep(30 * 1000);
    // Send the shutdown request, then wait for the crawler threads to finish
    controller.shutdown();
    controller.waitUntilFinish();
}