Use of com.zeroized.spider.repo.elastic.ElasticRepo in the project Zpider by zeroized.
Class CrawlerController, method setup:
//
//    @RequestMapping(method = RequestMethod.POST, value = "/start")
//    public String setup(@RequestParam CrawlerOptions crawlerOptions,
//                        @RequestParam String[] seeds) throws Exception {
//        CrawlControllerOptions options = CrawlControllerFactory.defaultOptions();
//        CrawlController crawlController = crawlControllerFactory.newController(options);
//        CrawlerFactory crawlerFactory = new CrawlerFactory(crawlerOptions, mongoRepo);
//        for (String seed : seeds) {
//            crawlController.addSeed(seed);
//        }
//        crawlController.startNonBlocking(crawlerFactory, options.getWorkers());
//        return "";
//    }
//
//    @RequestMapping(method = RequestMethod.POST, value = "/start")
//    public String setup(@RequestParam List<String> seeds,
//                        @RequestParam List<String> allowDomains,
//                        @RequestParam List<String> crawlUrlPrefixes,
//                        @RequestParam List<Column> columns) throws Exception {
//        System.out.println("/crawl/start visited");
//        System.out.println(seeds);
//        System.out.println(allowDomains);
//        System.out.println(crawlUrlPrefixes);
//        System.out.println(columns);
//        CrawlControllerOptions options = CrawlControllerOptions.defaultOptions();
//        CrawlController crawlController = crawlControllerFactory.newController(options);
//        CrawlerOptions crawlerOptions = new CrawlerOptions(allowDomains, crawlUrlPrefixes, columns);
//        CrawlerFactory crawlerFactory = new CrawlerFactory(crawlerOptions, mongoRepo);
//        for (String seed : seeds) {
//            crawlController.addSeed(seed);
//        }
//        crawlController.startNonBlocking(crawlerFactory, options.getWorkers());
//        return "";
//    }
@RequestMapping(method = RequestMethod.POST, value = "/start")
public String setup(@RequestBody CrawlRequest crawlRequest) throws Exception {
System.out.println("/crawl/start visited");
List<String> seeds = crawlRequest.getSeeds();
List<String> allowDomains = crawlRequest.getAllowDomains().stream().map(x -> x.startsWith("http://") ? x : "http://" + x).collect(Collectors.toList());
List<String> crawlUrlPrefixes = crawlRequest.getCrawlUrlPrefixes().stream().map(x -> x.startsWith("http://") ? x : "http://" + x).collect(Collectors.toList());
List<Column> columns = crawlRequest.getColumns();
CrawlControllerOptions options = CrawlControllerOptions.defaultOptions();
options.setWorkers(crawlRequest.getAdvancedOpt().getWorkers());
options.setDelay(crawlRequest.getAdvancedOpt().getPoliteWait());
options.setDepth(crawlRequest.getAdvancedOpt().getMaxDepth());
options.setPage(crawlRequest.getAdvancedOpt().getMaxPage());
options.setDir(crawlRequest.getName() + "\\");
CrawlController crawlController = crawlControllerFactory.newController(options);
PublishSubject<Map<String, ?>> crawlSubject = PublishSubject.create();
crawlSubject.buffer(60, TimeUnit.SECONDS, Schedulers.computation(), 20, () -> Collections.synchronizedList(new LinkedList<>()), true).subscribe(elasticRepo::generateBulkIndex);
CrawlerOptions crawlerOptions = new CrawlerOptions(allowDomains, crawlUrlPrefixes, columns);
System.out.println(crawlerOptions.toString());
CrawlerFactory crawlerFactory = new CrawlerFactory(crawlerOptions, crawlSubject);
for (String seed : seeds) {
crawlController.addSeed(seed);
}
crawlController.startNonBlocking(crawlerFactory, options.getWorkers());
return "";
}
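
The handler binds the POST body to a CrawlRequest bean whose definition is not shown on this page. The sketch below is reconstructed purely from the getters called in setup() (seeds, allowDomains, crawlUrlPrefixes, columns, name, and an advancedOpt holding workers, politeWait, maxDepth and maxPage); it is an assumption about the shape of the request, not Zpider's actual class.

// Hypothetical request DTO, inferred from the getters used in setup();
// the real CrawlRequest/AdvancedOpt classes in Zpider may differ.
import java.util.List;

public class CrawlRequestSketch {
    private String name;                   // used to build the crawl storage directory
    private List<String> seeds;            // start URLs handed to the CrawlController
    private List<String> allowDomains;     // domains the crawler is allowed to visit
    private List<String> crawlUrlPrefixes; // URL prefixes that should be followed
    private List<?> columns;               // List<Column> in the repository; Column's fields are not shown here
    private AdvancedOpt advancedOpt;       // tuning options copied onto CrawlControllerOptions

    public static class AdvancedOpt {
        private int workers;    // number of crawler threads
        private int politeWait; // wait between requests, mapped to options.setDelay
        private int maxDepth;   // maximum crawl depth
        private int maxPage;    // maximum number of pages to fetch
        // getters and setters omitted for brevity
    }
    // getters and setters omitted for brevity
}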
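Each buffered batch is handed to elasticRepo::generateBulkIndex as a List<Map<String, ?>>. The ElasticRepo implementation itself is not part of this snippet; the following is only a minimal sketch of what such a bulk-index consumer could look like, assuming the Elasticsearch high-level REST client (7.x style, index without mapping type), and is not Zpider's actual code.

// Minimal sketch of a bulk-indexing consumer for the buffered crawl records,
// assuming the Elasticsearch high-level REST client; Zpider's ElasticRepo may differ.
import java.io.IOException;
import java.util.List;
import java.util.Map;

import org.elasticsearch.action.bulk.BulkRequest;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;

public class ElasticRepoSketch {

    private final RestHighLevelClient client;
    private final String index;

    public ElasticRepoSketch(RestHighLevelClient client, String index) {
        this.client = client;
        this.index = index;
    }

    // Receives one buffer emitted by the PublishSubject (at most 20 records or 60 seconds of crawling)
    public void generateBulkIndex(List<Map<String, ?>> batch) {
        if (batch.isEmpty()) {
            return; // empty buffer window, nothing to index
        }
        BulkRequest bulk = new BulkRequest();
        for (Map<String, ?> doc : batch) {
            bulk.add(new IndexRequest(index).source(doc));
        }
        try {
            client.bulk(bulk, RequestOptions.DEFAULT);
        } catch (IOException e) {
            throw new RuntimeException("Bulk indexing failed", e);
        }
    }
}

Buffering before indexing keeps the crawler from issuing one HTTP request per crawled page: a single bulk call per window amortizes the round-trip and indexing overhead across up to 20 documents.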