Search in sources :

Example 1 with CrawlerFactory

Use of com.zeroized.spider.crawler.CrawlerFactory in project Zpider by zeroized.

The setup method of the class CrawlerController.

// 
// @RequestMapping(method = RequestMethod.POST,value = "/start")
// public String setup(@RequestParam CrawlerOptions crawlerOptions,
// @RequestParam String[] seeds) throws Exception {
// CrawlControllerOptions options=CrawlControllerFactory.defaultOptions();
// CrawlController crawlController=crawlControllerFactory.newController(options);
// CrawlerFactory crawlerFactory=new CrawlerFactory(crawlerOptions,mongoRepo);
// for (String seed : seeds) {
// crawlController.addSeed(seed);
// }
// crawlController.startNonBlocking(crawlerFactory,options.getWorkers());
// return "";
// }
// @RequestMapping(method = RequestMethod.POST,value = "/start")
// public String setup(@RequestParam List<String> seeds,
// @RequestParam List<String> allowDomains,
// @RequestParam List<String> crawlUrlPrefixes,
// @RequestParam List<Column> columns) throws Exception {
// System.out.println("/crawl/start visited");
// System.out.println(seeds);
// System.out.println(allowDomains);
// System.out.println(crawlUrlPrefixes);
// System.out.println(columns);
// CrawlControllerOptions options=CrawlControllerOptions.defaultOptions();
// CrawlController crawlController=crawlControllerFactory.newController(options);
// CrawlerOptions crawlerOptions=new CrawlerOptions(allowDomains,crawlUrlPrefixes,columns);
// CrawlerFactory crawlerFactory=new CrawlerFactory(crawlerOptions,mongoRepo);
// for (String seed:seeds){
// crawlController.addSeed(seed);
// }
// crawlController.startNonBlocking(crawlerFactory,options.getWorkers());
// return "";
// }
/**
 * Handles POST requests mapped to {@code /start}: builds a crawl configuration from the
 * request body, registers the seeds, and starts the crawl without blocking the caller.
 *
 * <p>Allowed domains and crawl URL prefixes that carry no scheme get {@code http://}
 * prepended; entries that already start with {@code http://} or {@code https://} are
 * passed through unchanged.
 *
 * <p>Items published to the crawl subject are grouped into buffers (closed after
 * 60 seconds or 20 items, whichever comes first) and each buffer is handed to
 * {@code elasticRepo::generateBulkIndex}.
 *
 * @param crawlRequest crawl definition deserialized from the request body; its seeds,
 *                     allowed domains, URL prefixes, columns, advanced options and name
 *                     are all read here
 * @return an empty string
 * @throws Exception if creating the controller or starting the crawl fails
 */
@RequestMapping(method = RequestMethod.POST, value = "/start")
public String setup(@RequestBody CrawlRequest crawlRequest) throws Exception {
    System.out.println("/crawl/start visited");
    List<String> seeds = crawlRequest.getSeeds();
    List<String> allowDomains = crawlRequest.getAllowDomains().stream()
            .map(x -> ensureHttpScheme(x))
            .collect(Collectors.toList());
    List<String> crawlUrlPrefixes = crawlRequest.getCrawlUrlPrefixes().stream()
            .map(x -> ensureHttpScheme(x))
            .collect(Collectors.toList());
    List<Column> columns = crawlRequest.getColumns();

    CrawlControllerOptions options = CrawlControllerOptions.defaultOptions();
    options.setWorkers(crawlRequest.getAdvancedOpt().getWorkers());
    options.setDelay(crawlRequest.getAdvancedOpt().getPoliteWait());
    options.setDepth(crawlRequest.getAdvancedOpt().getMaxDepth());
    options.setPage(crawlRequest.getAdvancedOpt().getMaxPage());
    // Use the platform separator rather than a hard-coded Windows "\\" so the
    // directory name is also well-formed on non-Windows hosts.
    options.setDir(crawlRequest.getName() + java.io.File.separator);
    CrawlController crawlController = crawlControllerFactory.newController(options);

    // Crawlers publish records into this subject; they are bulk-indexed in batches
    // instead of one request per record.
    PublishSubject<Map<String, ?>> crawlSubject = PublishSubject.create();
    crawlSubject
            .buffer(60, TimeUnit.SECONDS, Schedulers.computation(), 20,
                    () -> Collections.synchronizedList(new LinkedList<>()), true)
            .subscribe(elasticRepo::generateBulkIndex);

    CrawlerOptions crawlerOptions = new CrawlerOptions(allowDomains, crawlUrlPrefixes, columns);
    System.out.println(crawlerOptions.toString());
    CrawlerFactory crawlerFactory = new CrawlerFactory(crawlerOptions, crawlSubject);
    for (String seed : seeds) {
        crawlController.addSeed(seed);
    }
    crawlController.startNonBlocking(crawlerFactory, options.getWorkers());
    return "";
}

/**
 * Returns {@code url} unchanged when it already starts with {@code http://} or
 * {@code https://}; otherwise returns it with {@code http://} prepended.
 */
private static String ensureHttpScheme(String url) {
    if (url.startsWith("http://") || url.startsWith("https://")) {
        return url;
    }
    return "http://" + url;
}
Also used : CrawlerOptions(com.zeroized.spider.crawler.CrawlerOptions) Autowired(org.springframework.beans.factory.annotation.Autowired) RequestMapping(org.springframework.web.bind.annotation.RequestMapping) RequestMethod(org.springframework.web.bind.annotation.RequestMethod) RestController(org.springframework.web.bind.annotation.RestController) Collectors(java.util.stream.Collectors) RequestBody(org.springframework.web.bind.annotation.RequestBody) TimeUnit(java.util.concurrent.TimeUnit) CrawlControllerFactory(com.zeroized.spider.crawler.CrawlControllerFactory) List(java.util.List) PublishSubject(io.reactivex.subjects.PublishSubject) Map(java.util.Map) CrawlerFactory(com.zeroized.spider.crawler.CrawlerFactory) CrawlController(edu.uci.ics.crawler4j.crawler.CrawlController) CrawlControllerOptions(com.zeroized.spider.crawler.CrawlControllerOptions) Schedulers(io.reactivex.schedulers.Schedulers) ElasticRepo(com.zeroized.spider.repo.elastic.ElasticRepo) LinkedList(java.util.LinkedList) Column(com.zeroized.spider.domain.Column) Collections(java.util.Collections) CrawlRequest(com.zeroized.spider.domain.CrawlRequest) CrawlerOptions(com.zeroized.spider.crawler.CrawlerOptions) Column(com.zeroized.spider.domain.Column) CrawlController(edu.uci.ics.crawler4j.crawler.CrawlController) Map(java.util.Map) CrawlControllerOptions(com.zeroized.spider.crawler.CrawlControllerOptions) CrawlerFactory(com.zeroized.spider.crawler.CrawlerFactory) RequestMapping(org.springframework.web.bind.annotation.RequestMapping)

Aggregations

CrawlControllerFactory (com.zeroized.spider.crawler.CrawlControllerFactory)1 CrawlControllerOptions (com.zeroized.spider.crawler.CrawlControllerOptions)1 CrawlerFactory (com.zeroized.spider.crawler.CrawlerFactory)1 CrawlerOptions (com.zeroized.spider.crawler.CrawlerOptions)1 Column (com.zeroized.spider.domain.Column)1 CrawlRequest (com.zeroized.spider.domain.CrawlRequest)1 ElasticRepo (com.zeroized.spider.repo.elastic.ElasticRepo)1 CrawlController (edu.uci.ics.crawler4j.crawler.CrawlController)1 Schedulers (io.reactivex.schedulers.Schedulers)1 PublishSubject (io.reactivex.subjects.PublishSubject)1 Collections (java.util.Collections)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 Map (java.util.Map)1 TimeUnit (java.util.concurrent.TimeUnit)1 Collectors (java.util.stream.Collectors)1 Autowired (org.springframework.beans.factory.annotation.Autowired)1 RequestBody (org.springframework.web.bind.annotation.RequestBody)1 RequestMapping (org.springframework.web.bind.annotation.RequestMapping)1 RequestMethod (org.springframework.web.bind.annotation.RequestMethod)1