
Example 1 with Column

Use of com.zeroized.spider.domain.Column in project Zpider by zeroized.

The class CrawlerController, method setup.

// Two earlier revisions of this endpoint, kept commented out for reference:
//
// @RequestMapping(method = RequestMethod.POST, value = "/start")
// public String setup(@RequestParam CrawlerOptions crawlerOptions,
//                     @RequestParam String[] seeds) throws Exception {
//     CrawlControllerOptions options = CrawlControllerFactory.defaultOptions();
//     CrawlController crawlController = crawlControllerFactory.newController(options);
//     CrawlerFactory crawlerFactory = new CrawlerFactory(crawlerOptions, mongoRepo);
//     for (String seed : seeds) {
//         crawlController.addSeed(seed);
//     }
//     crawlController.startNonBlocking(crawlerFactory, options.getWorkers());
//     return "";
// }
//
// @RequestMapping(method = RequestMethod.POST, value = "/start")
// public String setup(@RequestParam List<String> seeds,
//                     @RequestParam List<String> allowDomains,
//                     @RequestParam List<String> crawlUrlPrefixes,
//                     @RequestParam List<Column> columns) throws Exception {
//     System.out.println("/crawl/start visited");
//     System.out.println(seeds);
//     System.out.println(allowDomains);
//     System.out.println(crawlUrlPrefixes);
//     System.out.println(columns);
//     CrawlControllerOptions options = CrawlControllerOptions.defaultOptions();
//     CrawlController crawlController = crawlControllerFactory.newController(options);
//     CrawlerOptions crawlerOptions = new CrawlerOptions(allowDomains, crawlUrlPrefixes, columns);
//     CrawlerFactory crawlerFactory = new CrawlerFactory(crawlerOptions, mongoRepo);
//     for (String seed : seeds) {
//         crawlController.addSeed(seed);
//     }
//     crawlController.startNonBlocking(crawlerFactory, options.getWorkers());
//     return "";
// }
@RequestMapping(method = RequestMethod.POST, value = "/start")
public String setup(@RequestBody CrawlRequest crawlRequest) throws Exception {
    System.out.println("/crawl/start visited");
    List<String> seeds = crawlRequest.getSeeds();
    // Normalize domains and URL prefixes so each carries an explicit http:// scheme.
    List<String> allowDomains = crawlRequest.getAllowDomains().stream()
            .map(x -> x.startsWith("http://") ? x : "http://" + x)
            .collect(Collectors.toList());
    List<String> crawlUrlPrefixes = crawlRequest.getCrawlUrlPrefixes().stream()
            .map(x -> x.startsWith("http://") ? x : "http://" + x)
            .collect(Collectors.toList());
    List<Column> columns = crawlRequest.getColumns();
    // Copy the advanced options from the request onto the controller options.
    CrawlControllerOptions options = CrawlControllerOptions.defaultOptions();
    options.setWorkers(crawlRequest.getAdvancedOpt().getWorkers());
    options.setDelay(crawlRequest.getAdvancedOpt().getPoliteWait());
    options.setDepth(crawlRequest.getAdvancedOpt().getMaxDepth());
    options.setPage(crawlRequest.getAdvancedOpt().getMaxPage());
    options.setDir(crawlRequest.getName() + "\\");
    CrawlController crawlController = crawlControllerFactory.newController(options);
    // Crawled records are pushed onto this subject and flushed to Elasticsearch
    // in bulk: every 60 seconds or every 20 records, whichever comes first.
    PublishSubject<Map<String, ?>> crawlSubject = PublishSubject.create();
    crawlSubject.buffer(60, TimeUnit.SECONDS, Schedulers.computation(), 20,
            () -> Collections.synchronizedList(new LinkedList<>()), true)
            .subscribe(elasticRepo::generateBulkIndex);
    CrawlerOptions crawlerOptions = new CrawlerOptions(allowDomains, crawlUrlPrefixes, columns);
    System.out.println(crawlerOptions.toString());
    CrawlerFactory crawlerFactory = new CrawlerFactory(crawlerOptions, crawlSubject);
    for (String seed : seeds) {
        crawlController.addSeed(seed);
    }
    // Non-blocking start: the HTTP request returns while the crawl runs in the background.
    crawlController.startNonBlocking(crawlerFactory, options.getWorkers());
    return "";
}
Also used : CrawlerOptions(com.zeroized.spider.crawler.CrawlerOptions) Autowired(org.springframework.beans.factory.annotation.Autowired) RequestMapping(org.springframework.web.bind.annotation.RequestMapping) RequestMethod(org.springframework.web.bind.annotation.RequestMethod) RestController(org.springframework.web.bind.annotation.RestController) Collectors(java.util.stream.Collectors) RequestBody(org.springframework.web.bind.annotation.RequestBody) TimeUnit(java.util.concurrent.TimeUnit) CrawlControllerFactory(com.zeroized.spider.crawler.CrawlControllerFactory) List(java.util.List) PublishSubject(io.reactivex.subjects.PublishSubject) Map(java.util.Map) CrawlerFactory(com.zeroized.spider.crawler.CrawlerFactory) CrawlController(edu.uci.ics.crawler4j.crawler.CrawlController) CrawlControllerOptions(com.zeroized.spider.crawler.CrawlControllerOptions) Schedulers(io.reactivex.schedulers.Schedulers) ElasticRepo(com.zeroized.spider.repo.elastic.ElasticRepo) LinkedList(java.util.LinkedList) Column(com.zeroized.spider.domain.Column) Collections(java.util.Collections) CrawlRequest(com.zeroized.spider.domain.CrawlRequest)
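
The notable piece in this controller is the reactive batching: extracted records go onto a PublishSubject and are flushed to Elasticsearch in bulk, every 60 seconds or every 20 records, whichever fills first. Below is a minimal, self-contained sketch of that RxJava 2 buffer pattern; the bulkIndex method is a hypothetical stand-in for elasticRepo::generateBulkIndex, and the emission loop merely simulates crawler threads.

import io.reactivex.schedulers.Schedulers;
import io.reactivex.subjects.PublishSubject;

import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

public class BufferedIndexerSketch {

    public static void main(String[] args) throws InterruptedException {
        // Subject that crawler threads push extracted records into.
        PublishSubject<Map<String, ?>> subject = PublishSubject.create();

        // Flush a batch every 2 seconds or every 5 records, whichever comes first;
        // the synchronized list guards the buffer against concurrent onNext calls,
        // and 'true' restarts the timer whenever a size-triggered flush happens.
        subject.buffer(2, TimeUnit.SECONDS,
                       Schedulers.computation(),
                       5,
                       () -> Collections.synchronizedList(new LinkedList<Map<String, ?>>()),
                       true)
               .subscribe(BufferedIndexerSketch::bulkIndex);

        // Simulate a crawler emitting one record every 300 ms.
        for (int i = 0; i < 12; i++) {
            subject.onNext(Collections.singletonMap("page", i));
            Thread.sleep(300);
        }
        subject.onComplete();
        Thread.sleep(500); // let the async delivery of the final buffer finish
    }

    // Hypothetical stand-in for ElasticRepo::generateBulkIndex.
    private static void bulkIndex(List<Map<String, ?>> batch) {
        System.out.println("bulk indexing " + batch.size() + " records: " + batch);
    }
}

With a 5-record cap on a 300 ms feed, the size trigger fires before the timer, so this prints batches of five; slow the loop down and the 2-second timer takes over, which mirrors how the controller trades indexing latency against bulk-request size.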

Example 2 with Column

Use of com.zeroized.spider.domain.Column in project Zpider by zeroized.

The class Crawler, method analyze.

private void analyze(Page page) {
    HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
    String html = htmlParseData.getHtml();
    Document doc = Jsoup.parse(html);
    // Extract one list of values per configured column, keyed by column name.
    Map<String, List<String>> data = new HashMap<>();
    for (Column column : crawlerOptions.getColumns()) {
        String name = column.getColumn();
        String rule = column.getRule();
        // Each rule is a CSS selector; collect the text of every matching element.
        Elements elements = doc.select(rule);
        List<String> values = new ArrayList<>(elements.size());
        for (Element element : elements) {
            values.add(element.text());
        }
        data.put(name, values);
    }
    // Hand the record off to the buffered indexing pipeline.
    publishSubject.onNext(data);
}
Also used : HashMap(java.util.HashMap) Column(com.zeroized.spider.domain.Column) Element(org.jsoup.nodes.Element) ArrayList(java.util.ArrayList) List(java.util.List) Document(org.jsoup.nodes.Document) Elements(org.jsoup.select.Elements) HtmlParseData(edu.uci.ics.crawler4j.parser.HtmlParseData)
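
The extraction loop is selector-driven: each Column pairs an output field name (getColumn()) with a CSS selector (getRule()), and Jsoup collects the text of every match. A runnable sketch of the same loop follows, with the Column bean flattened to a plain name-to-rule map and a hard-coded HTML snippet so the example stays self-contained.

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ColumnExtractionSketch {

    public static void main(String[] args) {
        String html = "<html><body>"
                + "<h1 class='title'>Zpider</h1>"
                + "<span class='tag'>crawler</span>"
                + "<span class='tag'>rxjava</span>"
                + "</body></html>";
        Document doc = Jsoup.parse(html);

        // Column definitions as (name -> CSS selector rule); the real Column bean
        // exposes these via getColumn() and getRule().
        Map<String, String> columns = new HashMap<>();
        columns.put("title", "h1.title");
        columns.put("tags", "span.tag");

        // Same shape as analyze(): one list of matched texts per column name.
        Map<String, List<String>> data = new HashMap<>();
        for (Map.Entry<String, String> column : columns.entrySet()) {
            List<String> values = new ArrayList<>();
            for (Element element : doc.select(column.getValue())) {
                values.add(element.text());
            }
            data.put(column.getKey(), values);
        }
        System.out.println(data); // {title=[Zpider], tags=[crawler, rxjava]}
    }
}

Because every selector yields a list (possibly empty), pages that lack a column simply produce an empty list rather than an error, which keeps the downstream bulk-indexing records uniform.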

Aggregations

Column (com.zeroized.spider.domain.Column) 2
List (java.util.List) 2
CrawlControllerFactory (com.zeroized.spider.crawler.CrawlControllerFactory) 1
CrawlControllerOptions (com.zeroized.spider.crawler.CrawlControllerOptions) 1
CrawlerFactory (com.zeroized.spider.crawler.CrawlerFactory) 1
CrawlerOptions (com.zeroized.spider.crawler.CrawlerOptions) 1
CrawlRequest (com.zeroized.spider.domain.CrawlRequest) 1
ElasticRepo (com.zeroized.spider.repo.elastic.ElasticRepo) 1
CrawlController (edu.uci.ics.crawler4j.crawler.CrawlController) 1
HtmlParseData (edu.uci.ics.crawler4j.parser.HtmlParseData) 1
Schedulers (io.reactivex.schedulers.Schedulers) 1
PublishSubject (io.reactivex.subjects.PublishSubject) 1
ArrayList (java.util.ArrayList) 1
Collections (java.util.Collections) 1
HashMap (java.util.HashMap) 1
LinkedList (java.util.LinkedList) 1
Map (java.util.Map) 1
TimeUnit (java.util.concurrent.TimeUnit) 1
Collectors (java.util.stream.Collectors) 1
Document (org.jsoup.nodes.Document) 1