
Example 1 with CrawlController

use of edu.uci.ics.crawler4j.crawler.CrawlController in project Zpider by zeroized.

the class CrawlerController method setup.

// 
// @RequestMapping(method = RequestMethod.POST,value = "/start")
// public String setup(@RequestParam CrawlerOptions crawlerOptions,
// @RequestParam String[] seeds) throws Exception {
// CrawlControllerOptions options=CrawlControllerFactory.defaultOptions();
// CrawlController crawlController=crawlControllerFactory.newController(options);
// CrawlerFactory crawlerFactory=new CrawlerFactory(crawlerOptions,mongoRepo);
// for (String seed : seeds) {
// crawlController.addSeed(seed);
// }
// crawlController.startNonBlocking(crawlerFactory,options.getWorkers());
// return "";
// }
// @RequestMapping(method = RequestMethod.POST,value = "/start")
// public String setup(@RequestParam List<String> seeds,
// @RequestParam List<String> allowDomains,
// @RequestParam List<String> crawlUrlPrefixes,
// @RequestParam List<Column> columns) throws Exception {
// System.out.println("/crawl/start visited");
// System.out.println(seeds);
// System.out.println(allowDomains);
// System.out.println(crawlUrlPrefixes);
// System.out.println(columns);
// CrawlControllerOptions options=CrawlControllerOptions.defaultOptions();
// CrawlController crawlController=crawlControllerFactory.newController(options);
// CrawlerOptions crawlerOptions=new CrawlerOptions(allowDomains,crawlUrlPrefixes,columns);
// CrawlerFactory crawlerFactory=new CrawlerFactory(crawlerOptions,mongoRepo);
// for (String seed:seeds){
// crawlController.addSeed(seed);
// }
// crawlController.startNonBlocking(crawlerFactory,options.getWorkers());
// return "";
// }
@RequestMapping(method = RequestMethod.POST, value = "/start")
public String setup(@RequestBody CrawlRequest crawlRequest) throws Exception {
    System.out.println("/crawl/start visited");
    List<String> seeds = crawlRequest.getSeeds();
    // Normalize domains and URL prefixes: prepend "http://" where the scheme is missing.
    List<String> allowDomains = crawlRequest.getAllowDomains().stream()
            .map(x -> x.startsWith("http://") ? x : "http://" + x)
            .collect(Collectors.toList());
    List<String> crawlUrlPrefixes = crawlRequest.getCrawlUrlPrefixes().stream()
            .map(x -> x.startsWith("http://") ? x : "http://" + x)
            .collect(Collectors.toList());
    List<Column> columns = crawlRequest.getColumns();
    CrawlControllerOptions options = CrawlControllerOptions.defaultOptions();
    options.setWorkers(crawlRequest.getAdvancedOpt().getWorkers());
    options.setDelay(crawlRequest.getAdvancedOpt().getPoliteWait());
    options.setDepth(crawlRequest.getAdvancedOpt().getMaxDepth());
    options.setPage(crawlRequest.getAdvancedOpt().getMaxPage());
    options.setDir(crawlRequest.getName() + "\\");
    CrawlController crawlController = crawlControllerFactory.newController(options);
    // Buffer crawled records and index them into Elasticsearch in bulk:
    // flush every 60 seconds or every 20 records, whichever comes first.
    PublishSubject<Map<String, ?>> crawlSubject = PublishSubject.create();
    crawlSubject.buffer(60, TimeUnit.SECONDS, Schedulers.computation(), 20,
            () -> Collections.synchronizedList(new LinkedList<>()), true)
            .subscribe(elasticRepo::generateBulkIndex);
    CrawlerOptions crawlerOptions = new CrawlerOptions(allowDomains, crawlUrlPrefixes, columns);
    System.out.println(crawlerOptions.toString());
    CrawlerFactory crawlerFactory = new CrawlerFactory(crawlerOptions, crawlSubject);
    for (String seed : seeds) {
        crawlController.addSeed(seed);
    }
    crawlController.startNonBlocking(crawlerFactory, options.getWorkers());
    return "";
}
Also used : CrawlerOptions(com.zeroized.spider.crawler.CrawlerOptions) Autowired(org.springframework.beans.factory.annotation.Autowired) RequestMapping(org.springframework.web.bind.annotation.RequestMapping) RequestMethod(org.springframework.web.bind.annotation.RequestMethod) RestController(org.springframework.web.bind.annotation.RestController) Collectors(java.util.stream.Collectors) RequestBody(org.springframework.web.bind.annotation.RequestBody) TimeUnit(java.util.concurrent.TimeUnit) CrawlControllerFactory(com.zeroized.spider.crawler.CrawlControllerFactory) List(java.util.List) PublishSubject(io.reactivex.subjects.PublishSubject) Map(java.util.Map) CrawlerFactory(com.zeroized.spider.crawler.CrawlerFactory) CrawlController(edu.uci.ics.crawler4j.crawler.CrawlController) CrawlControllerOptions(com.zeroized.spider.crawler.CrawlControllerOptions) Schedulers(io.reactivex.schedulers.Schedulers) ElasticRepo(com.zeroized.spider.repo.elastic.ElasticRepo) LinkedList(java.util.LinkedList) Column(com.zeroized.spider.domain.Column) Collections(java.util.Collections) CrawlRequest(com.zeroized.spider.domain.CrawlRequest)
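The batching above is plain RxJava. A minimal standalone sketch of the same buffer operator (hypothetical item type and window sizes, not Zpider code) shows how time- and size-bounded batches reach the subscriber, the way elasticRepo::generateBulkIndex receives them above:

import io.reactivex.schedulers.Schedulers;
import io.reactivex.subjects.PublishSubject;
import java.util.concurrent.TimeUnit;

public class BufferDemo {
    public static void main(String[] args) throws InterruptedException {
        PublishSubject<Integer> subject = PublishSubject.create();
        // Flush a batch every 2 seconds, or as soon as 5 items have accumulated.
        subject.buffer(2, TimeUnit.SECONDS, Schedulers.computation(), 5)
                .subscribe(batch -> System.out.println("bulk index: " + batch));
        for (int i = 0; i < 12; i++) {
            subject.onNext(i);
            Thread.sleep(300);
        }
        subject.onComplete();
        // Give the computation scheduler time to deliver the final batch.
        Thread.sleep(2500);
    }
}

Each printed batch corresponds to one bulk-index call in the controller above; tuning the window length and batch size trades indexing latency against bulk request overhead.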

Example 2 with CrawlController

use of edu.uci.ics.crawler4j.crawler.CrawlController in project Zpider by zeroized.

the class CrawlControllerFactory method newController.

public CrawlController newController(CrawlControllerOptions options) throws Exception {
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(baseDir + options.getDir());
    config.setMaxDepthOfCrawling(options.getDepth());
    config.setPolitenessDelay(options.getDelay());
    config.setResumableCrawling(options.isResumeable());
    config.setDefaultHeaders(options.getHeaders());
    // A max-page value of -1 means unlimited, so only set a cap when one was given.
    if (options.getPage() != -1) {
        config.setMaxPagesToFetch(options.getPage());
    }
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    return new CrawlController(config, pageFetcher, robotstxtServer);
}
Also used : PageFetcher(edu.uci.ics.crawler4j.fetcher.PageFetcher) RobotstxtServer(edu.uci.ics.crawler4j.robotstxt.RobotstxtServer) CrawlController(edu.uci.ics.crawler4j.crawler.CrawlController) CrawlConfig(edu.uci.ics.crawler4j.crawler.CrawlConfig) RobotstxtConfig(edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig)
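The factory always builds the RobotstxtServer with default settings. When robots.txt handling needs tuning, crawler4j's RobotstxtConfig exposes it; a minimal sketch (the agent name and helper class are illustrative, not part of Zpider):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class RobotstxtTuning {
    public static RobotstxtServer tunedRobotstxtServer(CrawlConfig config) {
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        // Identify this crawler to robots.txt rules (the default agent name is "crawler4j").
        robotstxtConfig.setUserAgentName("zpider");
        // Leave enabled; setting this to false makes the crawler ignore robots.txt entirely.
        robotstxtConfig.setEnabled(true);
        return new RobotstxtServer(robotstxtConfig, pageFetcher);
    }
}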

Example 3 with CrawlController

use of edu.uci.ics.crawler4j.crawler.CrawlController in project yyl_example by Relucent.

the class MyControllerTest method main.

public static void main(String[] args) throws Exception {
    // Crawler configuration
    CrawlConfig config = new CrawlConfig();
    // Maximum crawl depth
    config.setMaxDepthOfCrawling(5);
    // Maximum number of pages to fetch
    config.setMaxPagesToFetch(Integer.MAX_VALUE);
    // Wait 200 milliseconds before each request (politeness delay)
    config.setPolitenessDelay(200);
    // Folder where intermediate crawl data is stored
    config.setCrawlStorageFolder(System.getProperty("user.dir") + "/temp/crawl");
    // Proxy settings
    // config.setProxyHost("proxyserver.example.com");
    // config.setProxyPort(8080);
    // config.setProxyUsername(username);
    // config.setProxyPassword(password);
    // Resume a stopped/crashed crawl
    // config.setResumableCrawling(true);
    // Instantiate the controller
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    // Seed URLs
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/");
    // Number of concurrent crawler threads
    int numberOfCrawlers = 7;
    // Start crawling (a blocking operation)
    controller.start(MyCrawler.class, numberOfCrawlers);
}
Also used : PageFetcher(edu.uci.ics.crawler4j.fetcher.PageFetcher) RobotstxtServer(edu.uci.ics.crawler4j.robotstxt.RobotstxtServer) CrawlController(edu.uci.ics.crawler4j.crawler.CrawlController) CrawlConfig(edu.uci.ics.crawler4j.crawler.CrawlConfig) RobotstxtConfig(edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig)
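Example 3 passes MyCrawler.class to the controller without showing the class itself. A minimal sketch of such a WebCrawler subclass (the actual MyCrawler in yyl_example may differ; the filter pattern and domain are illustrative):

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;
import java.util.regex.Pattern;

public class MyCrawler extends WebCrawler {

    private static final Pattern BINARY_FILES = Pattern.compile(".*\\.(css|js|gif|jpe?g|png|ico|pdf|zip|gz)$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        // Skip binary resources and stay inside the seed domain.
        return !BINARY_FILES.matcher(href).matches()
                && href.startsWith("http://www.ics.uci.edu/");
    }

    @Override
    public void visit(Page page) {
        System.out.println("Visited: " + page.getWebURL().getURL());
    }
}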

Example 4 with CrawlController

use of edu.uci.ics.crawler4j.crawler.CrawlController in project crawler4j by yasserg.

the class MultipleCrawlerController method main.

public static void main(String[] args) throws Exception {
    // The folder where intermediate crawl data is stored (e.g. list of urls that are extracted from previously
    // fetched pages and need to be crawled later).
    String crawlStorageFolder = "/tmp/crawler4j/";
    CrawlConfig config1 = new CrawlConfig();
    CrawlConfig config2 = new CrawlConfig();
    // The two crawlers should have different storage folders for their intermediate data.
    config1.setCrawlStorageFolder(crawlStorageFolder + "/crawler1");
    config2.setCrawlStorageFolder(crawlStorageFolder + "/crawler2");
    config1.setPolitenessDelay(1000);
    config2.setPolitenessDelay(2000);
    config1.setMaxPagesToFetch(50);
    config2.setMaxPagesToFetch(100);
    // We will use different PageFetchers for the two crawlers.
    PageFetcher pageFetcher1 = new PageFetcher(config1);
    PageFetcher pageFetcher2 = new PageFetcher(config2);
    // We will use the same RobotstxtServer for both of the crawlers.
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher1);
    CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
    CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);
    List<String> crawler1Domains = ImmutableList.of("https://www.ics.uci.edu/", "https://www.cnn.com/");
    List<String> crawler2Domains = ImmutableList.of("https://en.wikipedia.org/");
    controller1.addSeed("https://www.ics.uci.edu/");
    controller1.addSeed("https://www.cnn.com/");
    controller1.addSeed("https://www.ics.uci.edu/~lopes/");
    controller1.addSeed("https://www.cnn.com/POLITICS/");
    controller2.addSeed("https://en.wikipedia.org/wiki/Main_Page");
    controller2.addSeed("https://en.wikipedia.org/wiki/Obama");
    controller2.addSeed("https://en.wikipedia.org/wiki/Bing");
    CrawlController.WebCrawlerFactory<BasicCrawler> factory1 = () -> new BasicCrawler(crawler1Domains);
    CrawlController.WebCrawlerFactory<BasicCrawler> factory2 = () -> new BasicCrawler(crawler2Domains);
    // The first crawler will have 5 concurrent threads and the second crawler will have 7 threads.
    controller1.startNonBlocking(factory1, 5);
    controller2.startNonBlocking(factory2, 7);
    controller1.waitUntilFinish();
    logger.info("Crawler 1 is finished.");
    controller2.waitUntilFinish();
    logger.info("Crawler 2 is finished.");
}
Also used : PageFetcher(edu.uci.ics.crawler4j.fetcher.PageFetcher) RobotstxtServer(edu.uci.ics.crawler4j.robotstxt.RobotstxtServer) CrawlController(edu.uci.ics.crawler4j.crawler.CrawlController) CrawlConfig(edu.uci.ics.crawler4j.crawler.CrawlConfig) RobotstxtConfig(edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig)
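The two factory lambdas above construct BasicCrawler with a per-crawler domain list, which is how each controller is kept inside its own sites. A sketch of what such a constructor-restricted crawler looks like (the real BasicCrawler in crawler4j's examples may differ in detail):

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;
import java.util.List;

public class BasicCrawler extends WebCrawler {

    private final List<String> myCrawlDomains;

    public BasicCrawler(List<String> myCrawlDomains) {
        this.myCrawlDomains = myCrawlDomains;
    }

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Follow a link only if it falls under one of this crawler's domains.
        String href = url.getURL().toLowerCase();
        return myCrawlDomains.stream().anyMatch(href::startsWith);
    }

    @Override
    public void visit(Page page) {
        System.out.println("Visited: " + page.getWebURL().getURL());
    }
}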

Example 5 with CrawlController

use of edu.uci.ics.crawler4j.crawler.CrawlController in project crawler4j by yasserg.

the class ControllerWithShutdown method main.

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCralwers (number of concurrent threads)");
        return;
    }
    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];
    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    config.setPolitenessDelay(1000);
    // Unlimited number of pages can be crawled.
    config.setMaxPagesToFetch(-1);
    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    /*
     * For each crawl, you need to add some seed urls. These are the first
     * URLs that are fetched and then the crawler starts following links
     * which are found in these pages
     */
    controller.addSeed("https://www.ics.uci.edu/~welling/");
    controller.addSeed("https://www.ics.uci.edu/~lopes/");
    controller.addSeed("https://www.ics.uci.edu/");
    /*
     * Start the crawl non-blocking. This returns immediately, so the
     * shutdown request below can be issued while crawling is in progress.
     */
    controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);
    // Wait for 30 seconds
    Thread.sleep(30 * 1000);
    // Send the shutdown request and then wait for finishing
    controller.shutdown();
    controller.waitUntilFinish();
}
Also used : PageFetcher(edu.uci.ics.crawler4j.fetcher.PageFetcher) RobotstxtServer(edu.uci.ics.crawler4j.robotstxt.RobotstxtServer) CrawlController(edu.uci.ics.crawler4j.crawler.CrawlController) CrawlConfig(edu.uci.ics.crawler4j.crawler.CrawlConfig) RobotstxtConfig(edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig)

Aggregations

CrawlController (edu.uci.ics.crawler4j.crawler.CrawlController): 11 uses
CrawlConfig (edu.uci.ics.crawler4j.crawler.CrawlConfig): 10 uses
PageFetcher (edu.uci.ics.crawler4j.fetcher.PageFetcher): 10 uses
RobotstxtConfig (edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig): 10 uses
RobotstxtServer (edu.uci.ics.crawler4j.robotstxt.RobotstxtServer): 10 uses
ComboPooledDataSource (com.mchange.v2.c3p0.ComboPooledDataSource): 1 use
CrawlControllerFactory (com.zeroized.spider.crawler.CrawlControllerFactory): 1 use
CrawlControllerOptions (com.zeroized.spider.crawler.CrawlControllerOptions): 1 use
CrawlerFactory (com.zeroized.spider.crawler.CrawlerFactory): 1 use
CrawlerOptions (com.zeroized.spider.crawler.CrawlerOptions): 1 use
Column (com.zeroized.spider.domain.Column): 1 use
CrawlRequest (com.zeroized.spider.domain.CrawlRequest): 1 use
ElasticRepo (com.zeroized.spider.repo.elastic.ElasticRepo): 1 use
PostgresCrawlerFactory (edu.uci.ics.crawler4j.examples.crawler.PostgresCrawlerFactory): 1 use
Schedulers (io.reactivex.schedulers.Schedulers): 1 use
PublishSubject (io.reactivex.subjects.PublishSubject): 1 use
File (java.io.File): 1 use
Collections (java.util.Collections): 1 use
LinkedList (java.util.LinkedList): 1 use
List (java.util.List): 1 use