
Example 1 with RobotstxtConfig

Use of edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig in project crawler4j by yasserg, in class ImageCrawlController, method main.

public static void main(String[] args) throws Exception {
    if (args.length < 3) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCralwers (number of concurrent threads)");
        logger.info("\t storageFolder (a folder for storing downloaded images)");
        return;
    }
    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);
    String storageFolder = args[2];
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(rootFolder);
    /*
     * Since images are binary content, we need to set this parameter to
     * true to make sure they are included in the crawl.
     */
    config.setIncludeBinaryContentInCrawling(true);
    String[] crawlDomains = { "http://uci.edu/" };
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    for (String domain : crawlDomains) {
        controller.addSeed(domain);
    }
    ImageCrawler.configure(crawlDomains, storageFolder);
    controller.start(ImageCrawler.class, numberOfCrawlers);
}
Also used: PageFetcher (edu.uci.ics.crawler4j.fetcher.PageFetcher), RobotstxtServer (edu.uci.ics.crawler4j.robotstxt.RobotstxtServer), CrawlController (edu.uci.ics.crawler4j.crawler.CrawlController), CrawlConfig (edu.uci.ics.crawler4j.crawler.CrawlConfig), RobotstxtConfig (edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig)
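The ImageCrawler class referenced by ImageCrawler.configure(crawlDomains, storageFolder) is not part of this snippet. Below is a minimal sketch of what such a WebCrawler subclass could look like, assuming configure simply stores the allowed domains and the image storage folder; the field names and filtering logic are illustrative, not the project's actual implementation.

import java.io.File;
import java.nio.file.Files;
import java.util.UUID;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class ImageCrawler extends WebCrawler {

    private static final Pattern IMG_PATTERN = Pattern.compile(".*\\.(bmp|gif|jpe?g|png)$");

    private static String[] crawlDomains;
    private static File storageFolder;

    // Hypothetical helper matching the call in the controller above.
    public static void configure(String[] domains, String storageFolderName) {
        crawlDomains = domains;
        storageFolder = new File(storageFolderName);
        if (!storageFolder.exists()) {
            storageFolder.mkdirs();
        }
    }

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Stay inside the configured domains.
        String href = url.getURL().toLowerCase();
        for (String domain : crawlDomains) {
            if (href.startsWith(domain)) {
                return true;
            }
        }
        return false;
    }

    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        // The raw bytes are only fetched for images because the controller
        // enabled setIncludeBinaryContentInCrawling(true).
        if (IMG_PATTERN.matcher(url.toLowerCase()).matches()) {
            String extension = url.substring(url.lastIndexOf('.'));
            File target = new File(storageFolder, UUID.randomUUID() + extension);
            try {
                Files.write(target.toPath(), page.getContentData());
            } catch (Exception e) {
                logger.error("Could not store image {}", url, e);
            }
        }
    }
}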

Example 2 with RobotstxtConfig

Use of edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig in project crawler4j by yasserg, in class MultipleCrawlerController, method main.

public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        logger.info("Needed parameter: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        return;
    }
    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];
    CrawlConfig config1 = new CrawlConfig();
    CrawlConfig config2 = new CrawlConfig();
    /*
     * The two crawlers should have different storage folders for their
     * intermediate data
     */
    config1.setCrawlStorageFolder(crawlStorageFolder + "/crawler1");
    config2.setCrawlStorageFolder(crawlStorageFolder + "/crawler2");
    config1.setPolitenessDelay(1000);
    config2.setPolitenessDelay(2000);
    config1.setMaxPagesToFetch(50);
    config2.setMaxPagesToFetch(100);
    /*
     * We will use different PageFetchers for the two crawlers.
     */
    PageFetcher pageFetcher1 = new PageFetcher(config1);
    PageFetcher pageFetcher2 = new PageFetcher(config2);
    /*
     * We will use the same RobotstxtServer for both of the crawlers.
     */
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher1);
    CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
    CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);
    String[] crawler1Domains = { "http://www.ics.uci.edu/", "http://www.cnn.com/" };
    String[] crawler2Domains = { "http://en.wikipedia.org/" };
    controller1.setCustomData(crawler1Domains);
    controller2.setCustomData(crawler2Domains);
    controller1.addSeed("http://www.ics.uci.edu/");
    controller1.addSeed("http://www.cnn.com/");
    controller1.addSeed("http://www.ics.uci.edu/~lopes/");
    controller1.addSeed("http://www.cnn.com/POLITICS/");
    controller2.addSeed("http://en.wikipedia.org/wiki/Main_Page");
    controller2.addSeed("http://en.wikipedia.org/wiki/Obama");
    controller2.addSeed("http://en.wikipedia.org/wiki/Bing");
    /*
     * The first crawler will have 5 concurrent threads and the second
     * crawler will have 7 threads.
     */
    controller1.startNonBlocking(BasicCrawler.class, 5);
    controller2.startNonBlocking(BasicCrawler.class, 7);
    controller1.waitUntilFinish();
    logger.info("Crawler 1 is finished.");
    controller2.waitUntilFinish();
    logger.info("Crawler 2 is finished.");
}
Also used: PageFetcher (edu.uci.ics.crawler4j.fetcher.PageFetcher), RobotstxtServer (edu.uci.ics.crawler4j.robotstxt.RobotstxtServer), CrawlController (edu.uci.ics.crawler4j.crawler.CrawlController), CrawlConfig (edu.uci.ics.crawler4j.crawler.CrawlConfig), RobotstxtConfig (edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig)
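Both controllers start the same BasicCrawler class but hand it different domain lists through setCustomData. One possible way for the crawler to read that data back and confine itself to its own domains is sketched below; the myCrawlDomains field and the onStart wiring are assumptions for illustration, not necessarily the project's BasicCrawler.

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class BasicCrawler extends WebCrawler {

    private String[] myCrawlDomains;

    @Override
    public void onStart() {
        // Each controller passed its own domain list via setCustomData above.
        myCrawlDomains = (String[]) getMyController().getCustomData();
    }

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        for (String domain : myCrawlDomains) {
            if (href.startsWith(domain)) {
                return true;
            }
        }
        return false;
    }

    @Override
    public void visit(Page page) {
        logger.info("Visited: {}", page.getWebURL().getURL());
    }
}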

Example 3 with RobotstxtConfig

Use of edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig in project crawler4j by yasserg, in class ControllerWithShutdown, method main.

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCralwers (number of concurrent threads)");
        return;
    }
    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];
    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    config.setPolitenessDelay(1000);
    // Unlimited number of pages can be crawled.
    config.setMaxPagesToFetch(-1);
    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    /*
     * For each crawl, you need to add some seed urls. These are the first
     * URLs that are fetched and then the crawler starts following links
     * which are found in these pages
     */
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/");
    /*
     * Start the crawl in non-blocking mode so that this thread can continue,
     * wait for a while, and then request a graceful shutdown itself.
     */
    controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);
    // Wait for 30 seconds
    Thread.sleep(30 * 1000);
    // Send the shutdown request, then wait for the crawl to finish
    controller.shutdown();
    controller.waitUntilFinish();
}
Also used: PageFetcher (edu.uci.ics.crawler4j.fetcher.PageFetcher), RobotstxtServer (edu.uci.ics.crawler4j.robotstxt.RobotstxtServer), CrawlController (edu.uci.ics.crawler4j.crawler.CrawlController), CrawlConfig (edu.uci.ics.crawler4j.crawler.CrawlConfig), RobotstxtConfig (edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig)
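A possible variation on the fixed 30-second sleep above, sketched under the assumption that the rest of main stays as shown: register a JVM shutdown hook so that a normal JVM shutdown (for example Ctrl-C) triggers the same graceful stop.

// Inside main, after the controller and seeds have been set up as above:
// Any normal JVM shutdown now requests a graceful stop of the crawlers.
Runtime.getRuntime().addShutdownHook(new Thread(controller::shutdown));
controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);
controller.waitUntilFinish();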

Example 4 with RobotstxtConfig

Use of edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig in project crawler4j by yasserg, in class StatusHandlerCrawlController, method main.

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCralwers (number of concurrent threads)");
        return;
    }
    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];
    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    /*
     * Be polite: Make sure that we don't send more than 1 request per
     * second (1000 milliseconds between requests).
     */
    config.setPolitenessDelay(1000);
    /*
     * You can set the maximum crawl depth here. The default value is -1 for
     * unlimited depth
     */
    config.setMaxDepthOfCrawling(2);
    /*
     * You can set the maximum number of pages to crawl. The default value
     * is -1 for unlimited number of pages
     */
    config.setMaxPagesToFetch(1000);
    /*
     * Do you need to set a proxy? If so, you can use:
     * config.setProxyHost("proxyserver.example.com");
     * config.setProxyPort(8080);
     *
     * If your proxy also needs authentication:
     * config.setProxyUsername(username); config.setProxyPassword(password);
     */
    /*
     * This config parameter can be used to set your crawl to be resumable
     * (meaning that you can resume the crawl from a previously
     * interrupted/crashed crawl). Note: if you enable resuming feature and
     * want to start a fresh crawl, you need to delete the contents of
     * rootFolder manually.
     */
    config.setResumableCrawling(false);
    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    /*
     * For each crawl, you need to add some seed urls. These are the first
     * URLs that are fetched and then the crawler starts following links
     * which are found in these pages
     */
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/");
    /*
     * Start the crawl. This is a blocking operation, meaning that your code
     * will reach the line after this only when crawling is finished.
     */
    controller.start(StatusHandlerCrawler.class, numberOfCrawlers);
}
Also used: PageFetcher (edu.uci.ics.crawler4j.fetcher.PageFetcher), RobotstxtServer (edu.uci.ics.crawler4j.robotstxt.RobotstxtServer), CrawlController (edu.uci.ics.crawler4j.crawler.CrawlController), CrawlConfig (edu.uci.ics.crawler4j.crawler.CrawlConfig), RobotstxtConfig (edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig)
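The StatusHandlerCrawler started here is not shown. A minimal sketch, assuming it overrides WebCrawler's handlePageStatusCode callback and simply logs non-200 responses (the exact handling is an assumption):

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class StatusHandlerCrawler extends WebCrawler {

    @Override
    protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
        // Called for every fetched URL, whether or not the fetch succeeded.
        if (statusCode != 200) {
            logger.warn("Fetch of {} returned {} ({})", webUrl.getURL(), statusCode, statusDescription);
        }
    }

    @Override
    public void visit(Page page) {
        // Successfully fetched and parsed pages still arrive here.
        logger.info("Visited: {}", page.getWebURL().getURL());
    }
}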

Example 5 with RobotstxtConfig

Use of edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig in project yyl_example by Relucent, in class MyControllerTest, method main.

public static void main(String[] args) throws Exception {
    // Crawler configuration
    CrawlConfig config = new CrawlConfig();
    // Maximum crawl depth
    config.setMaxDepthOfCrawling(5);
    // Maximum number of pages to fetch
    config.setMaxPagesToFetch(Integer.MAX_VALUE);
    // Wait 200 milliseconds before each request
    config.setPolitenessDelay(200);
    // Folder for storing intermediate crawl data
    config.setCrawlStorageFolder(System.getProperty("user.dir") + "/temp/crawl");
    // Proxy settings (optional)
    //config.setProxyHost("proxyserver.example.com");
    //config.setProxyPort(8080);
    //config.setProxyUsername(username);
    //config.setProxyPassword(password);
    // Enable resuming a stopped/crashed crawl
    //config.setResumableCrawling(true);
    // Instantiate the controller for this crawl
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    // Seed URLs
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/");
    // Number of concurrent crawler threads
    int numberOfCrawlers = 7;
    // Start crawling (blocking operation)
    controller.start(MyCrawler.class, numberOfCrawlers);
}
Also used: PageFetcher (edu.uci.ics.crawler4j.fetcher.PageFetcher), RobotstxtServer (edu.uci.ics.crawler4j.robotstxt.RobotstxtServer), CrawlController (edu.uci.ics.crawler4j.crawler.CrawlController), CrawlConfig (edu.uci.ics.crawler4j.crawler.CrawlConfig), RobotstxtConfig (edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig)
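MyCrawler itself is not included in the snippet. A minimal sketch of the kind of WebCrawler subclass it is likely to be, following the usual crawler4j pattern; the extension filter and the www.ics.uci.edu domain restriction are assumptions chosen to match the seeds above.

import java.util.Set;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    // Skip common binary and style resources; only HTML pages are of interest here.
    private static final Pattern FILTERS =
            Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|mp3|mp4|zip|gz))$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
    }

    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        logger.info("Visited: {}", url);
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String text = htmlParseData.getText();
            Set<WebURL> links = htmlParseData.getOutgoingUrls();
            logger.info("Text length: {}, outgoing links: {}", text.length(), links.size());
        }
    }
}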

Aggregations

RobotstxtConfig (edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig): 9 usages
CrawlConfig (edu.uci.ics.crawler4j.crawler.CrawlConfig): 8 usages
CrawlController (edu.uci.ics.crawler4j.crawler.CrawlController): 8 usages
PageFetcher (edu.uci.ics.crawler4j.fetcher.PageFetcher): 8 usages
RobotstxtServer (edu.uci.ics.crawler4j.robotstxt.RobotstxtServer): 8 usages
HostDirectives (edu.uci.ics.crawler4j.robotstxt.HostDirectives): 1 usage
Test (org.junit.Test): 1 usage