Use of edu.uci.ics.crawler4j.crawler.CrawlConfig in project yyl_example by Relucent.
Class MyControllerTest, method main.
/**
 * Configures a crawler4j crawl and runs it over a fixed set of seed URLs.
 *
 * @param args command-line arguments (not read)
 * @throws Exception if setting up the controller or running the crawl fails
 */
public static void main(String[] args) throws Exception {
    // Folder where crawl data is stored, under the current working directory.
    final String storageFolder = System.getProperty("user.dir") + "/temp/crawl";
    // Number of concurrent crawlers.
    final int crawlerCount = 7;
    // Seed URLs handed to the controller, in this order.
    final String[] seeds = {
        "http://www.ics.uci.edu/~lopes/",
        "http://www.ics.uci.edu/~welling/",
        "http://www.ics.uci.edu/"
    };

    // Crawler configuration.
    CrawlConfig config = new CrawlConfig();
    // Crawl depth limit.
    config.setMaxDepthOfCrawling(5);
    // Maximum number of pages to fetch.
    config.setMaxPagesToFetch(Integer.MAX_VALUE);
    // Wait 200 milliseconds before each request.
    config.setPolitenessDelay(200);
    config.setCrawlStorageFolder(storageFolder);

    // Optional proxy settings (disabled here):
    //   config.setProxyHost("proxyserver.example.com");
    //   config.setProxyPort(8080);
    //   config.setProxyUsername(username);
    //   config.setProxyPassword(password);
    // Optional: resume a previously stopped/crashed crawl (disabled here):
    //   config.setResumableCrawling(true);

    // Build the controller from the fetcher and the robots.txt server.
    PageFetcher fetcher = new PageFetcher(config);
    RobotstxtServer robots = new RobotstxtServer(new RobotstxtConfig(), fetcher);
    CrawlController controller = new CrawlController(config, fetcher, robots);

    for (String seed : seeds) {
        controller.addSeed(seed);
    }

    // Start crawling (blocking operation).
    controller.start(MyCrawler.class, crawlerCount);
}
Use of edu.uci.ics.crawler4j.crawler.CrawlConfig in project mastering-java by Kingminghuang.
Class Controller, method main.
/**
 * Configures a crawler4j crawl that goes through an HTTP proxy and runs it
 * over a fixed set of seed URLs. Any exception raised while creating the
 * controller, adding seeds, or crawling is printed to standard error.
 *
 * @param args command-line arguments (not read)
 */
public static void main(String[] args) {
    final int crawlerCount = 7;
    final String storagePath = "D:\\crawl\\data";
    final String[] seeds = {
        "http://www.ics.uci.edu/~loops/",
        "http://www.ics.uci.edu/~welling",
        "http://www.ics.uci.edu/"
    };

    CrawlConfig settings = new CrawlConfig();
    settings.setCrawlStorageFolder(storagePath);
    settings.setIncludeHttpsPages(true);
    // Proxy host and port are hard-coded.
    settings.setProxyHost("cn-proxy.jp.oracle.com");
    settings.setProxyPort(80);
    // Delay and depth limits come from constants declared outside this method.
    settings.setPolitenessDelay(POLITENESS_DELAY);
    settings.setMaxDepthOfCrawling(MAX_DEPTH_OF_CRAWLING);

    PageFetcher fetcher = new PageFetcher(settings);
    RobotstxtServer robots = new RobotstxtServer(new RobotstxtConfig(), fetcher);

    try {
        CrawlController controller = new CrawlController(settings, fetcher, robots);
        for (String seed : seeds) {
            controller.addSeed(seed);
        }
        controller.start(Crawler.class, crawlerCount);
    } catch (Exception e) {
        e.printStackTrace();
    }
}
Use of edu.uci.ics.crawler4j.crawler.CrawlConfig in project crawler4j by yasserg.
Class BasicCrawlController, method main.
/**
 * Runs a basic crawl with {@code BasicCrawler}.
 *
 * <p>Expects exactly two arguments: the root folder for intermediate crawl
 * data and the number of concurrent crawler threads. With any other argument
 * count, a usage message is logged and the method returns without crawling.
 *
 * @param args {@code args[0]} = crawl storage folder,
 *             {@code args[1]} = number of crawlers (parsed with {@link Integer#parseInt})
 * @throws Exception if the second argument is not an integer, or if setting
 *                   up the controller or running the crawl fails
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }
    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];
    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    /*
     * Be polite: Make sure that we don't send more than 1 request per
     * second (1000 milliseconds between requests).
     */
    config.setPolitenessDelay(1000);
    /*
     * You can set the maximum crawl depth here. The default value is -1 for
     * unlimited depth.
     */
    config.setMaxDepthOfCrawling(2);
    /*
     * You can set the maximum number of pages to crawl. The default value
     * is -1 for unlimited number of pages.
     */
    config.setMaxPagesToFetch(1000);
    /*
     * Do you want crawler4j to also crawl binary data?
     * Example: the contents of PDFs, or the metadata of images, etc.
     */
    config.setIncludeBinaryContentInCrawling(false);
    /*
     * Do you need to set a proxy? If so, you can use:
     * config.setProxyHost("proxyserver.example.com");
     * config.setProxyPort(8080);
     *
     * If your proxy also needs authentication:
     * config.setProxyUsername(username); config.setProxyPassword(password);
     */
    /*
     * This config parameter can be used to set your crawl to be resumable
     * (meaning that you can resume the crawl from a previously
     * interrupted/crashed crawl). Note: if you enable resuming feature and
     * want to start a fresh crawl, you need to delete the contents of
     * rootFolder manually.
     */
    config.setResumableCrawling(false);
    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    /*
     * For each crawl, you need to add some seed urls. These are the first
     * URLs that are fetched and then the crawler starts following links
     * which are found in these pages.
     */
    controller.addSeed("http://www.ics.uci.edu/");
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    /*
     * Start the crawl. This is a blocking operation, meaning that your code
     * will reach the line after this only when crawling is finished.
     */
    controller.start(BasicCrawler.class, numberOfCrawlers);
}
Use of edu.uci.ics.crawler4j.crawler.CrawlConfig in project crawler4j by yasserg.
Class LocalDataCollectorController, method main.
/**
 * Runs a small crawl with {@code LocalDataCollectorCrawler} and logs the
 * statistics aggregated across all crawler instances.
 *
 * <p>Expects exactly two arguments: the root folder for intermediate crawl
 * data and the number of concurrent crawler threads. With any other argument
 * count, a usage message is logged and the method returns without crawling.
 *
 * @param args {@code args[0]} = crawl storage folder,
 *             {@code args[1]} = number of crawlers (parsed with {@link Integer#parseInt})
 * @throws Exception if the second argument is not an integer, or if setting
 *                   up the controller or running the crawl fails
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }
    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(rootFolder);
    config.setMaxPagesToFetch(10);
    config.setPolitenessDelay(1000);

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    controller.addSeed("http://www.ics.uci.edu/");
    controller.start(LocalDataCollectorCrawler.class, numberOfCrawlers);

    // The statistics are read after start() returns; each entry in the list
    // is cast to CrawlStat and its counters are summed.
    List<Object> crawlersLocalData = controller.getCrawlersLocalData();
    long totalLinks = 0;
    long totalTextSize = 0;
    int totalProcessedPages = 0;
    for (Object localData : crawlersLocalData) {
        CrawlStat stat = (CrawlStat) localData;
        totalLinks += stat.getTotalLinks();
        totalTextSize += stat.getTotalTextSize();
        totalProcessedPages += stat.getTotalProcessedPages();
    }

    logger.info("Aggregated Statistics:");
    logger.info("\tProcessed Pages: {}", totalProcessedPages);
    logger.info("\tTotal Links found: {}", totalLinks);
    logger.info("\tTotal Text Size: {}", totalTextSize);
}
Aggregations