use of edu.uci.ics.crawler4j.robotstxt.RobotstxtServer in project crawler4j by yasserg.
the class ImageCrawlController method main.
public static void main(String[] args) throws Exception {
    if (args.length < 3) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCrawlers (number of concurrent threads)");
        logger.info("\t storageFolder (a folder for storing downloaded images)");
        return;
    }
    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);
    String storageFolder = args[2];
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(rootFolder);
    /*
     * Since images are binary content, we need to set this parameter to
     * true to make sure they are included in the crawl.
     */
    config.setIncludeBinaryContentInCrawling(true);
    String[] crawlDomains = { "http://uci.edu/" };
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    for (String domain : crawlDomains) {
        controller.addSeed(domain);
    }
    ImageCrawler.configure(crawlDomains, storageFolder);
    controller.start(ImageCrawler.class, numberOfCrawlers);
}
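The controller above delegates all per-page work to ImageCrawler, which is configured with the allowed domains and the image storage folder before the crawl starts. The project's actual ImageCrawler class is not shown on this page; the following is only a minimal sketch of what such a WebCrawler subclass could look like, assuming the crawler4j 4.x API (shouldVisit/visit, Page.getContentData, BinaryParseData). The hash-based file name and the extension pattern are illustrative choices, not the project's.

// Sketch only: the real ImageCrawler in the crawler4j examples may differ.
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.BinaryParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class ImageCrawler extends WebCrawler {

    private static final Pattern IMG_PATTERN = Pattern.compile(".*(\\.(bmp|gif|jpe?g|png))$");

    private static String[] crawlDomains;
    private static String storageFolder;

    public static void configure(String[] domains, String storageFolderName) {
        crawlDomains = domains;
        storageFolder = storageFolderName;
    }

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Stay inside the configured domains; HTML pages are still visited
        // so that links to images can be discovered.
        String href = url.getURL().toLowerCase();
        for (String domain : crawlDomains) {
            if (href.startsWith(domain)) {
                return true;
            }
        }
        return false;
    }

    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        // Only binary image responses are persisted.
        if (!(page.getParseData() instanceof BinaryParseData)
                || !IMG_PATTERN.matcher(url.toLowerCase()).matches()) {
            return;
        }
        try {
            // Store the raw bytes under a name derived from the URL hash (illustrative choice).
            Path target = Paths.get(storageFolder, url.hashCode() + ".img");
            Files.write(target, page.getContentData());
        } catch (IOException e) {
            logger.error("Failed to store image from {}", url, e);
        }
    }
}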
use of edu.uci.ics.crawler4j.robotstxt.RobotstxtServer in project crawler4j by yasserg.
the class MultipleCrawlerController method main.
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        logger.info("Needed parameter: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        return;
    }
    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];
    CrawlConfig config1 = new CrawlConfig();
    CrawlConfig config2 = new CrawlConfig();
    /*
     * The two crawlers should have different storage folders for their
     * intermediate data.
     */
    config1.setCrawlStorageFolder(crawlStorageFolder + "/crawler1");
    config2.setCrawlStorageFolder(crawlStorageFolder + "/crawler2");
    config1.setPolitenessDelay(1000);
    config2.setPolitenessDelay(2000);
    config1.setMaxPagesToFetch(50);
    config2.setMaxPagesToFetch(100);
    /*
     * We will use different PageFetchers for the two crawlers.
     */
    PageFetcher pageFetcher1 = new PageFetcher(config1);
    PageFetcher pageFetcher2 = new PageFetcher(config2);
    /*
     * We will use the same RobotstxtServer for both of the crawlers.
     */
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher1);
    CrawlController controller1 = new CrawlController(config1, pageFetcher1, robotstxtServer);
    CrawlController controller2 = new CrawlController(config2, pageFetcher2, robotstxtServer);
    String[] crawler1Domains = { "http://www.ics.uci.edu/", "http://www.cnn.com/" };
    String[] crawler2Domains = { "http://en.wikipedia.org/" };
    controller1.setCustomData(crawler1Domains);
    controller2.setCustomData(crawler2Domains);
    controller1.addSeed("http://www.ics.uci.edu/");
    controller1.addSeed("http://www.cnn.com/");
    controller1.addSeed("http://www.ics.uci.edu/~lopes/");
    controller1.addSeed("http://www.cnn.com/POLITICS/");
    controller2.addSeed("http://en.wikipedia.org/wiki/Main_Page");
    controller2.addSeed("http://en.wikipedia.org/wiki/Obama");
    controller2.addSeed("http://en.wikipedia.org/wiki/Bing");
    /*
     * The first crawler will have 5 concurrent threads and the second
     * crawler will have 7 threads.
     */
    controller1.startNonBlocking(BasicCrawler.class, 5);
    controller2.startNonBlocking(BasicCrawler.class, 7);
    controller1.waitUntilFinish();
    logger.info("Crawler 1 is finished.");
    controller2.waitUntilFinish();
    logger.info("Crawler 2 is finished.");
}
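Each controller is handed its own domain list through setCustomData, which crawler instances can read back at runtime through their controller. The BasicCrawler class itself is not included on this page; a minimal sketch of how shouldVisit might restrict each crawler to its controller's domains, assuming the standard WebCrawler and getCustomData API, could look like this (illustrative only, not the project's class):

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class BasicCrawler extends WebCrawler {

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // The domain whitelist was attached to this crawler's controller via setCustomData.
        String[] myCrawlDomains = (String[]) myController.getCustomData();
        String href = url.getURL().toLowerCase();
        for (String crawlDomain : myCrawlDomains) {
            if (href.startsWith(crawlDomain)) {
                return true;
            }
        }
        return false;
    }

    @Override
    public void visit(Page page) {
        // For this sketch, just log which crawler thread fetched which URL.
        logger.info("Crawler {} visited {}", getMyId(), page.getWebURL().getURL());
    }
}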
use of edu.uci.ics.crawler4j.robotstxt.RobotstxtServer in project crawler4j by yasserg.
the class ControllerWithShutdown method main.
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }
    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];
    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    config.setPolitenessDelay(1000);
    // An unlimited number of pages can be crawled.
    config.setMaxPagesToFetch(-1);
    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    /*
     * For each crawl, you need to add some seed URLs. These are the first
     * URLs that are fetched; the crawler then starts following the links
     * found in these pages.
     */
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/");
    /*
     * Start the crawl in non-blocking mode, so that the code below runs
     * while the crawl is still in progress.
     */
    controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);
    // Wait for 30 seconds.
    Thread.sleep(30 * 1000);
    // Send the shutdown request and then wait for the crawl to finish.
    controller.shutdown();
    controller.waitUntilFinish();
}
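The shutdown/waitUntilFinish pair can be triggered from places other than a fixed timer. As a hedged variation on the tail of the method above (not part of the original example), a JVM shutdown hook lets an operator stop the crawl cleanly with Ctrl-C; controller and numberOfCrawlers refer to the variables already defined in that method.

// Variation: shut down gracefully on JVM exit instead of after a fixed 30-second sleep.
Runtime.getRuntime().addShutdownHook(new Thread(() -> {
    controller.shutdown();          // ask all crawler threads to stop
    controller.waitUntilFinish();   // block until they have flushed their state
}));
controller.startNonBlocking(BasicCrawler.class, numberOfCrawlers);
controller.waitUntilFinish();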
use of edu.uci.ics.crawler4j.robotstxt.RobotstxtServer in project crawler4j by yasserg.
the class StatusHandlerCrawlController method main.
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }
    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];
    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);
    /*
     * Be polite: make sure that we don't send more than 1 request per
     * second (1000 milliseconds between requests).
     */
    config.setPolitenessDelay(1000);
    /*
     * You can set the maximum crawl depth here. The default value is -1
     * for unlimited depth.
     */
    config.setMaxDepthOfCrawling(2);
    /*
     * You can set the maximum number of pages to crawl. The default value
     * is -1 for an unlimited number of pages.
     */
    config.setMaxPagesToFetch(1000);
    /*
     * Do you need to set a proxy? If so, you can use:
     * config.setProxyHost("proxyserver.example.com");
     * config.setProxyPort(8080);
     *
     * If your proxy also needs authentication:
     * config.setProxyUsername(username); config.setProxyPassword(password);
     */
    /*
     * This config parameter can be used to make your crawl resumable
     * (meaning that you can resume the crawl from a previously
     * interrupted/crashed crawl). Note: if you enable the resuming feature
     * and want to start a fresh crawl, you need to delete the contents of
     * rootFolder manually.
     */
    config.setResumableCrawling(false);
    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    /*
     * For each crawl, you need to add some seed URLs. These are the first
     * URLs that are fetched; the crawler then starts following the links
     * found in these pages.
     */
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/");
    /*
     * Start the crawl. This is a blocking operation, meaning that your
     * code will reach the line after this only when crawling is finished.
     */
    controller.start(StatusHandlerCrawler.class, numberOfCrawlers);
}
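The StatusHandlerCrawler class referenced above is not shown on this page. One way to react to per-page HTTP status codes, assuming the handlePageStatusCode hook that crawler4j's WebCrawler exposes, is a sketch like the following (an illustration, not necessarily the project's original class):

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class StatusHandlerCrawler extends WebCrawler {

    @Override
    protected void handlePageStatusCode(WebURL webUrl, int statusCode, String statusDescription) {
        // Called for every fetched URL with the HTTP status returned by the server.
        if (statusCode != 200) {
            logger.warn("Got status {} ({}) for {}", statusCode, statusDescription, webUrl.getURL());
        }
    }

    @Override
    public void visit(Page page) {
        logger.info("Visited: {}", page.getWebURL().getURL());
    }
}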
use of edu.uci.ics.crawler4j.robotstxt.RobotstxtServer in project yyl_example by Relucent.
the class MyControllerTest method main.
public static void main(String[] args) throws Exception {
    //# Crawler configuration
    CrawlConfig config = new CrawlConfig();
    //# Set the maximum crawl depth
    config.setMaxDepthOfCrawling(5);
    //# Set the maximum number of pages to fetch
    config.setMaxPagesToFetch(Integer.MAX_VALUE);
    // Wait 200 milliseconds before each request
    config.setPolitenessDelay(200);
    // Folder where crawl data is stored
    config.setCrawlStorageFolder(System.getProperty("user.dir") + "/temp/crawl");
    //# Proxy settings
    //config.setProxyHost("proxyserver.example.com");
    //config.setProxyPort(8080);
    //config.setProxyUsername(username);
    //config.setProxyPassword(password);
    //# Enable resuming a stopped/crashed crawl
    //config.setResumableCrawling(true);
    //# Instantiate the controller
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    //# Seed URLs
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/~welling/");
    controller.addSeed("http://www.ics.uci.edu/");
    // Number of concurrent crawler threads
    int numberOfCrawlers = 7;
    //# Start crawling (blocking operation)
    controller.start(MyCrawler.class, numberOfCrawlers);
}
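The MyCrawler class used here is not included on this page. The usual crawler4j pattern for such a crawler is a WebCrawler subclass that filters out non-HTML resources in shouldVisit and processes parsed HTML in visit; a minimal sketch under those assumptions (the filter pattern and the www.ics.uci.edu restriction are illustrative):

import java.util.Set;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    // Skip common binary resources such as stylesheets, scripts and media files.
    private static final Pattern FILTERS =
        Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|mp3|mp4|zip|gz|pdf))$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/");
    }

    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            Set<WebURL> links = htmlParseData.getOutgoingUrls();
            logger.info("Visited {} ({} characters of text, {} outgoing links)",
                url, htmlParseData.getText().length(), links.size());
        }
    }
}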