use of edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig in project mastering-java by Kingminghuang.
the class Controller method main.
/**
 * Builds a crawler4j crawl from hard-coded settings and starts it.
 *
 * <p>The storage folder, proxy, seeds and thread count are all fixed in code;
 * {@code args} is not read.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final int crawlerThreads = 7;

    // Crawl settings. Everything must be applied before the PageFetcher is
    // constructed from this config object.
    CrawlConfig settings = new CrawlConfig();
    settings.setCrawlStorageFolder("D:\\crawl\\data");
    settings.setIncludeHttpsPages(true);
    settings.setProxyHost("cn-proxy.jp.oracle.com");
    settings.setProxyPort(80);
    settings.setPolitenessDelay(POLITENESS_DELAY);
    settings.setMaxDepthOfCrawling(MAX_DEPTH_OF_CRAWLING);

    PageFetcher fetcher = new PageFetcher(settings);
    RobotstxtServer robots = new RobotstxtServer(new RobotstxtConfig(), fetcher);

    // NOTE(review): "~loops" looks like a typo of "~lopes" - confirm before changing.
    final String[] seedUrls = {
        "http://www.ics.uci.edu/~loops/",
        "http://www.ics.uci.edu/~welling",
        "http://www.ics.uci.edu/"
    };

    try {
        CrawlController controller = new CrawlController(settings, fetcher, robots);
        for (String seedUrl : seedUrls) {
            controller.addSeed(seedUrl);
        }
        controller.start(Crawler.class, crawlerThreads);
    } catch (Exception e) {
        // Any failure while setting up or running the crawl is only printed; it is not rethrown.
        e.printStackTrace();
    }
}
use of edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig in project crawler4j by yasserg.
the class BasicCrawlController method main.
/**
 * Runs a basic crawl using the storage folder and thread count given on the
 * command line.
 *
 * <p>If exactly two arguments are not supplied, a usage message is logged and
 * the method returns without crawling.
 *
 * @param args {@code args[0]} is the root folder for intermediate crawl data;
 *             {@code args[1]} is the number of concurrent crawler threads
 * @throws Exception if {@code args[1]} is not a valid integer, or if setting
 *                   up or running the crawl fails
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }

    /*
     * crawlStorageFolder is a folder where intermediate crawl data is
     * stored.
     */
    String crawlStorageFolder = args[0];

    /*
     * numberOfCrawlers shows the number of concurrent threads that should
     * be initiated for crawling.
     */
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(crawlStorageFolder);

    /*
     * Be polite: Make sure that we don't send more than 1 request per
     * second (1000 milliseconds between requests).
     */
    config.setPolitenessDelay(1000);

    /*
     * You can set the maximum crawl depth here. The default value is -1 for
     * unlimited depth.
     */
    config.setMaxDepthOfCrawling(2);

    /*
     * You can set the maximum number of pages to crawl. The default value
     * is -1 for unlimited number of pages.
     */
    config.setMaxPagesToFetch(1000);

    /*
     * Do you want crawler4j to crawl also binary data?
     * Example: the contents of pdf, or the metadata of images etc.
     */
    config.setIncludeBinaryContentInCrawling(false);

    /*
     * Do you need to set a proxy? If so, you can use:
     * config.setProxyHost("proxyserver.example.com");
     * config.setProxyPort(8080);
     *
     * If your proxy also needs authentication:
     * config.setProxyUsername(username); config.setProxyPassword(password);
     */

    /*
     * This config parameter can be used to set your crawl to be resumable
     * (meaning that you can resume the crawl from a previously
     * interrupted/crashed crawl). Note: if you enable resuming feature and
     * want to start a fresh crawl, you need to delete the contents of
     * rootFolder manually.
     */
    config.setResumableCrawling(false);

    /*
     * Instantiate the controller for this crawl.
     */
    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    /*
     * For each crawl, you need to add some seed urls. These are the first
     * URLs that are fetched and then the crawler starts following links
     * which are found in these pages.
     */
    controller.addSeed("http://www.ics.uci.edu/");
    controller.addSeed("http://www.ics.uci.edu/~lopes/");
    controller.addSeed("http://www.ics.uci.edu/~welling/");

    /*
     * Start the crawl. This is a blocking operation, meaning that your code
     * will reach the line after this only when crawling is finished.
     */
    controller.start(BasicCrawler.class, numberOfCrawlers);
}
use of edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig in project crawler4j by yasserg.
the class LocalDataCollectorController method main.
/**
 * Runs a small crawl and logs statistics aggregated over all crawler threads.
 *
 * <p>If exactly two arguments are not supplied, a usage message is logged and
 * the method returns without crawling. Otherwise the crawl is started and,
 * once {@code start} returns, each crawler's local data is summed into totals
 * for processed pages, links found and text size.
 *
 * @param args {@code args[0]} is the root folder for intermediate crawl data;
 *             {@code args[1]} is the number of concurrent crawler threads
 * @throws Exception if {@code args[1]} is not a valid integer, or if setting
 *                   up or running the crawl fails
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        logger.info("Needed parameters: ");
        logger.info("\t rootFolder (it will contain intermediate crawl data)");
        logger.info("\t numberOfCrawlers (number of concurrent threads)");
        return;
    }

    String rootFolder = args[0];
    int numberOfCrawlers = Integer.parseInt(args[1]);

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder(rootFolder);
    config.setMaxPagesToFetch(10);
    config.setPolitenessDelay(1000);

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

    controller.addSeed("http://www.ics.uci.edu/");
    controller.start(LocalDataCollectorCrawler.class, numberOfCrawlers);

    // Each entry is cast to CrawlStat below; the per-crawler figures are summed.
    List<Object> crawlersLocalData = controller.getCrawlersLocalData();
    long totalLinks = 0;
    long totalTextSize = 0;
    int totalProcessedPages = 0;
    for (Object localData : crawlersLocalData) {
        CrawlStat stat = (CrawlStat) localData;
        totalLinks += stat.getTotalLinks();
        totalTextSize += stat.getTotalTextSize();
        totalProcessedPages += stat.getTotalProcessedPages();
    }

    logger.info("Aggregated Statistics:");
    logger.info("\tProcessed Pages: {}", totalProcessedPages);
    logger.info("\tTotal Links found: {}", totalLinks);
    logger.info("\tTotal Text Size: {}", totalTextSize);
}
use of edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig in project crawler4j by yasserg.
the class RobotstxtParserNonLowercaseUserAgentTest method testParseWithNonLowercaseUserAgent.
/**
 * Parses a robots.txt body whose {@code User-agent} line uses a mixed-case
 * name, with the config set to that same name, and expects the disallowed
 * path to be rejected.
 */
@Test
public void testParseWithNonLowercaseUserAgent() {
    final String agentName = "testAgent";
    final String blockedPath = "/test/path/";

    // One rule group addressed to the mixed-case agent, disallowing a single path.
    final String robotsTxt = "User-agent: " + agentName + "\nDisallow: " + blockedPath + "\n";

    final RobotstxtConfig robotsConfig = new RobotstxtConfig();
    robotsConfig.setUserAgentName(agentName);

    final HostDirectives parsed = RobotstxtParser.parse(robotsTxt, robotsConfig);

    assertNotNull("parsed HostDirectives is null", parsed);
    assertFalse("HostDirectives should not allow path: '" + blockedPath + "'", parsed.allows(blockedPath));
}
Aggregations