Use of edu.uci.ics.crawler4j.fetcher.PageFetcher in project crawler4j by yasserg.
Class BasicCrawlController, method main.
public static void main(String[] args) throws Exception {
CrawlConfig config = new CrawlConfig();
// Set the folder where intermediate crawl data is stored (e.g. the list of URLs that were extracted from
// previously fetched pages and still need to be crawled).
config.setCrawlStorageFolder("/tmp/crawler4j/");
// Be polite: Make sure that we don't send more than 1 request per second (1000 milliseconds between requests).
// Otherwise it may overload the target servers.
config.setPolitenessDelay(1000);
// You can set the maximum crawl depth here. The default value is -1 for unlimited depth.
config.setMaxDepthOfCrawling(2);
// You can set the maximum number of pages to crawl. The default value is -1 for unlimited number of pages.
config.setMaxPagesToFetch(1000);
// Should binary data also be crawled? For example, the contents of PDFs or the metadata of images.
config.setIncludeBinaryContentInCrawling(false);
// Do you need to set a proxy? If so, you can use:
// config.setProxyHost("proxyserver.example.com");
// config.setProxyPort(8080);
// If your proxy also needs authentication:
// config.setProxyUsername(username); config.setProxyPassword(password);
// This config parameter can be used to set your crawl to be resumable
// (meaning that you can resume the crawl from a previously
// interrupted/crashed crawl). Note: if you enable the resumable feature and
// want to start a fresh crawl, you need to delete the contents of
// rootFolder manually.
config.setResumableCrawling(false);
// Set this to true if you want crawling to stop whenever an unexpected error
// occurs. You'll probably want this set to true when you first start testing
// your crawler, and then set to false once you're ready to let the crawler run
// for a long time.
config.setHaltOnError(true);
// Instantiate the controller for this crawl.
PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
// For each crawl, you need to add some seed URLs. These are the first
// URLs that are fetched; the crawler then follows the links found in
// those pages.
controller.addSeed("https://www.ics.uci.edu/");
controller.addSeed("https://www.ics.uci.edu/~lopes/");
controller.addSeed("https://www.ics.uci.edu/~welling/");
// Number of threads to use during crawling. Increasing this typically makes crawling faster. But crawling
// speed depends on many other factors as well. You can experiment with this to figure out what number of
// threads works best for you.
int numberOfCrawlers = 8;
// To demonstrate how you can pass objects to crawlers, we use an AtomicInteger that crawlers
// increment whenever they see a URL that points to an image.
AtomicInteger numSeenImages = new AtomicInteger();
// The factory which creates instances of crawlers.
CrawlController.WebCrawlerFactory<BasicCrawler> factory = () -> new BasicCrawler(numSeenImages);
// Start the crawl. This is a blocking operation, meaning that your code
// will reach the line after this only when crawling is finished.
controller.start(factory, numberOfCrawlers);
}
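The BasicCrawler created by the factory above is defined elsewhere in the examples module and is not shown here. The following is a minimal sketch of what such a crawler could look like, assuming the crawler4j 4.x WebCrawler API (shouldVisit(Page, WebURL), visit(Page), HtmlParseData); the image-extension pattern, the domain filter, and the println output are illustrative assumptions, not the project's exact code.
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class BasicCrawler extends WebCrawler {

    // Hypothetical filter: treat these extensions as images and count them instead of visiting them.
    private static final Pattern IMAGE_EXTENSIONS = Pattern.compile(".*\\.(gif|jpe?g|png|bmp|ico)$");

    private final AtomicInteger numSeenImages;

    public BasicCrawler(AtomicInteger numSeenImages) {
        this.numSeenImages = numSeenImages;
    }

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        String href = url.getURL().toLowerCase();
        if (IMAGE_EXTENSIONS.matcher(href).matches()) {
            // Count image URLs via the shared AtomicInteger passed in from the controller.
            numSeenImages.incrementAndGet();
            return false;
        }
        // Stay within the seed domain (an assumption for this sketch).
        return href.startsWith("https://www.ics.uci.edu/");
    }

    @Override
    public void visit(Page page) {
        String url = page.getWebURL().getURL();
        System.out.println("Visited: " + url);
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            System.out.println("  text length: " + htmlParseData.getText().length()
                    + ", outgoing links: " + htmlParseData.getOutgoingUrls().size());
        }
    }
}
Returning false from shouldVisit for image URLs keeps the counter meaningful without scheduling those URLs, which fits the setIncludeBinaryContentInCrawling(false) setting above.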
Class PageFetcherHtmlTest, method testCustomPageFetcher.
@Test
public void testCustomPageFetcher() throws Exception {
// Stub an HTML page and a PDF document; both answer HEAD and GET requests.
WireMock.stubFor(WireMock.head(WireMock.urlEqualTo("/some/index.html"))
    .willReturn(WireMock.aResponse().withStatus(200).withHeader("Content-Type", "text/html")));
WireMock.stubFor(WireMock.get(WireMock.urlEqualTo("/some/index.html"))
    .willReturn(WireMock.aResponse().withStatus(200).withHeader("Content-Type", "text/html")
        .withHeader("Content-Length", "47")
        .withBody("<html><body><h1>this is html</h1></body></html>")));
WireMock.stubFor(WireMock.get(WireMock.urlEqualTo("/some/invoice.pdf"))
    .willReturn(WireMock.aResponse().withStatus(200).withHeader("Content-Type", "application/pdf")
        .withBody(new byte[] { 1, 2, 3, 4 })));
WireMock.stubFor(WireMock.head(WireMock.urlEqualTo("/some/invoice.pdf"))
    .willReturn(WireMock.aResponse().withStatus(200).withHeader("Content-Type", "application/pdf")));
CrawlConfig cfg = new CrawlConfig();
WebURL url = new WebURL();
url.setURL("http://localhost:" + wireMockRule.port() + "/some/index.html");
// The HTML-only fetcher should send a HEAD request first and, because the response is text/html,
// follow up with a GET for the body.
PageFetcher pf = new PageFetcherHtmlOnly(cfg);
pf.fetchPage(url).fetchContent(new Page(url), 47);
WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/some/index.html")));
WireMock.verify(1, WireMock.getRequestedFor(WireMock.urlEqualTo("/some/index.html")));
// For the PDF, the HEAD response reports application/pdf, so the GET should be skipped entirely.
url.setURL("http://localhost:" + wireMockRule.port() + "/some/invoice.pdf");
pf = new PageFetcherHtmlOnly(cfg);
pf.fetchPage(url).fetchContent(new Page(url), 4);
WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/some/invoice.pdf")));
WireMock.verify(0, WireMock.getRequestedFor(WireMock.urlEqualTo("/some/invoice.pdf")));
}
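PageFetcherHtmlOnly comes from the project's test sources and is not reproduced here. The sketch below illustrates the idea the test verifies: probe each URL with a HEAD request and only perform the normal GET-based fetch when the Content-Type looks like HTML. It assumes crawler4j 4.x details (a protected httpClient field on PageFetcher, fetchPage throwing InterruptedException, IOException and PageBiggerThanMaxSizeException, the PageFetchResult setters and its boolean constructor, and CrawlConfig.isHaltOnError()); the real helper also honours the politeness delay, which is omitted here.
import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpHead;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.url.WebURL;

public class PageFetcherHtmlOnly extends PageFetcher {

    private final CrawlConfig crawlConfig;

    public PageFetcherHtmlOnly(CrawlConfig config) throws Exception {
        super(config);
        this.crawlConfig = config;
    }

    @Override
    public PageFetchResult fetchPage(WebURL webUrl)
            throws InterruptedException, IOException, PageBiggerThanMaxSizeException {
        // Probe with HEAD first; httpClient is assumed to be PageFetcher's protected HTTP client.
        HttpHead head = new HttpHead(webUrl.getURL());
        try {
            HttpResponse response = httpClient.execute(head);
            String contentType = response.containsHeader("Content-Type")
                    ? response.getFirstHeader("Content-Type").getValue().toLowerCase()
                    : "";

            if (contentType.isEmpty() || contentType.contains("html")) {
                // Looks like HTML: fall back to the regular GET-based fetch.
                return super.fetchPage(webUrl);
            }

            // Non-HTML content: build a result from the HEAD response only, so no GET is issued.
            // The boolean constructor argument mirrors haltOnError (an assumption about this version).
            PageFetchResult fetchResult = new PageFetchResult(crawlConfig.isHaltOnError());
            fetchResult.setFetchedUrl(webUrl.getURL());
            fetchResult.setStatusCode(response.getStatusLine().getStatusCode());
            fetchResult.setResponseHeaders(response.getAllHeaders());
            fetchResult.setEntity(response.getEntity());
            return fetchResult;
        } finally {
            head.abort();
        }
    }
}
This matches the verifications above: one HEAD and one GET for the HTML page, one HEAD and no GET for the PDF.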