use of edu.uci.ics.crawler4j.crawler.Page in project crawler4j by yasserg.
the class PageFetcherHtmlTest method testCustomPageFetcher.
@Test
public void testCustomPageFetcher() throws InterruptedException, PageBiggerThanMaxSizeException, IOException {
    WireMock.stubFor(WireMock.head(WireMock.urlEqualTo("/some/index.html"))
        .willReturn(WireMock.aResponse()
            .withStatus(200)
            .withHeader("Content-Type", "text/html")));
    WireMock.stubFor(WireMock.get(WireMock.urlEqualTo("/some/index.html"))
        .willReturn(WireMock.aResponse()
            .withStatus(200)
            .withHeader("Content-Type", "text/html")
            .withHeader("Content-Length", "47")
            .withBody("<html><body><h1>this is html</h1></body></html>")));
    WireMock.stubFor(WireMock.get(WireMock.urlEqualTo("/some/invoice.pdf"))
        .willReturn(WireMock.aResponse()
            .withStatus(200)
            .withHeader("Content-Type", "application/pdf")
            .withBody(new byte[] { 1, 2, 3, 4 })));
    WireMock.stubFor(WireMock.head(WireMock.urlEqualTo("/some/invoice.pdf"))
        .willReturn(WireMock.aResponse()
            .withStatus(200)
            .withHeader("Content-Type", "application/pdf")));
    CrawlConfig cfg = new CrawlConfig();
    WebURL url = new WebURL();
    url.setURL("http://localhost:8080/some/index.html");
    PageFetcher pf = new PageFetcherHtmlOnly(cfg);
    pf.fetchPage(url).fetchContent(new Page(url), 47);
    WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/some/index.html")));
    WireMock.verify(1, WireMock.getRequestedFor(WireMock.urlEqualTo("/some/index.html")));
    url.setURL("http://localhost:8080/some/invoice.pdf");
    pf = new PageFetcherHtmlOnly(cfg);
    pf.fetchPage(url).fetchContent(new Page(url), 4);
    WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/some/invoice.pdf")));
    WireMock.verify(0, WireMock.getRequestedFor(WireMock.urlEqualTo("/some/invoice.pdf")));
}
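The test drives PageFetcherHtmlOnly, a custom fetcher that probes each URL with a HEAD request and only issues the full GET when the reported Content-Type is HTML, which is exactly what the verify calls assert. That class is not shown on this page; the following is a minimal sketch of such a fetcher, assuming the public crawler4j 4.x API (the protected httpClient field and the no-arg PageFetchResult constructor are version-dependent details):

import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpHead;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.url.WebURL;

public class PageFetcherHtmlOnly extends PageFetcher {

    public PageFetcherHtmlOnly(CrawlConfig config) {
        super(config);
    }

    @Override
    public PageFetchResult fetchPage(WebURL webUrl)
            throws InterruptedException, IOException, PageBiggerThanMaxSizeException {
        String toFetchURL = webUrl.getURL();
        PageFetchResult fetchResult = new PageFetchResult(); // no-arg constructor: assumption, varies by version
        HttpHead head = new HttpHead(toFetchURL);
        try {
            // Probe the resource cheaply with HEAD to learn its Content-Type.
            HttpResponse response = httpClient.execute(head);
            fetchResult.setFetchedUrl(toFetchURL);
            fetchResult.setStatusCode(response.getStatusLine().getStatusCode());
            fetchResult.setResponseHeaders(response.getAllHeaders());

            String contentType = response.containsHeader("Content-Type")
                ? response.getFirstHeader("Content-Type").getValue().toLowerCase()
                : "";
            if (contentType.contains("html")) {
                // HTML is worth the bandwidth: delegate to the stock fetcher, which issues the GET.
                return super.fetchPage(webUrl);
            }
            // Anything else (e.g. application/pdf) is answered from the HEAD result alone,
            // which is why the test sees zero GET requests for /some/invoice.pdf.
            return fetchResult;
        } finally {
            head.abort();
        }
    }
}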
use of edu.uci.ics.crawler4j.crawler.Page in project crawler4j by yasserg.
the class Downloader method download.
private Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchPage(curURL);
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            Page page = new Page(curURL);
            fetchResult.fetchContent(page, config.getMaxDownloadSize());
            parser.parse(page, curURL.getURL());
            return page;
        }
    } catch (Exception e) {
        logger.error("Error occurred while fetching url: " + curURL.getURL(), e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
    return null;
}
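download depends on three collaborators that this page does not show: config, pageFetcher, and parser. A hedged sketch of how such a Downloader might be wired, assuming a constructor of roughly this shape (the Parser constructor's checked exceptions vary by crawler4j version, hence the broad throws Exception):

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.parser.Parser;

public class Downloader {

    private static final Logger logger = LoggerFactory.getLogger(Downloader.class);

    private final CrawlConfig config;
    private final PageFetcher pageFetcher;
    private final Parser parser;

    public Downloader() throws Exception {
        this.config = new CrawlConfig();
        this.config.setMaxDownloadSize(1024 * 1024); // cap fetched bodies at 1 MiB (illustrative value)
        this.pageFetcher = new PageFetcher(config);
        this.parser = new Parser(config);
    }

    // download(String) and processUrl(String) as shown on this page
}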
use of edu.uci.ics.crawler4j.crawler.Page in project crawler4j by yasserg.
the class RobotstxtServer method fetchDirectives.
private HostDirectives fetchDirectives(URL url) throws IOException, InterruptedException {
    WebURL robotsTxtUrl = new WebURL();
    String host = getHost(url);
    String port = ((url.getPort() == url.getDefaultPort()) || (url.getPort() == -1))
        ? ""
        : (":" + url.getPort());
    String proto = url.getProtocol();
    robotsTxtUrl.setURL(proto + "://" + host + port + "/robots.txt");
    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
        // Follow redirects up to 3 levels
        for (int redir = 0; redir < 3; ++redir) {
            fetchResult = pageFetcher.fetchPage(robotsTxtUrl);
            int status = fetchResult.getStatusCode();
            if ((status == HttpStatus.SC_MULTIPLE_CHOICES
                    || status == HttpStatus.SC_MOVED_PERMANENTLY
                    || status == HttpStatus.SC_MOVED_TEMPORARILY
                    || status == HttpStatus.SC_SEE_OTHER
                    || status == HttpStatus.SC_TEMPORARY_REDIRECT
                    || status == 308) // SC_PERMANENT_REDIRECT, RFC 7538
                    && fetchResult.getMovedToUrl() != null) {
                robotsTxtUrl.setURL(fetchResult.getMovedToUrl());
                fetchResult.discardContentIfNotConsumed();
            } else {
                // Done on all other occasions
                break;
            }
        }
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            Page page = new Page(robotsTxtUrl);
            // Google's guidance caps robots.txt at 500 KiB:
            // https://developers.google.com/search/reference/robots_txt
            fetchResult.fetchContent(page, 500 * 1024);
            if (Util.hasPlainTextContent(page.getContentType())) {
                String content;
                if (page.getContentCharset() == null) {
                    content = new String(page.getContentData());
                } else {
                    content = new String(page.getContentData(), page.getContentCharset());
                }
                directives = RobotstxtParser.parse(content, config);
            } else if (page.getContentType().contains("html")) {
                // TODO This one should be upgraded to remove all html tags
                String content = new String(page.getContentData());
                directives = RobotstxtParser.parse(content, config);
            } else {
                logger.warn("Can't read this robots.txt: {} as it is not written in plain text, contentType: {}",
                    robotsTxtUrl.getURL(), page.getContentType());
            }
        } else {
            logger.debug("Can't read this robots.txt: {} as its status code is {}",
                robotsTxtUrl.getURL(), fetchResult.getStatusCode());
        }
    } catch (SocketException | UnknownHostException | SocketTimeoutException | NoHttpResponseException se) {
        // No logging here, as it just means that robots.txt doesn't exist on this server,
        // which is perfectly ok
        logger.trace("robots.txt probably does not exist.", se);
    } catch (PageBiggerThanMaxSizeException pbtms) {
        logger.error("Error occurred while fetching (robots) url: {}, {}",
            robotsTxtUrl.getURL(), pbtms.getMessage());
    } catch (IOException e) {
        logger.error("Error occurred while fetching (robots) url: " + robotsTxtUrl.getURL(), e);
    } catch (InterruptedException | RuntimeException e) {
        if (crawlConfig.isHaltOnError()) {
            throw e;
        } else {
            logger.error("Error occurred while fetching (robots) url: " + robotsTxtUrl.getURL(), e);
        }
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
    if (directives == null) {
        // We still need this object to keep track of the time we fetched it
        directives = new HostDirectives(config);
    }
    synchronized (host2directivesCache) {
        if (host2directivesCache.size() == config.getCacheSize()) {
            // Evict the least recently accessed host to make room
            String minHost = null;
            long minAccessTime = Long.MAX_VALUE;
            for (Map.Entry<String, HostDirectives> entry : host2directivesCache.entrySet()) {
                long entryAccessTime = entry.getValue().getLastAccessTime();
                if (entryAccessTime < minAccessTime) {
                    minAccessTime = entryAccessTime;
                    minHost = entry.getKey();
                }
            }
            host2directivesCache.remove(minHost);
        }
        host2directivesCache.put(host, directives);
    }
    return directives;
}
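fetchDirectives is private and is reached through the server's public allows check; results are cached per host, and once the cache is full the least recently accessed host is evicted. A hedged usage sketch, using only the public crawler4j API (allows declares checked exceptions in newer versions, hence throws Exception; the URL is a placeholder):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

public class RobotsCheckExample {

    public static void main(String[] args) throws Exception {
        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        robotstxtConfig.setCacheSize(500); // size of the host2directivesCache seen above
        PageFetcher pageFetcher = new PageFetcher(new CrawlConfig());
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

        WebURL url = new WebURL();
        url.setURL("http://example.com/some/page.html"); // placeholder URL
        // The first call for a host triggers fetchDirectives(); later calls hit the cache.
        System.out.println("allowed: " + robotstxtServer.allows(url));

        pageFetcher.shutDown();
    }
}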
use of edu.uci.ics.crawler4j.crawler.Page in project crawler4j by yasserg.
the class Downloader method processUrl.
public void processUrl(String url) {
    logger.debug("Processing: {}", url);
    Page page = download(url);
    if (page != null) {
        ParseData parseData = page.getParseData();
        if (parseData != null) {
            if (parseData instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) parseData;
                logger.debug("Title: {}", htmlParseData.getTitle());
                logger.debug("Text length: {}", htmlParseData.getText().length());
                logger.debug("Html length: {}", htmlParseData.getHtml().length());
            }
        } else {
            logger.warn("Couldn't parse the content of the page.");
        }
    } else {
        logger.warn("Couldn't fetch the content of the page.");
    }
    logger.debug("==============");
}
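Putting download and processUrl together, a hypothetical runner (the URLs are placeholders, and the no-arg constructor is the one sketched after download above):

public static void main(String[] args) throws Exception {
    Downloader downloader = new Downloader();
    downloader.processUrl("https://example.com/");        // HTML page: title and length stats are logged
    downloader.processUrl("https://example.com/missing"); // non-200 response: "Couldn't fetch..." is logged
}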
use of edu.uci.ics.crawler4j.crawler.Page in project crawler4j by yasserg.
the class PageFetcherHtmlTest method testCustomPageFetcher (a variant of the test above that binds to WireMock's dynamically assigned port instead of a hardcoded 8080).
@Test
public void testCustomPageFetcher() throws Exception {
    WireMock.stubFor(WireMock.head(WireMock.urlEqualTo("/some/index.html"))
        .willReturn(WireMock.aResponse()
            .withStatus(200)
            .withHeader("Content-Type", "text/html")));
    WireMock.stubFor(WireMock.get(WireMock.urlEqualTo("/some/index.html"))
        .willReturn(WireMock.aResponse()
            .withStatus(200)
            .withHeader("Content-Type", "text/html")
            .withHeader("Content-Length", "47")
            .withBody("<html><body><h1>this is html</h1></body></html>")));
    WireMock.stubFor(WireMock.get(WireMock.urlEqualTo("/some/invoice.pdf"))
        .willReturn(WireMock.aResponse()
            .withStatus(200)
            .withHeader("Content-Type", "application/pdf")
            .withBody(new byte[] { 1, 2, 3, 4 })));
    WireMock.stubFor(WireMock.head(WireMock.urlEqualTo("/some/invoice.pdf"))
        .willReturn(WireMock.aResponse()
            .withStatus(200)
            .withHeader("Content-Type", "application/pdf")));
    CrawlConfig cfg = new CrawlConfig();
    WebURL url = new WebURL();
    url.setURL("http://localhost:" + wireMockRule.port() + "/some/index.html");
    PageFetcher pf = new PageFetcherHtmlOnly(cfg);
    pf.fetchPage(url).fetchContent(new Page(url), 47);
    WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/some/index.html")));
    WireMock.verify(1, WireMock.getRequestedFor(WireMock.urlEqualTo("/some/index.html")));
    url.setURL("http://localhost:" + wireMockRule.port() + "/some/invoice.pdf");
    pf = new PageFetcherHtmlOnly(cfg);
    pf.fetchPage(url).fetchContent(new Page(url), 4);
    WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/some/invoice.pdf")));
    WireMock.verify(0, WireMock.getRequestedFor(WireMock.urlEqualTo("/some/invoice.pdf")));
}