Use of edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException in project crawler4j by yasserg.
The class RobotstxtServer, method fetchDirectives.
private HostDirectives fetchDirectives(URL url) {
WebURL robotsTxtUrl = new WebURL();
String host = getHost(url);
String port = ((url.getPort() == url.getDefaultPort()) || (url.getPort() == -1)) ? "" : (":" + url.getPort());
String proto = url.getProtocol();
robotsTxtUrl.setURL(proto + "://" + host + port + "/robots.txt");
HostDirectives directives = null;
PageFetchResult fetchResult = null;
try {
for (int redir = 0; redir < 3; ++redir) {
fetchResult = pageFetcher.fetchPage(robotsTxtUrl);
int status = fetchResult.getStatusCode();
// Follow redirects up to 3 levels
if ((status == HttpStatus.SC_MULTIPLE_CHOICES ||
     status == HttpStatus.SC_MOVED_PERMANENTLY ||
     status == HttpStatus.SC_MOVED_TEMPORARILY ||
     status == HttpStatus.SC_SEE_OTHER ||
     status == HttpStatus.SC_TEMPORARY_REDIRECT ||
     status == 308) && // SC_PERMANENT_REDIRECT, RFC 7538
    fetchResult.getMovedToUrl() != null) {
robotsTxtUrl.setURL(fetchResult.getMovedToUrl());
fetchResult.discardContentIfNotConsumed();
} else {
// Done on all other occasions
break;
}
}
if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
Page page = new Page(robotsTxtUrl);
// Most recent answer on robots.txt max size: https://goo.gl/OqpKbP
fetchResult.fetchContent(page, 10_000 * 1024);
if (Util.hasPlainTextContent(page.getContentType())) {
String content;
if (page.getContentCharset() == null) {
content = new String(page.getContentData());
} else {
content = new String(page.getContentData(), page.getContentCharset());
}
directives = RobotstxtParser.parse(content, config);
} else if (page.getContentType().contains("html")) {
// TODO: this branch should be upgraded to strip all HTML tags
String content = new String(page.getContentData());
directives = RobotstxtParser.parse(content, config);
} else {
logger.warn("Can't read this robots.txt: {} as it is not written in plain text, " + "contentType: {}", robotsTxtUrl.getURL(), page.getContentType());
}
} else {
logger.debug("Can't read this robots.txt: {} as it's status code is {}", robotsTxtUrl.getURL(), fetchResult.getStatusCode());
}
} catch (SocketException | UnknownHostException | SocketTimeoutException | NoHttpResponseException se) {
// No logging here, as it just means that robots.txt doesn't exist on this server
// which is perfectly ok
logger.trace("robots.txt probably does not exist.", se);
} catch (PageBiggerThanMaxSizeException pbtms) {
logger.error("Error occurred while fetching (robots) url: {}, {}", robotsTxtUrl.getURL(), pbtms.getMessage());
} catch (Exception e) {
logger.error("Error occurred while fetching (robots) url: " + robotsTxtUrl.getURL(), e);
} finally {
if (fetchResult != null) {
fetchResult.discardContentIfNotConsumed();
}
}
if (directives == null) {
// We still need to have this object to keep track of the time we fetched it
directives = new HostDirectives(config);
}
synchronized (host2directivesCache) {
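// Cache is full: evict the directives of the least-recently-accessed host.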
if (host2directivesCache.size() == config.getCacheSize()) {
String minHost = null;
long minAccessTime = Long.MAX_VALUE;
for (Map.Entry<String, HostDirectives> entry : host2directivesCache.entrySet()) {
long entryAccessTime = entry.getValue().getLastAccessTime();
if (entryAccessTime < minAccessTime) {
minAccessTime = entryAccessTime;
minHost = entry.getKey();
}
}
host2directivesCache.remove(minHost);
}
host2directivesCache.put(host, directives);
}
return directives;
}
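For orientation, here is a minimal sketch of how fetchDirectives is typically reached: a RobotstxtServer is built from a RobotstxtConfig and a PageFetcher, and a call to allows() falls through to fetchDirectives() on a cache miss. This assumes the standard crawler4j 4.x setup; the URL and cache size below are illustrative, not taken from the snippet above.
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

public class RobotsCheckSketch {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        PageFetcher pageFetcher = new PageFetcher(config);

        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        // Bounds host2directivesCache; fetchDirectives evicts the
        // least-recently-accessed host once this size is reached.
        robotstxtConfig.setCacheSize(500);
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

        WebURL url = new WebURL();
        url.setURL("https://example.com/some/page.html"); // illustrative URL
        // On a cache miss for this host, allows() calls fetchDirectives().
        System.out.println("allowed: " + robotstxtServer.allows(url));

        pageFetcher.shutDown();
    }
}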
Use of edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException in project crawler4j by yasserg.
The class WebCrawler, method processPage.
private void processPage(WebURL curURL) {
PageFetchResult fetchResult = null;
try {
if (curURL == null) {
return;
}
fetchResult = pageFetcher.fetchPage(curURL);
int statusCode = fetchResult.getStatusCode();
// Finds the status reason for all known statuses
handlePageStatusCode(curURL, statusCode, EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH));
Page page = new Page(curURL);
page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
page.setStatusCode(statusCode);
if (statusCode < 200 || statusCode > 299) {
// Not 2XX: 2XX status codes indicate success
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY ||
    statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
    statusCode == HttpStatus.SC_MULTIPLE_CHOICES ||
    statusCode == HttpStatus.SC_SEE_OTHER ||
    statusCode == HttpStatus.SC_TEMPORARY_REDIRECT ||
    statusCode == 308) {
// is 3xx; TODO: follow https://issues.apache.org/jira/browse/HTTPCORE-389
page.setRedirect(true);
String movedToUrl = fetchResult.getMovedToUrl();
if (movedToUrl == null) {
logger.warn("Unexpected error, URL: {} is redirected to NOTHING", curURL);
return;
}
page.setRedirectedToUrl(movedToUrl);
onRedirectedStatusCode(page);
if (myController.getConfig().isFollowRedirects()) {
int newDocId = docIdServer.getDocId(movedToUrl);
if (newDocId > 0) {
logger.debug("Redirect page: {} is already seen", curURL);
return;
}
WebURL webURL = new WebURL();
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());
webURL.setParentUrl(curURL.getParentUrl());
webURL.setDepth(curURL.getDepth());
webURL.setDocid(-1);
webURL.setAnchor(curURL.getAnchor());
if (shouldVisit(page, webURL)) {
if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
frontier.schedule(webURL);
} else {
logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", webURL.getURL());
}
} else {
logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
}
}
} else {
// All other HTTP status codes (neither 2xx nor 3xx)
// Finds the status reason for all known statuses
String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(), Locale.ENGLISH);
String contentType = fetchResult.getEntity() == null ? ""
    : fetchResult.getEntity().getContentType() == null ? ""
    : fetchResult.getEntity().getContentType().getValue();
onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description);
}
} else {
// Status code is 2xx (success)
if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
logger.debug("Redirect page: {} has already been seen", curURL);
return;
}
curURL.setURL(fetchResult.getFetchedUrl());
curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
}
if (!fetchResult.fetchContent(page, myController.getConfig().getMaxDownloadSize())) {
throw new ContentFetchException();
}
if (page.isTruncated()) {
logger.warn("Warning: unknown page size exceeded max-download-size, truncated to: " + "({}), at URL: {}", myController.getConfig().getMaxDownloadSize(), curURL.getURL());
}
parser.parse(page, curURL.getURL());
if (shouldFollowLinksIn(page.getWebURL())) {
ParseData parseData = page.getParseData();
List<WebURL> toSchedule = new ArrayList<>();
int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
for (WebURL webURL : parseData.getOutgoingUrls()) {
webURL.setParentDocid(curURL.getDocid());
webURL.setParentUrl(curURL.getURL());
int newdocid = docIdServer.getDocId(webURL.getURL());
if (newdocid > 0) {
// This is not the first time this URL is visited, so set its depth to a negative number.
webURL.setDepth((short) -1);
webURL.setDocid(newdocid);
} else {
webURL.setDocid(-1);
webURL.setDepth((short) (curURL.getDepth() + 1));
if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
if (shouldVisit(page, webURL)) {
if (robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
toSchedule.add(webURL);
} else {
logger.debug("Not visiting: {} as per the server's \"robots.txt\" " + "policy", webURL.getURL());
}
} else {
logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
}
}
}
}
frontier.scheduleAll(toSchedule);
} else {
logger.debug("Not looking for links in page {}, " + "as per your \"shouldFollowLinksInPage\" policy", page.getWebURL().getURL());
}
visit(page);
}
} catch (PageBiggerThanMaxSizeException e) {
onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
} catch (ParseException pe) {
onParseError(curURL);
} catch (ContentFetchException cfe) {
onContentFetchError(curURL);
} catch (NotAllowedContentException nace) {
logger.debug("Skipping: {} as it contains binary content which you configured not to crawl", curURL.getURL());
} catch (Exception e) {
onUnhandledException(curURL, e);
} finally {
if (fetchResult != null) {
fetchResult.discardContentIfNotConsumed();
}
}
}
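processPage delegates every policy decision to an overridable hook (shouldVisit, shouldFollowLinksIn, visit, onPageBiggerThanMaxSize, and so on). Below is a minimal sketch of a subclass wiring the hooks exercised above, assuming the crawler4j 4.x WebCrawler API; the extension filter is a hypothetical example.
import java.util.regex.Pattern;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {
    // Hypothetical filter: skip common binary extensions.
    private static final Pattern FILTERS = Pattern.compile(".*\\.(jpg|png|gif|zip|pdf)$");

    // Consulted by processPage() before scheduling a redirect target or outgoing link.
    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        return !FILTERS.matcher(url.getURL().toLowerCase()).matches();
    }

    // Called by processPage() once a 2xx page has been fetched and parsed.
    @Override
    public void visit(Page page) {
        logger.info("Visited: {}", page.getWebURL().getURL());
    }

    // Invoked from processPage()'s catch block for PageBiggerThanMaxSizeException.
    @Override
    protected void onPageBiggerThanMaxSize(String urlStr, long pageSize) {
        logger.warn("Skipped {}: {} bytes exceeds maxDownloadSize", urlStr, pageSize);
    }
}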
Use of edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException in project crawler4j by yasserg.
The class PageFetcher, method fetchPage.
public PageFetchResult fetchPage(WebURL webUrl) throws InterruptedException, IOException, PageBiggerThanMaxSizeException {
// Getting URL, setting headers & content
PageFetchResult fetchResult = new PageFetchResult();
String toFetchURL = webUrl.getURL();
HttpUriRequest request = null;
try {
request = newHttpUriRequest(toFetchURL);
// Applying Politeness delay
synchronized (mutex) {
long now = (new Date()).getTime();
if ((now - lastFetchTime) < config.getPolitenessDelay()) {
Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime));
}
lastFetchTime = (new Date()).getTime();
}
CloseableHttpResponse response = httpClient.execute(request);
fetchResult.setEntity(response.getEntity());
fetchResult.setResponseHeaders(response.getAllHeaders());
// Setting HttpStatus
int statusCode = response.getStatusLine().getStatusCode();
// If Redirect ( 3xx )
if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY ||
    statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
    statusCode == HttpStatus.SC_MULTIPLE_CHOICES ||
    statusCode == HttpStatus.SC_SEE_OTHER ||
    statusCode == HttpStatus.SC_TEMPORARY_REDIRECT ||
    statusCode == 308) {
// TODO: follow https://issues.apache.org/jira/browse/HTTPCORE-389
Header header = response.getFirstHeader("Location");
if (header != null) {
String movedToUrl = URLCanonicalizer.getCanonicalURL(header.getValue(), toFetchURL);
fetchResult.setMovedToUrl(movedToUrl);
}
} else if (statusCode >= 200 && statusCode <= 299) {
// is 2XX, everything looks ok
fetchResult.setFetchedUrl(toFetchURL);
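// The request URI can differ from the originally requested URL (e.g., after normalization); if so, record the URL that was actually fetched.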
String uri = request.getURI().toString();
if (!uri.equals(toFetchURL)) {
if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
fetchResult.setFetchedUrl(uri);
}
}
// Checking maximum size
if (fetchResult.getEntity() != null) {
long size = fetchResult.getEntity().getContentLength();
if (size == -1) {
Header length = response.getLastHeader("Content-Length");
if (length == null) {
length = response.getLastHeader("Content-length");
}
if (length != null) {
size = Integer.parseInt(length.getValue());
}
}
if (size > config.getMaxDownloadSize()) {
//fix issue #52 - consume entity
response.close();
throw new PageBiggerThanMaxSizeException(size);
}
}
}
fetchResult.setStatusCode(statusCode);
return fetchResult;
} finally {
// This cleanup also runs when an exception was thrown above
if ((fetchResult.getEntity() == null) && (request != null)) {
request.abort();
}
}
}
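A short sketch of driving fetchPage directly, assuming the crawler4j 4.x API; the politeness delay, size limit, and URL are illustrative values, and the finally block mirrors the cleanup the crawler itself performs.
import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.url.WebURL;

public class FetchSketch {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setPolitenessDelay(200);         // ms between requests, enforced in fetchPage()
        config.setMaxDownloadSize(1024 * 1024); // larger pages raise PageBiggerThanMaxSizeException

        PageFetcher pageFetcher = new PageFetcher(config);
        WebURL url = new WebURL();
        url.setURL("https://example.com/"); // illustrative URL

        PageFetchResult result = null;
        try {
            result = pageFetcher.fetchPage(url);
            System.out.println("status: " + result.getStatusCode());
        } catch (PageBiggerThanMaxSizeException e) {
            System.err.println("Too big: " + e.getPageSize() + " bytes");
        } finally {
            if (result != null) {
                result.discardContentIfNotConsumed();
            }
            pageFetcher.shutDown();
        }
    }
}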