Use of edu.uci.ics.crawler4j.fetcher.PageFetchResult in project crawler4j by yasserg.
The class RobotstxtServer, method fetchDirectives:
private HostDirectives fetchDirectives(URL url) {
    WebURL robotsTxtUrl = new WebURL();
    String host = getHost(url);
    String port = ((url.getPort() == url.getDefaultPort()) || (url.getPort() == -1)) ? ""
                  : (":" + url.getPort());
    String proto = url.getProtocol();
    robotsTxtUrl.setURL(proto + "://" + host + port + "/robots.txt");
    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
        for (int redir = 0; redir < 3; ++redir) {
            fetchResult = pageFetcher.fetchPage(robotsTxtUrl);
            int status = fetchResult.getStatusCode();
            // Follow redirects up to 3 levels
            if ((status == HttpStatus.SC_MULTIPLE_CHOICES ||
                 status == HttpStatus.SC_MOVED_PERMANENTLY ||
                 status == HttpStatus.SC_MOVED_TEMPORARILY ||
                 status == HttpStatus.SC_SEE_OTHER ||
                 status == HttpStatus.SC_TEMPORARY_REDIRECT ||
                 status == 308) && // SC_PERMANENT_REDIRECT RFC7538
                fetchResult.getMovedToUrl() != null) {
                robotsTxtUrl.setURL(fetchResult.getMovedToUrl());
                fetchResult.discardContentIfNotConsumed();
            } else {
                // Done on all other occasions
                break;
            }
        }
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            Page page = new Page(robotsTxtUrl);
            // Most recent answer on robots.txt max size is
            // https://goo.gl/OqpKbP
            fetchResult.fetchContent(page, 10_000 * 1024);
            if (Util.hasPlainTextContent(page.getContentType())) {
                String content;
                if (page.getContentCharset() == null) {
                    content = new String(page.getContentData());
                } else {
                    content = new String(page.getContentData(), page.getContentCharset());
                }
                directives = RobotstxtParser.parse(content, config);
            } else if (page.getContentType().contains("html")) {
                // TODO This one should be upgraded to remove all
                // html tags
                String content = new String(page.getContentData());
                directives = RobotstxtParser.parse(content, config);
            } else {
                logger.warn("Can't read this robots.txt: {} as it is not written in plain text, " +
                            "contentType: {}", robotsTxtUrl.getURL(), page.getContentType());
            }
        } else {
            logger.debug("Can't read this robots.txt: {} as its status code is {}",
                         robotsTxtUrl.getURL(), fetchResult.getStatusCode());
        }
    } catch (SocketException | UnknownHostException | SocketTimeoutException |
             NoHttpResponseException se) {
        // No logging here, as it just means that robots.txt doesn't exist on this server,
        // which is perfectly ok
        logger.trace("robots.txt probably does not exist.", se);
    } catch (PageBiggerThanMaxSizeException pbtms) {
        logger.error("Error occurred while fetching (robots) url: {}, {}",
                     robotsTxtUrl.getURL(), pbtms.getMessage());
    } catch (Exception e) {
        logger.error("Error occurred while fetching (robots) url: " + robotsTxtUrl.getURL(), e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
    if (directives == null) {
        // We still need to have this object to keep track of the time we fetched it
        directives = new HostDirectives(config);
    }
    synchronized (host2directivesCache) {
        if (host2directivesCache.size() == config.getCacheSize()) {
            String minHost = null;
            long minAccessTime = Long.MAX_VALUE;
            for (Map.Entry<String, HostDirectives> entry : host2directivesCache.entrySet()) {
                long entryAccessTime = entry.getValue().getLastAccessTime();
                if (entryAccessTime < minAccessTime) {
                    minAccessTime = entryAccessTime;
                    minHost = entry.getKey();
                }
            }
            host2directivesCache.remove(minHost);
        }
        host2directivesCache.put(host, directives);
    }
    return directives;
}
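fetchDirectives is only invoked internally when the per-host cache misses; user code normally reaches it indirectly through RobotstxtServer.allows(). Below is a minimal sketch of that wiring. The class name RobotsCheckExample and the example URL are illustrative, not part of crawler4j:

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

public class RobotsCheckExample {
    public static void main(String[] args) throws Exception {
        CrawlConfig crawlConfig = new CrawlConfig();
        PageFetcher pageFetcher = new PageFetcher(crawlConfig);
        RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);

        WebURL target = new WebURL();
        target.setURL("https://example.com/some/page.html"); // illustrative URL
        // allows() checks the per-host directives cache and calls fetchDirectives() on a miss
        System.out.println("Allowed by robots.txt: " + robotstxtServer.allows(target));
    }
}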
Use of edu.uci.ics.crawler4j.fetcher.PageFetchResult in project crawler4j by yasserg.
The class WebCrawler, method processPage:
private void processPage(WebURL curURL) {
    PageFetchResult fetchResult = null;
    try {
        if (curURL == null) {
            return;
        }
        fetchResult = pageFetcher.fetchPage(curURL);
        int statusCode = fetchResult.getStatusCode();
        // Finds the status reason for all known statuses
        handlePageStatusCode(curURL, statusCode,
            EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH));
        Page page = new Page(curURL);
        page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
        page.setStatusCode(statusCode);
        if (statusCode < 200 || statusCode > 299) {
            // Not 2XX: 2XX status codes indicate success
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY ||
                statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
                statusCode == HttpStatus.SC_MULTIPLE_CHOICES ||
                statusCode == HttpStatus.SC_SEE_OTHER ||
                statusCode == HttpStatus.SC_TEMPORARY_REDIRECT ||
                statusCode == 308) { // is 3xx
                // todo: follow https://issues.apache.org/jira/browse/HTTPCORE-389
                page.setRedirect(true);
                String movedToUrl = fetchResult.getMovedToUrl();
                if (movedToUrl == null) {
                    logger.warn("Unexpected error, URL: {} is redirected to NOTHING", curURL);
                    return;
                }
                page.setRedirectedToUrl(movedToUrl);
                onRedirectedStatusCode(page);
                if (myController.getConfig().isFollowRedirects()) {
                    int newDocId = docIdServer.getDocId(movedToUrl);
                    if (newDocId > 0) {
                        logger.debug("Redirect page: {} is already seen", curURL);
                        return;
                    }
                    WebURL webURL = new WebURL();
                    webURL.setURL(movedToUrl);
                    webURL.setParentDocid(curURL.getParentDocid());
                    webURL.setParentUrl(curURL.getParentUrl());
                    webURL.setDepth(curURL.getDepth());
                    webURL.setDocid(-1);
                    webURL.setAnchor(curURL.getAnchor());
                    if (shouldVisit(page, webURL)) {
                        if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
                            webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                            frontier.schedule(webURL);
                        } else {
                            logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy",
                                         webURL.getURL());
                        }
                    } else {
                        logger.debug("Not visiting: {} as per your \"shouldVisit\" policy",
                                     webURL.getURL());
                    }
                }
            } else {
                // All other http codes other than 3xx & 200
                // Finds the status reason for all known statuses
                String description = EnglishReasonPhraseCatalog.INSTANCE
                    .getReason(fetchResult.getStatusCode(), Locale.ENGLISH);
                String contentType = fetchResult.getEntity() == null ? "" :
                    fetchResult.getEntity().getContentType() == null ? "" :
                    fetchResult.getEntity().getContentType().getValue();
                onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType,
                                       description);
            }
        } else {
            // if status code is 200
            if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
                if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
                    logger.debug("Redirect page: {} has already been seen", curURL);
                    return;
                }
                curURL.setURL(fetchResult.getFetchedUrl());
                curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
            }
            if (!fetchResult.fetchContent(page, myController.getConfig().getMaxDownloadSize())) {
                throw new ContentFetchException();
            }
            if (page.isTruncated()) {
                logger.warn("Warning: unknown page size exceeded max-download-size, truncated to: " +
                            "({}), at URL: {}", myController.getConfig().getMaxDownloadSize(),
                            curURL.getURL());
            }
            parser.parse(page, curURL.getURL());
            if (shouldFollowLinksIn(page.getWebURL())) {
                ParseData parseData = page.getParseData();
                List<WebURL> toSchedule = new ArrayList<>();
                int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
                for (WebURL webURL : parseData.getOutgoingUrls()) {
                    webURL.setParentDocid(curURL.getDocid());
                    webURL.setParentUrl(curURL.getURL());
                    int newdocid = docIdServer.getDocId(webURL.getURL());
                    if (newdocid > 0) {
                        // This is not the first time that this Url is visited. So, we set the
                        // depth to a negative number.
                        webURL.setDepth((short) -1);
                        webURL.setDocid(newdocid);
                    } else {
                        webURL.setDocid(-1);
                        webURL.setDepth((short) (curURL.getDepth() + 1));
                        if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
                            if (shouldVisit(page, webURL)) {
                                if (robotstxtServer.allows(webURL)) {
                                    webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                                    toSchedule.add(webURL);
                                } else {
                                    logger.debug("Not visiting: {} as per the server's " +
                                                 "\"robots.txt\" policy", webURL.getURL());
                                }
                            } else {
                                logger.debug("Not visiting: {} as per your \"shouldVisit\" policy",
                                             webURL.getURL());
                            }
                        }
                    }
                }
                frontier.scheduleAll(toSchedule);
            } else {
                logger.debug("Not looking for links in page {}, " +
                             "as per your \"shouldFollowLinksInPage\" policy",
                             page.getWebURL().getURL());
            }
            visit(page);
        }
    } catch (PageBiggerThanMaxSizeException e) {
        onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
    } catch (ParseException pe) {
        onParseError(curURL);
    } catch (ContentFetchException cfe) {
        onContentFetchError(curURL);
    } catch (NotAllowedContentException nace) {
        logger.debug("Skipping: {} as it contains binary content which you configured not to crawl",
                     curURL.getURL());
    } catch (Exception e) {
        onUnhandledException(curURL, e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}
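processPage delegates the crawl policy decisions to overridable hooks: shouldVisit() filters outgoing links, shouldFollowLinksIn() gates link extraction, and visit() receives the fully fetched and parsed page. A minimal sketch of a subclass wiring those hooks; the class name MyCrawler and the single-domain rule are illustrative assumptions:

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Illustrative rule: only follow links that stay on one domain
        return url.getURL().toLowerCase().startsWith("https://example.com/");
    }

    @Override
    public void visit(Page page) {
        // Invoked by processPage() after a successful fetch and parse
        logger.info("Visited: {}", page.getWebURL().getURL());
    }
}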
Use of edu.uci.ics.crawler4j.fetcher.PageFetchResult in project crawler4j by yasserg.
The class Downloader, method download:
private Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchPage(curURL);
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            Page page = new Page(curURL);
            fetchResult.fetchContent(page, pageFetcher.getConfig().getMaxDownloadSize());
            parser.parse(page, curURL.getURL());
            return page;
        }
    } catch (Exception e) {
        logger.error("Error occurred while fetching url: " + curURL.getURL(), e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
    return null;
}
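Since download() is private and already parses the page, a caller inside the same class would typically read the result through the ParseData API. A short sketch of how the returned Page might be consumed; the method name printSummary and the URL are hypothetical, only the HtmlParseData accessors are crawler4j API:

// Hypothetical helper inside the same Downloader class
private void printSummary() {
    Page page = download("https://example.com/"); // illustrative URL
    if (page != null && page.getParseData() instanceof HtmlParseData) {
        HtmlParseData html = (HtmlParseData) page.getParseData();
        System.out.println("Title: " + html.getTitle());
        System.out.println("Text length: " + html.getText().length());
        System.out.println("Outgoing links: " + html.getOutgoingUrls().size());
    }
}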
Use of edu.uci.ics.crawler4j.fetcher.PageFetchResult in project crawler4j by yasserg.
The class PageFetcherHtmlOnly, method fetchPage:
@Override
public PageFetchResult fetchPage(WebURL webUrl)
        throws InterruptedException, IOException, PageBiggerThanMaxSizeException {
    String toFetchURL = webUrl.getURL();
    PageFetchResult fetchResult = new PageFetchResult();
    HttpHead head = null;
    try {
        head = new HttpHead(toFetchURL);
        synchronized (mutex) {
            long now = new Date().getTime();
            if (now - this.lastFetchTime < this.config.getPolitenessDelay()) {
                Thread.sleep(this.config.getPolitenessDelay() - (now - this.lastFetchTime));
            }
            this.lastFetchTime = new Date().getTime();
        }
        HttpResponse response = httpClient.execute(head);
        fetchResult.setEntity(response.getEntity());
        fetchResult.setResponseHeaders(response.getAllHeaders());
        fetchResult.setFetchedUrl(toFetchURL);
        fetchResult.setStatusCode(response.getStatusLine().getStatusCode());
        String contentType = response.containsHeader("Content-Type") ?
                             response.getFirstHeader("Content-Type").getValue() : null;
        String typeStr = (contentType != null) ? contentType.toLowerCase() : "";
        if (typeStr.equals("") || (typeStr.contains("text") && typeStr.contains("html"))) {
            return super.fetchPage(webUrl);
        } else {
            return fetchResult;
        }
    } finally {
        if (head != null) {
            head.abort();
        }
    }
}
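This subclass issues a HEAD request first and only falls back to the full GET in super.fetchPage() when the response looks like HTML (or carries no Content-Type). A sketch of how such a fetcher could be plugged into a crawl, assuming its constructor mirrors PageFetcher's and takes a CrawlConfig; the class name HtmlOnlyCrawlExample, the storage path, and the seed URL are illustrative, and MyCrawler refers to the subclass sketched after processPage above:

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class HtmlOnlyCrawlExample {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j"); // illustrative storage path
        // Assumption: PageFetcherHtmlOnly is constructed from a CrawlConfig, like PageFetcher
        PageFetcher pageFetcher = new PageFetcherHtmlOnly(config);
        RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
        controller.addSeed("https://example.com/"); // illustrative seed
        controller.start(MyCrawler.class, 1);
    }
}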