
Example 1 with PageBiggerThanMaxSizeException

Use of edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException in project crawler4j by yasserg.

The class RobotstxtServer, method fetchDirectives.

private HostDirectives fetchDirectives(URL url) {
    WebURL robotsTxtUrl = new WebURL();
    String host = getHost(url);
    String port = ((url.getPort() == url.getDefaultPort()) || (url.getPort() == -1)) ? "" : (":" + url.getPort());
    String proto = url.getProtocol();
    robotsTxtUrl.setURL(proto + "://" + host + port + "/robots.txt");
    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
        for (int redir = 0; redir < 3; ++redir) {
            fetchResult = pageFetcher.fetchPage(robotsTxtUrl);
            int status = fetchResult.getStatusCode();
            // Follow redirects up to 3 levels
            if ((status == HttpStatus.SC_MULTIPLE_CHOICES ||
                    status == HttpStatus.SC_MOVED_PERMANENTLY ||
                    status == HttpStatus.SC_MOVED_TEMPORARILY ||
                    status == HttpStatus.SC_SEE_OTHER ||
                    status == HttpStatus.SC_TEMPORARY_REDIRECT ||
                    status == 308) && // 308 = SC_PERMANENT_REDIRECT (RFC 7538)
                    fetchResult.getMovedToUrl() != null) {
                robotsTxtUrl.setURL(fetchResult.getMovedToUrl());
                fetchResult.discardContentIfNotConsumed();
            } else {
                // Done on all other occasions
                break;
            }
        }
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            Page page = new Page(robotsTxtUrl);
            // Most recent answer on robots.txt max size is
            // https://goo.gl/OqpKbP (here: 10_000 * 1024 bytes, roughly 10 MB)
            fetchResult.fetchContent(page, 10_000 * 1024);
            if (Util.hasPlainTextContent(page.getContentType())) {
                String content;
                if (page.getContentCharset() == null) {
                    content = new String(page.getContentData());
                } else {
                    content = new String(page.getContentData(), page.getContentCharset());
                }
                directives = RobotstxtParser.parse(content, config);
            } else if (page.getContentType().contains("html")) {
                // TODO: this should be upgraded to remove all HTML tags
                String content = new String(page.getContentData());
                directives = RobotstxtParser.parse(content, config);
            } else {
                logger.warn("Can't read this robots.txt: {}  as it is not written in plain text, " + "contentType: {}", robotsTxtUrl.getURL(), page.getContentType());
            }
        } else {
            logger.debug("Can't read this robots.txt: {}  as it's status code is {}", robotsTxtUrl.getURL(), fetchResult.getStatusCode());
        }
    } catch (SocketException | UnknownHostException | SocketTimeoutException | NoHttpResponseException se) {
        // No logging here, as it just means that robots.txt doesn't exist on this server
        // which is perfectly ok
        logger.trace("robots.txt probably does not exist.", se);
    } catch (PageBiggerThanMaxSizeException pbtms) {
        logger.error("Error occurred while fetching (robots) url: {}, {}", robotsTxtUrl.getURL(), pbtms.getMessage());
    } catch (Exception e) {
        logger.error("Error occurred while fetching (robots) url: " + robotsTxtUrl.getURL(), e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
    if (directives == null) {
        // We still need to have this object to keep track of the time we fetched it
        directives = new HostDirectives(config);
    }
    synchronized (host2directivesCache) {
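        // Evict the least-recently-accessed host when the cache is full (a simple LRU scan)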
        if (host2directivesCache.size() == config.getCacheSize()) {
            String minHost = null;
            long minAccessTime = Long.MAX_VALUE;
            for (Map.Entry<String, HostDirectives> entry : host2directivesCache.entrySet()) {
                long entryAccessTime = entry.getValue().getLastAccessTime();
                if (entryAccessTime < minAccessTime) {
                    minAccessTime = entryAccessTime;
                    minHost = entry.getKey();
                }
            }
            host2directivesCache.remove(minHost);
        }
        host2directivesCache.put(host, directives);
    }
    return directives;
}
Also used: WebURL (edu.uci.ics.crawler4j.url.WebURL), Page (edu.uci.ics.crawler4j.crawler.Page), PageBiggerThanMaxSizeException (edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException), PageFetchResult (edu.uci.ics.crawler4j.fetcher.PageFetchResult), MalformedURLException (java.net.MalformedURLException), SocketException (java.net.SocketException), SocketTimeoutException (java.net.SocketTimeoutException), UnknownHostException (java.net.UnknownHostException), HashMap (java.util.HashMap), Map (java.util.Map), NoHttpResponseException (org.apache.http.NoHttpResponseException)
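
Note that fetchDirectives is private: it runs lazily the first time RobotstxtServer.allows(...) encounters a new host. A minimal wiring sketch, assuming the standard crawler4j 4.x bootstrap (the storage folder and seed URL are placeholder values, and you would normally pass your own WebCrawler subclass to start):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;

public class RobotstxtWiring {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j"); // placeholder path

        PageFetcher pageFetcher = new PageFetcher(config);

        // RobotstxtConfig is the `config` consulted by fetchDirectives above;
        // its cache size bounds the host2directivesCache that the method maintains.
        RobotstxtConfig robotsConfig = new RobotstxtConfig();
        robotsConfig.setUserAgentName("crawler4j");
        RobotstxtServer robotstxtServer = new RobotstxtServer(robotsConfig, pageFetcher);

        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
        controller.addSeed("https://example.com/"); // placeholder seed
        controller.start(WebCrawler.class, 1); // substitute your own WebCrawler subclass
    }
}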

Example 2 with PageBiggerThanMaxSizeException

Use of edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException in project crawler4j by yasserg.

The class WebCrawler, method processPage.

private void processPage(WebURL curURL) {
    PageFetchResult fetchResult = null;
    try {
        if (curURL == null) {
            return;
        }
        fetchResult = pageFetcher.fetchPage(curURL);
        int statusCode = fetchResult.getStatusCode();
        // Finds the status reason phrase for all known statuses
        handlePageStatusCode(curURL, statusCode,
                EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH));
        Page page = new Page(curURL);
        page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
        page.setStatusCode(statusCode);
        if (statusCode < 200 || statusCode > 299) {
            // Not 2XX: 2XX status codes indicate success
            // Is 3xx; TODO follow https://issues.apache.org/jira/browse/HTTPCORE-389
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY ||
                    statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
                    statusCode == HttpStatus.SC_MULTIPLE_CHOICES ||
                    statusCode == HttpStatus.SC_SEE_OTHER ||
                    statusCode == HttpStatus.SC_TEMPORARY_REDIRECT ||
                    statusCode == 308) { // 308 = SC_PERMANENT_REDIRECT (RFC 7538)
                page.setRedirect(true);
                String movedToUrl = fetchResult.getMovedToUrl();
                if (movedToUrl == null) {
                    logger.warn("Unexpected error, URL: {} is redirected to NOTHING", curURL);
                    return;
                }
                page.setRedirectedToUrl(movedToUrl);
                onRedirectedStatusCode(page);
                if (myController.getConfig().isFollowRedirects()) {
                    int newDocId = docIdServer.getDocId(movedToUrl);
                    if (newDocId > 0) {
                        logger.debug("Redirect page: {} is already seen", curURL);
                        return;
                    }
                    WebURL webURL = new WebURL();
                    webURL.setURL(movedToUrl);
                    webURL.setParentDocid(curURL.getParentDocid());
                    webURL.setParentUrl(curURL.getParentUrl());
                    webURL.setDepth(curURL.getDepth());
                    webURL.setDocid(-1);
                    webURL.setAnchor(curURL.getAnchor());
                    if (shouldVisit(page, webURL)) {
                        if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
                            webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                            frontier.schedule(webURL);
                        } else {
                            logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", webURL.getURL());
                        }
                    } else {
                        logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                    }
                }
            } else {
                // All other HTTP codes (neither 3xx nor 2xx)
                // Finds the status reason phrase for all known statuses
                String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(
                        fetchResult.getStatusCode(), Locale.ENGLISH);
                String contentType = fetchResult.getEntity() == null ? ""
                        : fetchResult.getEntity().getContentType() == null ? ""
                        : fetchResult.getEntity().getContentType().getValue();
                onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description);
            }
        } else {
            // Status code is 2xx
            if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
                if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
                    logger.debug("Redirect page: {} has already been seen", curURL);
                    return;
                }
                curURL.setURL(fetchResult.getFetchedUrl());
                curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
            }
            if (!fetchResult.fetchContent(page, myController.getConfig().getMaxDownloadSize())) {
                throw new ContentFetchException();
            }
            if (page.isTruncated()) {
                logger.warn("Warning: unknown page size exceeded max-download-size, truncated to: " + "({}), at URL: {}", myController.getConfig().getMaxDownloadSize(), curURL.getURL());
            }
            parser.parse(page, curURL.getURL());
            if (shouldFollowLinksIn(page.getWebURL())) {
                ParseData parseData = page.getParseData();
                List<WebURL> toSchedule = new ArrayList<>();
                int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
                for (WebURL webURL : parseData.getOutgoingUrls()) {
                    webURL.setParentDocid(curURL.getDocid());
                    webURL.setParentUrl(curURL.getURL());
                    int newdocid = docIdServer.getDocId(webURL.getURL());
                    if (newdocid > 0) {
                        // This is not the first time that this Url is visited. So, we set the
                        // depth to a negative number.
                        webURL.setDepth((short) -1);
                        webURL.setDocid(newdocid);
                    } else {
                        webURL.setDocid(-1);
                        webURL.setDepth((short) (curURL.getDepth() + 1));
                        if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
                            if (shouldVisit(page, webURL)) {
                                if (robotstxtServer.allows(webURL)) {
                                    webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                                    toSchedule.add(webURL);
                                } else {
                                    logger.debug("Not visiting: {} as per the server's \"robots.txt\" " + "policy", webURL.getURL());
                                }
                            } else {
                                logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                            }
                        }
                    }
                }
                frontier.scheduleAll(toSchedule);
            } else {
                logger.debug("Not looking for links in page {}, " + "as per your \"shouldFollowLinksInPage\" policy", page.getWebURL().getURL());
            }
            visit(page);
        }
    } catch (PageBiggerThanMaxSizeException e) {
        onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
    } catch (ParseException pe) {
        onParseError(curURL);
    } catch (ContentFetchException cfe) {
        onContentFetchError(curURL);
    } catch (NotAllowedContentException nace) {
        logger.debug("Skipping: {} as it contains binary content which you configured not to crawl", curURL.getURL());
    } catch (Exception e) {
        onUnhandledException(curURL, e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}
Also used: WebURL (edu.uci.ics.crawler4j.url.WebURL), ContentFetchException (edu.uci.ics.crawler4j.crawler.exceptions.ContentFetchException), PageBiggerThanMaxSizeException (edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException), ParseException (edu.uci.ics.crawler4j.crawler.exceptions.ParseException), PageFetchResult (edu.uci.ics.crawler4j.fetcher.PageFetchResult), NotAllowedContentException (edu.uci.ics.crawler4j.parser.NotAllowedContentException), ParseData (edu.uci.ics.crawler4j.parser.ParseData), ArrayList (java.util.ArrayList)
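
The catch block above routes the exception to the onPageBiggerThanMaxSize callback, so a crawler can react to oversized pages without touching processPage. A minimal sketch of a subclass override (the log messages and visit body are illustrative, not from the source):

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;

public class SizeAwareCrawler extends WebCrawler {

    @Override
    protected void onPageBiggerThanMaxSize(String urlStr, long pageSize) {
        // processPage calls this from its catch (PageBiggerThanMaxSizeException e) block
        logger.warn("Skipping {}: reported size {} bytes exceeds maxDownloadSize", urlStr, pageSize);
    }

    @Override
    public void visit(Page page) {
        // Pages within the size limit still reach visit() as usual
        logger.info("Visited: {}", page.getWebURL().getURL());
    }
}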

Example 3 with PageBiggerThanMaxSizeException

Use of edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException in project crawler4j by yasserg.

The class PageFetcher, method fetchPage.

public PageFetchResult fetchPage(WebURL webUrl) throws InterruptedException, IOException, PageBiggerThanMaxSizeException {
    // Getting URL, setting headers & content
    PageFetchResult fetchResult = new PageFetchResult();
    String toFetchURL = webUrl.getURL();
    HttpUriRequest request = null;
    try {
        request = newHttpUriRequest(toFetchURL);
        // Applying Politeness delay
        synchronized (mutex) {
            long now = (new Date()).getTime();
            if ((now - lastFetchTime) < config.getPolitenessDelay()) {
                Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime));
            }
            lastFetchTime = (new Date()).getTime();
        }
        CloseableHttpResponse response = httpClient.execute(request);
        fetchResult.setEntity(response.getEntity());
        fetchResult.setResponseHeaders(response.getAllHeaders());
        // Setting HttpStatus
        int statusCode = response.getStatusLine().getStatusCode();
        // If redirect (3xx); TODO follow https://issues.apache.org/jira/browse/HTTPCORE-389
        if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY ||
                statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
                statusCode == HttpStatus.SC_MULTIPLE_CHOICES ||
                statusCode == HttpStatus.SC_SEE_OTHER ||
                statusCode == HttpStatus.SC_TEMPORARY_REDIRECT ||
                statusCode == 308) { // 308 = SC_PERMANENT_REDIRECT (RFC 7538)
            Header header = response.getFirstHeader("Location");
            if (header != null) {
                String movedToUrl = URLCanonicalizer.getCanonicalURL(header.getValue(), toFetchURL);
                fetchResult.setMovedToUrl(movedToUrl);
            }
        } else if (statusCode >= 200 && statusCode <= 299) {
            // is 2XX, everything looks ok
            fetchResult.setFetchedUrl(toFetchURL);
            String uri = request.getURI().toString();
            if (!uri.equals(toFetchURL)) {
                if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
                    fetchResult.setFetchedUrl(uri);
                }
            }
            // Checking maximum size
            if (fetchResult.getEntity() != null) {
                long size = fetchResult.getEntity().getContentLength();
                if (size == -1) {
                    Header length = response.getLastHeader("Content-Length");
                    if (length == null) {
                        length = response.getLastHeader("Content-length");
                    }
                    if (length != null) {
                        // Parse as long: Content-Length can exceed Integer.MAX_VALUE
                        size = Long.parseLong(length.getValue());
                    }
                }
                if (size > config.getMaxDownloadSize()) {
                    //fix issue #52 - consume entity
                    response.close();
                    throw new PageBiggerThanMaxSizeException(size);
                }
            }
        }
        fetchResult.setStatusCode(statusCode);
        return fetchResult;
    } finally {
        // occurs also with thrown exceptions
        if ((fetchResult.getEntity() == null) && (request != null)) {
            request.abort();
        }
    }
}
Also used: HttpUriRequest (org.apache.http.client.methods.HttpUriRequest), PageBiggerThanMaxSizeException (edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException), Header (org.apache.http.Header), CloseableHttpResponse (org.apache.http.client.methods.CloseableHttpResponse), Date (java.util.Date)
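
Callers of fetchPage must handle the size check themselves, since the exception is thrown from the Content-Length check before the body is consumed. A minimal standalone sketch, assuming crawler4j 4.x (the URL and the 1 MB limit are illustrative values):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.url.WebURL;

public class FetchExample {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setMaxDownloadSize(1024 * 1024); // reject anything over ~1 MB

        PageFetcher fetcher = new PageFetcher(config);
        WebURL url = new WebURL();
        url.setURL("https://example.com/"); // placeholder target

        PageFetchResult result = null;
        try {
            result = fetcher.fetchPage(url);
            System.out.println("Status: " + result.getStatusCode());
        } catch (PageBiggerThanMaxSizeException e) {
            // Thrown before the body is downloaded, based on the reported Content-Length
            System.err.println("Too big: " + e.getPageSize() + " bytes");
        } finally {
            if (result != null) {
                result.discardContentIfNotConsumed();
            }
            fetcher.shutDown();
        }
    }
}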

Aggregations

PageBiggerThanMaxSizeException (edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException): 3 uses
PageFetchResult (edu.uci.ics.crawler4j.fetcher.PageFetchResult): 2 uses
WebURL (edu.uci.ics.crawler4j.url.WebURL): 2 uses
Page (edu.uci.ics.crawler4j.crawler.Page): 1 use
ContentFetchException (edu.uci.ics.crawler4j.crawler.exceptions.ContentFetchException): 1 use
ParseException (edu.uci.ics.crawler4j.crawler.exceptions.ParseException): 1 use
NotAllowedContentException (edu.uci.ics.crawler4j.parser.NotAllowedContentException): 1 use
ParseData (edu.uci.ics.crawler4j.parser.ParseData): 1 use
MalformedURLException (java.net.MalformedURLException): 1 use
SocketException (java.net.SocketException): 1 use
SocketTimeoutException (java.net.SocketTimeoutException): 1 use
UnknownHostException (java.net.UnknownHostException): 1 use
ArrayList (java.util.ArrayList): 1 use
Date (java.util.Date): 1 use
HashMap (java.util.HashMap): 1 use
Map (java.util.Map): 1 use
Header (org.apache.http.Header): 1 use
NoHttpResponseException (org.apache.http.NoHttpResponseException): 1 use
CloseableHttpResponse (org.apache.http.client.methods.CloseableHttpResponse): 1 use
HttpUriRequest (org.apache.http.client.methods.HttpUriRequest): 1 use