
Example 1 with PageFetchResult

Use of edu.uci.ics.crawler4j.fetcher.PageFetchResult in project crawler4j by yasserg.

The class RobotstxtServer, method fetchDirectives:

private HostDirectives fetchDirectives(URL url) {
    WebURL robotsTxtUrl = new WebURL();
    String host = getHost(url);
    String port = ((url.getPort() == url.getDefaultPort()) || (url.getPort() == -1)) ? "" : (":" + url.getPort());
    String proto = url.getProtocol();
    robotsTxtUrl.setURL(proto + "://" + host + port + "/robots.txt");
    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
        for (int redir = 0; redir < 3; ++redir) {
            fetchResult = pageFetcher.fetchPage(robotsTxtUrl);
            int status = fetchResult.getStatusCode();
            // Follow redirects up to 3 levels
            if ((status == HttpStatus.SC_MULTIPLE_CHOICES || status == HttpStatus.SC_MOVED_PERMANENTLY ||
                    status == HttpStatus.SC_MOVED_TEMPORARILY || status == HttpStatus.SC_SEE_OTHER ||
                    status == HttpStatus.SC_TEMPORARY_REDIRECT || status == 308) && // 308: SC_PERMANENT_REDIRECT, RFC 7538
                    fetchResult.getMovedToUrl() != null) {
                robotsTxtUrl.setURL(fetchResult.getMovedToUrl());
                fetchResult.discardContentIfNotConsumed();
            } else {
                // Done on all other occasions
                break;
            }
        }
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            Page page = new Page(robotsTxtUrl);
            // Most recent answer on robots.txt max size is
            // https://goo.gl/OqpKbP
            fetchResult.fetchContent(page, 10_000 * 1024);
            if (Util.hasPlainTextContent(page.getContentType())) {
                String content;
                if (page.getContentCharset() == null) {
                    content = new String(page.getContentData());
                } else {
                    content = new String(page.getContentData(), page.getContentCharset());
                }
                directives = RobotstxtParser.parse(content, config);
            } else if (page.getContentType().contains("html")) {
                // TODO This one should be upgraded to remove all html tags
                String content = new String(page.getContentData());
                directives = RobotstxtParser.parse(content, config);
            } else {
                logger.warn("Can't read this robots.txt: {}  as it is not written in plain text, " + "contentType: {}", robotsTxtUrl.getURL(), page.getContentType());
            }
        } else {
            logger.debug("Can't read this robots.txt: {}  as it's status code is {}", robotsTxtUrl.getURL(), fetchResult.getStatusCode());
        }
    } catch (SocketException | UnknownHostException | SocketTimeoutException | NoHttpResponseException se) {
        // No logging here, as it just means that robots.txt doesn't exist on this server
        // which is perfectly ok
        logger.trace("robots.txt probably does not exist.", se);
    } catch (PageBiggerThanMaxSizeException pbtms) {
        logger.error("Error occurred while fetching (robots) url: {}, {}", robotsTxtUrl.getURL(), pbtms.getMessage());
    } catch (Exception e) {
        logger.error("Error occurred while fetching (robots) url: " + robotsTxtUrl.getURL(), e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
    if (directives == null) {
        // We still need to have this object to keep track of the time we fetched it
        directives = new HostDirectives(config);
    }
    synchronized (host2directivesCache) {
        if (host2directivesCache.size() == config.getCacheSize()) {
            String minHost = null;
            long minAccessTime = Long.MAX_VALUE;
            for (Map.Entry<String, HostDirectives> entry : host2directivesCache.entrySet()) {
                long entryAccessTime = entry.getValue().getLastAccessTime();
                if (entryAccessTime < minAccessTime) {
                    minAccessTime = entryAccessTime;
                    minHost = entry.getKey();
                }
            }
            host2directivesCache.remove(minHost);
        }
        host2directivesCache.put(host, directives);
    }
    return directives;
}
Also used : NoHttpResponseException(org.apache.http.NoHttpResponseException) SocketException(java.net.SocketException) SocketTimeoutException(java.net.SocketTimeoutException) UnknownHostException(java.net.UnknownHostException) MalformedURLException(java.net.MalformedURLException) PageBiggerThanMaxSizeException(edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException) WebURL(edu.uci.ics.crawler4j.url.WebURL) Page(edu.uci.ics.crawler4j.crawler.Page) PageFetchResult(edu.uci.ics.crawler4j.fetcher.PageFetchResult) HashMap(java.util.HashMap) Map(java.util.Map)
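
For orientation, here is a minimal sketch (not taken from the project itself) of how a RobotstxtServer like the one above is typically wired together and consulted. The storage folder, user-agent name, and URL are placeholder values; allows() is the public entry point that, on a cache miss, ends up calling fetchDirectives() as shown above.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

public class RobotsCheckSketch {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j");  // placeholder storage folder

        // The fetcher that fetchDirectives() uses to download /robots.txt
        PageFetcher pageFetcher = new PageFetcher(config);

        RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
        robotstxtConfig.setUserAgentName("crawler4j");   // agent name matched against robots.txt groups

        RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

        WebURL url = new WebURL();
        url.setURL("https://example.com/some/page.html"); // placeholder URL

        // allows() consults the per-host cache; on a miss it calls fetchDirectives(),
        // which downloads and parses robots.txt for that host.
        boolean permitted = robotstxtServer.allows(url);
        System.out.println(url.getURL() + " allowed by robots.txt: " + permitted);

        pageFetcher.shutDown();
    }
}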

Example 2 with PageFetchResult

Use of edu.uci.ics.crawler4j.fetcher.PageFetchResult in project crawler4j by yasserg.

The class WebCrawler, method processPage:

private void processPage(WebURL curURL) {
    PageFetchResult fetchResult = null;
    try {
        if (curURL == null) {
            return;
        }
        fetchResult = pageFetcher.fetchPage(curURL);
        int statusCode = fetchResult.getStatusCode();
        // Finds the status reason for all known statuses
        handlePageStatusCode(curURL, statusCode, EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH));
        Page page = new Page(curURL);
        page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
        page.setStatusCode(statusCode);
        if (statusCode < 200 || statusCode > 299) {
            // Not 2XX: 2XX status codes indicate success
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
                    statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER ||
                    statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) { // 308: SC_PERMANENT_REDIRECT
                // is 3xx; TODO: follow https://issues.apache.org/jira/browse/HTTPCORE-389
                page.setRedirect(true);
                String movedToUrl = fetchResult.getMovedToUrl();
                if (movedToUrl == null) {
                    logger.warn("Unexpected error, URL: {} is redirected to NOTHING", curURL);
                    return;
                }
                page.setRedirectedToUrl(movedToUrl);
                onRedirectedStatusCode(page);
                if (myController.getConfig().isFollowRedirects()) {
                    int newDocId = docIdServer.getDocId(movedToUrl);
                    if (newDocId > 0) {
                        logger.debug("Redirect page: {} is already seen", curURL);
                        return;
                    }
                    WebURL webURL = new WebURL();
                    webURL.setURL(movedToUrl);
                    webURL.setParentDocid(curURL.getParentDocid());
                    webURL.setParentUrl(curURL.getParentUrl());
                    webURL.setDepth(curURL.getDepth());
                    webURL.setDocid(-1);
                    webURL.setAnchor(curURL.getAnchor());
                    if (shouldVisit(page, webURL)) {
                        if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
                            webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                            frontier.schedule(webURL);
                        } else {
                            logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", webURL.getURL());
                        }
                    } else {
                        logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                    }
                }
            } else {
                // All other http codes other than 3xx & 200
                // Finds the status reason for all known statuses
                String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(), Locale.ENGLISH);
                String contentType = fetchResult.getEntity() == null ? ""
                        : fetchResult.getEntity().getContentType() == null ? ""
                        : fetchResult.getEntity().getContentType().getValue();
                onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description);
            }
        } else {
            // if status code is 200
            if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
                if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
                    logger.debug("Redirect page: {} has already been seen", curURL);
                    return;
                }
                curURL.setURL(fetchResult.getFetchedUrl());
                curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
            }
            if (!fetchResult.fetchContent(page, myController.getConfig().getMaxDownloadSize())) {
                throw new ContentFetchException();
            }
            if (page.isTruncated()) {
                logger.warn("Warning: unknown page size exceeded max-download-size, truncated to: " + "({}), at URL: {}", myController.getConfig().getMaxDownloadSize(), curURL.getURL());
            }
            parser.parse(page, curURL.getURL());
            if (shouldFollowLinksIn(page.getWebURL())) {
                ParseData parseData = page.getParseData();
                List<WebURL> toSchedule = new ArrayList<>();
                int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
                for (WebURL webURL : parseData.getOutgoingUrls()) {
                    webURL.setParentDocid(curURL.getDocid());
                    webURL.setParentUrl(curURL.getURL());
                    int newdocid = docIdServer.getDocId(webURL.getURL());
                    if (newdocid > 0) {
                        // This is not the first time that this Url is visited. So, we set the
                        // depth to a negative number.
                        webURL.setDepth((short) -1);
                        webURL.setDocid(newdocid);
                    } else {
                        webURL.setDocid(-1);
                        webURL.setDepth((short) (curURL.getDepth() + 1));
                        if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
                            if (shouldVisit(page, webURL)) {
                                if (robotstxtServer.allows(webURL)) {
                                    webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                                    toSchedule.add(webURL);
                                } else {
                                    logger.debug("Not visiting: {} as per the server's \"robots.txt\" " + "policy", webURL.getURL());
                                }
                            } else {
                                logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                            }
                        }
                    }
                }
                frontier.scheduleAll(toSchedule);
            } else {
                logger.debug("Not looking for links in page {}, " + "as per your \"shouldFollowLinksInPage\" policy", page.getWebURL().getURL());
            }
            visit(page);
        }
    } catch (PageBiggerThanMaxSizeException e) {
        onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
    } catch (ParseException pe) {
        onParseError(curURL);
    } catch (ContentFetchException cfe) {
        onContentFetchError(curURL);
    } catch (NotAllowedContentException nace) {
        logger.debug("Skipping: {} as it contains binary content which you configured not to crawl", curURL.getURL());
    } catch (Exception e) {
        onUnhandledException(curURL, e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}
Also used : NotAllowedContentException(edu.uci.ics.crawler4j.parser.NotAllowedContentException) ParseException(edu.uci.ics.crawler4j.crawler.exceptions.ParseException) ContentFetchException(edu.uci.ics.crawler4j.crawler.exceptions.ContentFetchException) PageBiggerThanMaxSizeException(edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException) PageFetchResult(edu.uci.ics.crawler4j.fetcher.PageFetchResult) ParseData(edu.uci.ics.crawler4j.parser.ParseData) WebURL(edu.uci.ics.crawler4j.url.WebURL) ArrayList(java.util.ArrayList)
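
processPage() ultimately drives the callbacks that a crawler subclass overrides: shouldVisit() filters outgoing links before they are scheduled, and visit() receives each successfully fetched and parsed page. Below is a minimal sketch of such a subclass; the seed domain and file-extension filter are illustrative choices, not part of the project.

import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    // Skip obviously binary/static resources (illustrative filter)
    private static final Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|ico|zip|gz))$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Called from processPage() for every outgoing link before it is scheduled
        String href = url.getURL().toLowerCase();
        return !FILTERS.matcher(href).matches()
                && href.startsWith("https://example.com/");   // placeholder seed domain
    }

    @Override
    public void visit(Page page) {
        // Called from processPage() once the page has been fetched and parsed
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData html = (HtmlParseData) page.getParseData();
            logger.info("Visited {} ({} outgoing links)",
                    page.getWebURL().getURL(), html.getOutgoingUrls().size());
        }
    }
}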

Example 3 with PageFetchResult

Use of edu.uci.ics.crawler4j.fetcher.PageFetchResult in project crawler4j by yasserg.

The class Downloader, method download:

private Page download(String url) {
    WebURL curURL = new WebURL();
    curURL.setURL(url);
    PageFetchResult fetchResult = null;
    try {
        fetchResult = pageFetcher.fetchPage(curURL);
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            Page page = new Page(curURL);
            fetchResult.fetchContent(page, pageFetcher.getConfig().getMaxDownloadSize());
            parser.parse(page, curURL.getURL());
            return page;
        }
    } catch (Exception e) {
        logger.error("Error occurred while fetching url: " + curURL.getURL(), e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
    return null;
}
Also used : PageFetchResult(edu.uci.ics.crawler4j.fetcher.PageFetchResult) WebURL(edu.uci.ics.crawler4j.url.WebURL) Page(edu.uci.ics.crawler4j.crawler.Page)
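
The snippet does not show how the pageFetcher and parser fields are created. Here is a self-contained sketch of the same fetch-then-parse flow, assuming the standard PageFetcher(CrawlConfig) and Parser(CrawlConfig) constructors; the URL is a placeholder.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.Parser;
import edu.uci.ics.crawler4j.url.WebURL;
import org.apache.http.HttpStatus;

public class SinglePageDownload {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();

        // The two collaborators that download() relies on
        PageFetcher pageFetcher = new PageFetcher(config);
        Parser parser = new Parser(config);

        WebURL curURL = new WebURL();
        curURL.setURL("https://example.com/");   // placeholder URL

        PageFetchResult fetchResult = null;
        try {
            fetchResult = pageFetcher.fetchPage(curURL);
            if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
                Page page = new Page(curURL);
                fetchResult.fetchContent(page, config.getMaxDownloadSize());
                parser.parse(page, curURL.getURL());
                if (page.getParseData() instanceof HtmlParseData) {
                    System.out.println(((HtmlParseData) page.getParseData()).getTitle());
                }
            }
        } finally {
            if (fetchResult != null) {
                fetchResult.discardContentIfNotConsumed();   // always release the connection
            }
            pageFetcher.shutDown();
        }
    }
}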

Example 4 with PageFetchResult

Use of edu.uci.ics.crawler4j.fetcher.PageFetchResult in project crawler4j by yasserg.

The class PageFetcherHtmlOnly, method fetchPage:

@Override
public PageFetchResult fetchPage(WebURL webUrl) throws InterruptedException, IOException, PageBiggerThanMaxSizeException {
    String toFetchURL = webUrl.getURL();
    PageFetchResult fetchResult = new PageFetchResult();
    HttpHead head = null;
    try {
        head = new HttpHead(toFetchURL);
        synchronized (mutex) {
            long now = new Date().getTime();
            if (now - this.lastFetchTime < this.config.getPolitenessDelay()) {
                Thread.sleep(this.config.getPolitenessDelay() - (now - this.lastFetchTime));
            }
            this.lastFetchTime = new Date().getTime();
        }
        HttpResponse response = httpClient.execute(head);
        fetchResult.setEntity(response.getEntity());
        fetchResult.setResponseHeaders(response.getAllHeaders());
        fetchResult.setFetchedUrl(toFetchURL);
        fetchResult.setStatusCode(response.getStatusLine().getStatusCode());
        String contentType = response.containsHeader("Content-Type") ? response.getFirstHeader("Content-Type").getValue() : null;
        String typeStr = (contentType != null) ? contentType.toLowerCase() : "";
        if (typeStr.equals("") || (typeStr.contains("text") && typeStr.contains("html"))) {
            return super.fetchPage(webUrl);
        } else {
            return fetchResult;
        }
    } finally {
        if (head != null) {
            head.abort();
        }
    }
}
Also used : PageFetchResult(edu.uci.ics.crawler4j.fetcher.PageFetchResult) HttpResponse(org.apache.http.HttpResponse) HttpHead(org.apache.http.client.methods.HttpHead) Date(java.util.Date)
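
A hedged sketch of how a fetcher like this could be swapped in for the default one when building a crawl, assuming PageFetcherHtmlOnly exposes a (CrawlConfig) constructor like its parent PageFetcher. MyCrawler stands in for any WebCrawler subclass (for example the one sketched after Example 2); the storage folder and seed URL are placeholders.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
// Imports for PageFetcherHtmlOnly and MyCrawler depend on where those classes live in your project.

public class HtmlOnlyCrawlSketch {
    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j");          // placeholder folder

        // Swap in the HEAD-probing fetcher instead of the default PageFetcher;
        // assumes PageFetcherHtmlOnly(CrawlConfig) exists, mirroring its parent.
        PageFetcher pageFetcher = new PageFetcherHtmlOnly(config);

        RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);

        controller.addSeed("https://example.com/");              // placeholder seed
        controller.start(MyCrawler.class, 2);                    // MyCrawler: your WebCrawler subclass
    }
}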

Aggregations

PageFetchResult (edu.uci.ics.crawler4j.fetcher.PageFetchResult): 4 usages
WebURL (edu.uci.ics.crawler4j.url.WebURL): 3 usages
Page (edu.uci.ics.crawler4j.crawler.Page): 2 usages
PageBiggerThanMaxSizeException (edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException): 2 usages
ContentFetchException (edu.uci.ics.crawler4j.crawler.exceptions.ContentFetchException): 1 usage
ParseException (edu.uci.ics.crawler4j.crawler.exceptions.ParseException): 1 usage
NotAllowedContentException (edu.uci.ics.crawler4j.parser.NotAllowedContentException): 1 usage
ParseData (edu.uci.ics.crawler4j.parser.ParseData): 1 usage
MalformedURLException (java.net.MalformedURLException): 1 usage
SocketException (java.net.SocketException): 1 usage
SocketTimeoutException (java.net.SocketTimeoutException): 1 usage
UnknownHostException (java.net.UnknownHostException): 1 usage
ArrayList (java.util.ArrayList): 1 usage
Date (java.util.Date): 1 usage
HashMap (java.util.HashMap): 1 usage
Map (java.util.Map): 1 usage
HttpResponse (org.apache.http.HttpResponse): 1 usage
NoHttpResponseException (org.apache.http.NoHttpResponseException): 1 usage
HttpHead (org.apache.http.client.methods.HttpHead): 1 usage