
Example 1 with ParseData

Use of edu.uci.ics.crawler4j.parser.ParseData in project crawler4j by yasserg.

From the class WebCrawler, method processPage:

private void processPage(WebURL curURL) {
    PageFetchResult fetchResult = null;
    try {
        if (curURL == null) {
            return;
        }
        fetchResult = pageFetcher.fetchPage(curURL);
        int statusCode = fetchResult.getStatusCode();
        // Finds the status reason for all known statuses
        handlePageStatusCode(curURL, statusCode, EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH));
        Page page = new Page(curURL);
        page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
        page.setStatusCode(statusCode);
        if (statusCode < 200 || statusCode > 299) {
            // Not 2XX: 2XX status codes indicate success
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY ||
                    statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
                    statusCode == HttpStatus.SC_MULTIPLE_CHOICES ||
                    statusCode == HttpStatus.SC_SEE_OTHER ||
                    statusCode == HttpStatus.SC_TEMPORARY_REDIRECT ||
                    statusCode == 308) {
                // is 3xx; todo: follow https://issues.apache.org/jira/browse/HTTPCORE-389
                page.setRedirect(true);
                String movedToUrl = fetchResult.getMovedToUrl();
                if (movedToUrl == null) {
                    logger.warn("Unexpected error, URL: {} is redirected to NOTHING", curURL);
                    return;
                }
                page.setRedirectedToUrl(movedToUrl);
                onRedirectedStatusCode(page);
                if (myController.getConfig().isFollowRedirects()) {
                    int newDocId = docIdServer.getDocId(movedToUrl);
                    if (newDocId > 0) {
                        logger.debug("Redirect page: {} is already seen", curURL);
                        return;
                    }
                    WebURL webURL = new WebURL();
                    webURL.setURL(movedToUrl);
                    webURL.setParentDocid(curURL.getParentDocid());
                    webURL.setParentUrl(curURL.getParentUrl());
                    webURL.setDepth(curURL.getDepth());
                    webURL.setDocid(-1);
                    webURL.setAnchor(curURL.getAnchor());
                    if (shouldVisit(page, webURL)) {
                        if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
                            webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                            frontier.schedule(webURL);
                        } else {
                            logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", webURL.getURL());
                        }
                    } else {
                        logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                    }
                }
            } else {
                // All other http codes other than 3xx & 200
                // Finds the status reason for all known statuses
                String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(), Locale.ENGLISH);
                String contentType = fetchResult.getEntity() == null ? ""
                        : fetchResult.getEntity().getContentType() == null ? ""
                        : fetchResult.getEntity().getContentType().getValue();
                onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description);
            }
        } else {
            // if status code is 200
            if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
                if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
                    logger.debug("Redirect page: {} has already been seen", curURL);
                    return;
                }
                curURL.setURL(fetchResult.getFetchedUrl());
                curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
            }
            if (!fetchResult.fetchContent(page, myController.getConfig().getMaxDownloadSize())) {
                throw new ContentFetchException();
            }
            if (page.isTruncated()) {
                logger.warn("Warning: unknown page size exceeded max-download-size, truncated to: " + "({}), at URL: {}", myController.getConfig().getMaxDownloadSize(), curURL.getURL());
            }
            parser.parse(page, curURL.getURL());
            if (shouldFollowLinksIn(page.getWebURL())) {
                ParseData parseData = page.getParseData();
                List<WebURL> toSchedule = new ArrayList<>();
                int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
                for (WebURL webURL : parseData.getOutgoingUrls()) {
                    webURL.setParentDocid(curURL.getDocid());
                    webURL.setParentUrl(curURL.getURL());
                    int newdocid = docIdServer.getDocId(webURL.getURL());
                    if (newdocid > 0) {
                        // This is not the first time that this Url is visited. So, we set the
                        // depth to a negative number.
                        webURL.setDepth((short) -1);
                        webURL.setDocid(newdocid);
                    } else {
                        webURL.setDocid(-1);
                        webURL.setDepth((short) (curURL.getDepth() + 1));
                        if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
                            if (shouldVisit(page, webURL)) {
                                if (robotstxtServer.allows(webURL)) {
                                    webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                                    toSchedule.add(webURL);
                                } else {
                                    logger.debug("Not visiting: {} as per the server's \"robots.txt\" " + "policy", webURL.getURL());
                                }
                            } else {
                                logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                            }
                        }
                    }
                }
                frontier.scheduleAll(toSchedule);
            } else {
                logger.debug("Not looking for links in page {}, " + "as per your \"shouldFollowLinksInPage\" policy", page.getWebURL().getURL());
            }
            visit(page);
        }
    } catch (PageBiggerThanMaxSizeException e) {
        onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
    } catch (ParseException pe) {
        onParseError(curURL);
    } catch (ContentFetchException cfe) {
        onContentFetchError(curURL);
    } catch (NotAllowedContentException nace) {
        logger.debug("Skipping: {} as it contains binary content which you configured not to crawl", curURL.getURL());
    } catch (Exception e) {
        onUnhandledException(curURL, e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}
Also used: PageFetchResult (edu.uci.ics.crawler4j.fetcher.PageFetchResult), ParseData (edu.uci.ics.crawler4j.parser.ParseData), NotAllowedContentException (edu.uci.ics.crawler4j.parser.NotAllowedContentException), ContentFetchException (edu.uci.ics.crawler4j.crawler.exceptions.ContentFetchException), PageBiggerThanMaxSizeException (edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException), ParseException (edu.uci.ics.crawler4j.crawler.exceptions.ParseException), WebURL (edu.uci.ics.crawler4j.url.WebURL), ArrayList (java.util.ArrayList)
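
For context, processPage ultimately hands the parsed page to visit(page), so the usual consumer of ParseData is a WebCrawler subclass. The sketch below is a minimal, illustrative subclass: the class name MyCrawler and the extension filter are assumptions and are not part of the crawler4j source above.

import java.util.Set;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.ParseData;
import edu.uci.ics.crawler4j.url.WebURL;

// Illustrative subclass; the class name and the filter pattern are assumptions.
public class MyCrawler extends WebCrawler {

    private static final Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpe?g|png|zip|pdf))$");

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Consulted by processPage before scheduling outgoing links.
        return !FILTERS.matcher(url.getURL().toLowerCase()).matches();
    }

    @Override
    public void visit(Page page) {
        // Called by processPage after a successful fetch and parse.
        ParseData parseData = page.getParseData();
        if (parseData instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) parseData;
            Set<WebURL> links = htmlParseData.getOutgoingUrls();
            logger.info("Visited: {} (title: {}, {} outgoing links)", page.getWebURL().getURL(), htmlParseData.getTitle(), links.size());
        }
    }
}

In a full setup, the subclass is passed to CrawlController.start(MyCrawler.class, numberOfCrawlers), which is what triggers processPage for each scheduled URL.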

Example 2 with ParseData

Use of edu.uci.ics.crawler4j.parser.ParseData in project crawler4j by yasserg.

From the class Downloader, method processUrl:

public void processUrl(String url) {
    logger.debug("Processing: {}", url);
    Page page = download(url);
    if (page != null) {
        ParseData parseData = page.getParseData();
        if (parseData != null) {
            if (parseData instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) parseData;
                logger.debug("Title: {}", htmlParseData.getTitle());
                logger.debug("Text length: {}", htmlParseData.getText().length());
                logger.debug("Html length: {}", htmlParseData.getHtml().length());
            }
        } else {
            logger.warn("Couldn't parse the content of the page.");
        }
    } else {
        logger.warn("Couldn't fetch the content of the page.");
    }
    logger.debug("==============");
}
Also used: ParseData (edu.uci.ics.crawler4j.parser.ParseData), HtmlParseData (edu.uci.ics.crawler4j.parser.HtmlParseData), Page (edu.uci.ics.crawler4j.crawler.Page)
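
The download(url) helper called by processUrl is not shown in this example. The sketch below shows one way such a helper could be assembled from crawler4j's PageFetcher and Parser, mirroring the fetch/parse sequence in WebCrawler.processPage from Example 1. The class name SimpleDownloader is an assumption, the PageFetcher and Parser instances are supplied by the caller because their constructor signatures may differ between crawler4j versions, and the whole class is an illustrative sketch rather than the project's actual Downloader implementation.

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.parser.Parser;
import edu.uci.ics.crawler4j.url.WebURL;

// Illustrative helper, not the project's Downloader class.
public class SimpleDownloader {

    private final CrawlConfig config;
    private final PageFetcher pageFetcher;
    private final Parser parser;

    // The caller builds the PageFetcher and Parser (typically from the same CrawlConfig).
    public SimpleDownloader(CrawlConfig config, PageFetcher pageFetcher, Parser parser) {
        this.config = config;
        this.pageFetcher = pageFetcher;
        this.parser = parser;
    }

    // Returns a parsed Page, or null if fetching or parsing fails.
    public Page download(String url) {
        WebURL curURL = new WebURL();
        curURL.setURL(url);
        PageFetchResult fetchResult = null;
        try {
            fetchResult = pageFetcher.fetchPage(curURL);
            if (fetchResult.getStatusCode() == 200) {
                Page page = new Page(curURL);
                // Same fetch/parse sequence as in WebCrawler.processPage above.
                if (fetchResult.fetchContent(page, config.getMaxDownloadSize())) {
                    parser.parse(page, url);
                    return page;
                }
            }
        } catch (Exception e) {
            // Fetch or parse failed; processUrl logs a warning when null is returned.
        } finally {
            if (fetchResult != null) {
                fetchResult.discardContentIfNotConsumed();
            }
        }
        return null;
    }
}

With a helper shaped like this, processUrl works unchanged: it inspects page.getParseData() and logs the HtmlParseData fields exactly as shown above.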

Aggregations

ParseData (edu.uci.ics.crawler4j.parser.ParseData): 2 examples
Page (edu.uci.ics.crawler4j.crawler.Page): 1 example
ContentFetchException (edu.uci.ics.crawler4j.crawler.exceptions.ContentFetchException): 1 example
PageBiggerThanMaxSizeException (edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException): 1 example
ParseException (edu.uci.ics.crawler4j.crawler.exceptions.ParseException): 1 example
PageFetchResult (edu.uci.ics.crawler4j.fetcher.PageFetchResult): 1 example
HtmlParseData (edu.uci.ics.crawler4j.parser.HtmlParseData): 1 example
NotAllowedContentException (edu.uci.ics.crawler4j.parser.NotAllowedContentException): 1 example
WebURL (edu.uci.ics.crawler4j.url.WebURL): 1 example
ArrayList (java.util.ArrayList): 1 example