Example 1 with ParseException

Use of edu.uci.ics.crawler4j.crawler.exceptions.ParseException in project crawler4j by yasserg.

From the class Parser, the method parse:

public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
    if (Util.hasBinaryContent(page.getContentType())) {
        // Binary content (e.g. an image or PDF)
        BinaryParseData parseData = new BinaryParseData();
        if (config.isIncludeBinaryContentInCrawling()) {
            if (config.isProcessBinaryContentInCrawling()) {
                parseData.setBinaryContent(page.getContentData());
            } else {
                parseData.setHtml("<html></html>");
            }
            page.setParseData(parseData);
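            // getHtml() is still null here only if the binary content could not be converted to HTML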
            if (parseData.getHtml() == null) {
                throw new ParseException();
            }
            parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
        } else {
            throw new NotAllowedContentException();
        }
    } else if (Util.hasPlainTextContent(page.getContentType())) {
        // Plain text content
        try {
            TextParseData parseData = new TextParseData();
            if (page.getContentCharset() == null) {
                parseData.setTextContent(new String(page.getContentData()));
            } else {
                parseData.setTextContent(new String(page.getContentData(), page.getContentCharset()));
            }
            parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
            page.setParseData(parseData);
        } catch (Exception e) {
            logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
            throw new ParseException();
        }
    } else {
        // HTML content (the default case)
        Metadata metadata = new Metadata();
        HtmlContentHandler contentHandler = new HtmlContentHandler();
        try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
            htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
        } catch (Exception e) {
            logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
            throw new ParseException();
        }
        if (page.getContentCharset() == null) {
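            // Tika reports the charset it detected under the "Content-Encoding" metadata key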
            page.setContentCharset(metadata.get("Content-Encoding"));
        }
        HtmlParseData parseData = new HtmlParseData();
        parseData.setText(contentHandler.getBodyText().trim());
        parseData.setTitle(metadata.get(DublinCore.TITLE));
        parseData.setMetaTags(contentHandler.getMetaTags());
        // Note: identifying the language takes less than 10 milliseconds
        LanguageIdentifier languageIdentifier = new LanguageIdentifier(parseData.getText());
        page.setLanguage(languageIdentifier.getLanguage());
        Set<WebURL> outgoingUrls = new HashSet<>();
        String baseURL = contentHandler.getBaseUrl();
        if (baseURL != null) {
            contextURL = baseURL;
        }
        int urlCount = 0;
        for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
            String href = urlAnchorPair.getHref();
            if ((href == null) || href.trim().isEmpty()) {
                continue;
            }
            String hrefLoweredCase = href.trim().toLowerCase();
            if (!hrefLoweredCase.contains("javascript:") && !hrefLoweredCase.contains("mailto:") && !hrefLoweredCase.contains("@")) {
                String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
                if (url != null) {
                    WebURL webURL = new WebURL();
                    webURL.setURL(url);
                    webURL.setTag(urlAnchorPair.getTag());
                    webURL.setAnchor(urlAnchorPair.getAnchor());
                    outgoingUrls.add(webURL);
                    urlCount++;
                    if (urlCount > config.getMaxOutgoingLinksToFollow()) {
                        break;
                    }
                }
            }
        }
        parseData.setOutgoingUrls(outgoingUrls);
        try {
            if (page.getContentCharset() == null) {
                parseData.setHtml(new String(page.getContentData()));
            } else {
                parseData.setHtml(new String(page.getContentData(), page.getContentCharset()));
            }
            page.setParseData(parseData);
        } catch (UnsupportedEncodingException e) {
            logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
            throw new ParseException();
        }
    }
}
Also used: Set (java.util.Set), HashSet (java.util.HashSet), ByteArrayInputStream (java.io.ByteArrayInputStream), InputStream (java.io.InputStream), Metadata (org.apache.tika.metadata.Metadata), WebURL (edu.uci.ics.crawler4j.url.WebURL), UnsupportedEncodingException (java.io.UnsupportedEncodingException), ParseException (edu.uci.ics.crawler4j.crawler.exceptions.ParseException), LanguageIdentifier (org.apache.tika.language.LanguageIdentifier)
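
For orientation, here is a minimal sketch of driving Parser.parse directly, outside the crawl loop. It assumes the single-argument Parser(CrawlConfig) constructor and plain setters on Page (setContentType, setContentData), which match how those fields are read above but should be checked against your crawler4j version:

import java.nio.charset.StandardCharsets;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.Parser;
import edu.uci.ics.crawler4j.url.WebURL;

public class ParseSketch {
    public static void main(String[] args) throws Exception {
        WebURL url = new WebURL();
        url.setURL("https://example.com/");

        // Normally the fetcher populates the Page; here we fill it by hand.
        Page page = new Page(url);
        page.setContentType("text/html");
        page.setContentData("<html><body><a href=\"/next\">next</a></body></html>"
                .getBytes(StandardCharsets.UTF_8));

        // parse() throws ParseException on failure and NotAllowedContentException
        // when binary content is excluded by the config.
        Parser parser = new Parser(new CrawlConfig());
        parser.parse(page, url.getURL());

        HtmlParseData parseData = (HtmlParseData) page.getParseData();
        System.out.println("Outgoing links: " + parseData.getOutgoingUrls().size());
    }
}

On an HTML page like this, parse fills an HtmlParseData with the body text, title, meta tags, and the canonicalized outgoing links collected in the loop above.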

Example 2 with ParseException

Use of edu.uci.ics.crawler4j.crawler.exceptions.ParseException in project crawler4j by yasserg.

From the class WebCrawler, the method processPage:

private void processPage(WebURL curURL) {
    PageFetchResult fetchResult = null;
    try {
        if (curURL == null) {
            return;
        }
        fetchResult = pageFetcher.fetchPage(curURL);
        int statusCode = fetchResult.getStatusCode();
        // Finds the status reason for all known statuses
        handlePageStatusCode(curURL, statusCode,
                EnglishReasonPhraseCatalog.INSTANCE.getReason(statusCode, Locale.ENGLISH));
        Page page = new Page(curURL);
        page.setFetchResponseHeaders(fetchResult.getResponseHeaders());
        page.setStatusCode(statusCode);
        if (statusCode < 200 || statusCode > 299) {
            // Not 2XX: 2XX status codes indicate success
            if (statusCode == HttpStatus.SC_MOVED_PERMANENTLY || statusCode == HttpStatus.SC_MOVED_TEMPORARILY ||
                    statusCode == HttpStatus.SC_MULTIPLE_CHOICES || statusCode == HttpStatus.SC_SEE_OTHER ||
                    statusCode == HttpStatus.SC_TEMPORARY_REDIRECT || statusCode == 308) {
                // is 3xx; TODO: use an HttpStatus constant for 308 once
                // https://issues.apache.org/jira/browse/HTTPCORE-389 is resolved
                page.setRedirect(true);
                String movedToUrl = fetchResult.getMovedToUrl();
                if (movedToUrl == null) {
                    logger.warn("Unexpected error, URL: {} is redirected to NOTHING", curURL);
                    return;
                }
                page.setRedirectedToUrl(movedToUrl);
                onRedirectedStatusCode(page);
                if (myController.getConfig().isFollowRedirects()) {
                    int newDocId = docIdServer.getDocId(movedToUrl);
                    if (newDocId > 0) {
                        logger.debug("Redirect page: {} is already seen", curURL);
                        return;
                    }
                    WebURL webURL = new WebURL();
                    webURL.setURL(movedToUrl);
                    webURL.setParentDocid(curURL.getParentDocid());
                    webURL.setParentUrl(curURL.getParentUrl());
                    webURL.setDepth(curURL.getDepth());
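                    // -1: no doc id assigned yet; a real id is assigned below if the URL is scheduled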
                    webURL.setDocid(-1);
                    webURL.setAnchor(curURL.getAnchor());
                    if (shouldVisit(page, webURL)) {
                        if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
                            webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
                            frontier.schedule(webURL);
                        } else {
                            logger.debug("Not visiting: {} as per the server's \"robots.txt\" policy", webURL.getURL());
                        }
                    } else {
                        logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                    }
                }
            } else {
                // Neither 2xx nor a known 3xx redirect
                // Finds the status reason for all known statuses
                String description = EnglishReasonPhraseCatalog.INSTANCE.getReason(fetchResult.getStatusCode(), Locale.ENGLISH);
                String contentType = fetchResult.getEntity() == null ? ""
                        : fetchResult.getEntity().getContentType() == null ? ""
                        : fetchResult.getEntity().getContentType().getValue();
                onUnexpectedStatusCode(curURL.getURL(), fetchResult.getStatusCode(), contentType, description);
            }
        } else {
            // Status code is 2xx (success)
            if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
                if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
                    logger.debug("Redirect page: {} has already been seen", curURL);
                    return;
                }
                curURL.setURL(fetchResult.getFetchedUrl());
                curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
            }
            if (!fetchResult.fetchContent(page, myController.getConfig().getMaxDownloadSize())) {
                throw new ContentFetchException();
            }
            if (page.isTruncated()) {
                logger.warn("Warning: unknown page size exceeded max-download-size, truncated to: " + "({}), at URL: {}", myController.getConfig().getMaxDownloadSize(), curURL.getURL());
            }
            parser.parse(page, curURL.getURL());
            if (shouldFollowLinksIn(page.getWebURL())) {
                ParseData parseData = page.getParseData();
                List<WebURL> toSchedule = new ArrayList<>();
                int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
                for (WebURL webURL : parseData.getOutgoingUrls()) {
                    webURL.setParentDocid(curURL.getDocid());
                    webURL.setParentUrl(curURL.getURL());
                    int newdocid = docIdServer.getDocId(webURL.getURL());
                    if (newdocid > 0) {
                        // This is not the first time that this Url is visited. So, we set the
                        // depth to a negative number.
                        webURL.setDepth((short) -1);
                        webURL.setDocid(newdocid);
                    } else {
                        webURL.setDocid(-1);
                        webURL.setDepth((short) (curURL.getDepth() + 1));
                        if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
                            if (shouldVisit(page, webURL)) {
                                if (robotstxtServer.allows(webURL)) {
                                    webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                                    toSchedule.add(webURL);
                                } else {
                                    logger.debug("Not visiting: {} as per the server's \"robots.txt\" " + "policy", webURL.getURL());
                                }
                            } else {
                                logger.debug("Not visiting: {} as per your \"shouldVisit\" policy", webURL.getURL());
                            }
                        }
                    }
                }
                frontier.scheduleAll(toSchedule);
            } else {
                logger.debug("Not looking for links in page {}, " + "as per your \"shouldFollowLinksInPage\" policy", page.getWebURL().getURL());
            }
            visit(page);
        }
    } catch (PageBiggerThanMaxSizeException e) {
        onPageBiggerThanMaxSize(curURL.getURL(), e.getPageSize());
    } catch (ParseException pe) {
        onParseError(curURL);
    } catch (ContentFetchException cfe) {
        onContentFetchError(curURL);
    } catch (NotAllowedContentException nace) {
        logger.debug("Skipping: {} as it contains binary content which you configured not to crawl", curURL.getURL());
    } catch (Exception e) {
        onUnhandledException(curURL, e);
    } finally {
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
}
Also used: NotAllowedContentException (edu.uci.ics.crawler4j.parser.NotAllowedContentException), ArrayList (java.util.ArrayList), WebURL (edu.uci.ics.crawler4j.url.WebURL), ParseException (edu.uci.ics.crawler4j.crawler.exceptions.ParseException), PageBiggerThanMaxSizeException (edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException), ContentFetchException (edu.uci.ics.crawler4j.crawler.exceptions.ContentFetchException), PageFetchResult (edu.uci.ics.crawler4j.fetcher.PageFetchResult), ParseData (edu.uci.ics.crawler4j.parser.ParseData)
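
Note that processPage is private and runs inside the crawler's worker loop, so a ParseException never reaches user code directly; it is reported through the overridable onParseError callback. A minimal sketch of the usual wiring follows (standard crawler4j controller setup; the storage folder and seed URL are placeholders):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Consulted by processPage before scheduling each outgoing link
        return url.getURL().startsWith("https://example.com/");
    }

    @Override
    public void visit(Page page) {
        // Called by processPage after a successful fetch and parse
        logger.info("Visited: {}", page.getWebURL().getURL());
    }

    @Override
    protected void onParseError(WebURL webUrl) {
        // processPage routes ParseException here
        logger.warn("Parse error: {}", webUrl.getURL());
    }

    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawler4j");  // placeholder path
        PageFetcher pageFetcher = new PageFetcher(config);
        RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
        CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
        controller.addSeed("https://example.com/");
        // Blocks until the crawl finishes; each worker thread runs processPage per URL
        controller.start(MyCrawler.class, 2);
    }
}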

Aggregations

ParseException (edu.uci.ics.crawler4j.crawler.exceptions.ParseException): 2
WebURL (edu.uci.ics.crawler4j.url.WebURL): 2
ContentFetchException (edu.uci.ics.crawler4j.crawler.exceptions.ContentFetchException): 1
PageBiggerThanMaxSizeException (edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException): 1
PageFetchResult (edu.uci.ics.crawler4j.fetcher.PageFetchResult): 1
NotAllowedContentException (edu.uci.ics.crawler4j.parser.NotAllowedContentException): 1
ParseData (edu.uci.ics.crawler4j.parser.ParseData): 1
ByteArrayInputStream (java.io.ByteArrayInputStream): 1
InputStream (java.io.InputStream): 1
UnsupportedEncodingException (java.io.UnsupportedEncodingException): 1
ArrayList (java.util.ArrayList): 1
HashSet (java.util.HashSet): 1
Set (java.util.Set): 1
LanguageIdentifier (org.apache.tika.language.LanguageIdentifier): 1
Metadata (org.apache.tika.metadata.Metadata): 1