Search in sources:

Example 1 with Page

Use of edu.uci.ics.crawler4j.crawler.Page in the project crawler4j by yasserg.

From the class PageFetcherHtmlTest, method testCustomPageFetcher:

@Test
public void testCustomPageFetcher() throws InterruptedException, PageBiggerThanMaxSizeException, IOException {
    // HTML resource: both HEAD and GET respond with text/html.
    WireMock.stubFor(WireMock.head(WireMock.urlEqualTo("/some/index.html")).willReturn(WireMock.aResponse().withStatus(200).withHeader("Content-Type", "text/html")));
    WireMock.stubFor(WireMock.get(WireMock.urlEqualTo("/some/index.html")).willReturn(WireMock.aResponse().withStatus(200).withHeader("Content-Type", "text/html").withHeader("Content-Length", "47").withBody("<html><body><h1>this is " + "html</h1></body></html>")));
    // PDF resource: the HTML-only fetcher must never issue the GET for it.
    WireMock.stubFor(WireMock.get(WireMock.urlEqualTo("/some/invoice.pdf")).willReturn(WireMock.aResponse().withStatus(200).withHeader("Content-Type", "application/pdf").withBody(new byte[] { 1, 2, 3, 4 })));
    WireMock.stubFor(WireMock.head(WireMock.urlEqualTo("/some/invoice.pdf")).willReturn(WireMock.aResponse().withStatus(200).withHeader("Content-Type", "application/pdf")));
    CrawlConfig cfg = new CrawlConfig();
    WebURL url = new WebURL();
    // Use the dynamically assigned WireMock port instead of hard-coding 8080;
    // a fixed port makes the test fail whenever 8080 is taken or WireMock
    // binds elsewhere (matches the later revision of this test).
    url.setURL("http://localhost:" + wireMockRule.port() + "/some/index.html");
    PageFetcher pf = new PageFetcherHtmlOnly(cfg);
    pf.fetchPage(url).fetchContent(new Page(url), 47);
    // HTML: one HEAD to sniff the content type, then one GET for the body.
    WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/some/index.html")));
    WireMock.verify(1, WireMock.getRequestedFor(WireMock.urlEqualTo("/some/index.html")));
    url.setURL("http://localhost:" + wireMockRule.port() + "/some/invoice.pdf");
    pf = new PageFetcherHtmlOnly(cfg);
    pf.fetchPage(url).fetchContent(new Page(url), 4);
    // Non-HTML: HEAD only — the body must never be downloaded.
    WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/some/invoice.pdf")));
    WireMock.verify(0, WireMock.getRequestedFor(WireMock.urlEqualTo("/some/invoice.pdf")));
}
Also used : PageFetcher(edu.uci.ics.crawler4j.fetcher.PageFetcher) CrawlConfig(edu.uci.ics.crawler4j.crawler.CrawlConfig) WebURL(edu.uci.ics.crawler4j.url.WebURL) Page(edu.uci.ics.crawler4j.crawler.Page) Test(org.junit.Test)

Example 2 with Page

Use of edu.uci.ics.crawler4j.crawler.Page in the project crawler4j by yasserg.

From the class Downloader, method download:

/**
 * Fetches the given URL and, on an HTTP 200 response, downloads and parses
 * its content. Any failure (non-200 status or an exception during fetch,
 * download, or parse) yields {@code null}.
 */
private Page download(String url) {
    WebURL target = new WebURL();
    target.setURL(url);
    PageFetchResult result = null;
    try {
        result = pageFetcher.fetchPage(target);
        if (result.getStatusCode() != HttpStatus.SC_OK) {
            // Any non-200 response is treated the same as a fetch failure.
            return null;
        }
        Page fetched = new Page(target);
        result.fetchContent(fetched, config.getMaxDownloadSize());
        parser.parse(fetched, target.getURL());
        return fetched;
    } catch (Exception e) {
        // Best-effort download: log and report failure instead of propagating.
        logger.error("Error occurred while fetching url: " + target.getURL(), e);
        return null;
    } finally {
        // Release any response body that was never fully consumed.
        if (result != null) {
            result.discardContentIfNotConsumed();
        }
    }
}
Also used : PageFetchResult(edu.uci.ics.crawler4j.fetcher.PageFetchResult) WebURL(edu.uci.ics.crawler4j.url.WebURL) Page(edu.uci.ics.crawler4j.crawler.Page)

Example 3 with Page

Use of edu.uci.ics.crawler4j.crawler.Page in the project crawler4j by yasserg.

From the class RobotstxtServer, method fetchDirectives:

/**
 * Fetches, parses, and caches the robots.txt directives for the host of the
 * given URL, following up to three HTTP redirects. When robots.txt cannot be
 * fetched or is not usable, an empty {@link HostDirectives} is cached anyway
 * so the fetch time is still tracked for that host.
 *
 * @param url any URL on the target host; only protocol, host, and port are
 *            used to build the robots.txt location
 * @return the directives for the host (never {@code null})
 * @throws IOException declared for callers; IO failures inside this method
 *         are caught and logged rather than propagated
 * @throws InterruptedException re-thrown only when the crawl config is set
 *         to halt on error
 */
private HostDirectives fetchDirectives(URL url) throws IOException, InterruptedException {
    WebURL robotsTxtUrl = new WebURL();
    String host = getHost(url);
    // Omit the port component when it is the protocol default or unset (-1).
    String port = ((url.getPort() == url.getDefaultPort()) || (url.getPort() == -1)) ? "" : (":" + url.getPort());
    String proto = url.getProtocol();
    robotsTxtUrl.setURL(proto + "://" + host + port + "/robots.txt");
    HostDirectives directives = null;
    PageFetchResult fetchResult = null;
    try {
        for (int redir = 0; redir < 3; ++redir) {
            fetchResult = pageFetcher.fetchPage(robotsTxtUrl);
            int status = fetchResult.getStatusCode();
            // Follow redirects up to 3 levels
            if ((status == HttpStatus.SC_MULTIPLE_CHOICES || status == HttpStatus.SC_MOVED_PERMANENTLY || status == HttpStatus.SC_MOVED_TEMPORARILY || status == HttpStatus.SC_SEE_OTHER || status == HttpStatus.SC_TEMPORARY_REDIRECT || status == 308) && // SC_PERMANENT_REDIRECT RFC7538
            fetchResult.getMovedToUrl() != null) {
                robotsTxtUrl.setURL(fetchResult.getMovedToUrl());
                // Release the redirect response before fetching the new location.
                fetchResult.discardContentIfNotConsumed();
            } else {
                // Done on all other occasions
                break;
            }
        }
        if (fetchResult.getStatusCode() == HttpStatus.SC_OK) {
            Page page = new Page(robotsTxtUrl);
            // Most recent answer on robots.txt max size is
            // https://developers.google.com/search/reference/robots_txt
            fetchResult.fetchContent(page, 500 * 1024);
            if (Util.hasPlainTextContent(page.getContentType())) {
                String content;
                if (page.getContentCharset() == null) {
                    // No charset advertised; fall back to the platform default.
                    content = new String(page.getContentData());
                } else {
                    content = new String(page.getContentData(), page.getContentCharset());
                }
                directives = RobotstxtParser.parse(content, config);
            } else if (page.getContentType().contains("html")) {
                // TODO This one should be upgraded to remove all
                // html tags
                String content = new String(page.getContentData());
                directives = RobotstxtParser.parse(content, config);
            } else {
                logger.warn("Can't read this robots.txt: {}  as it is not written in plain text, " + "contentType: {}", robotsTxtUrl.getURL(), page.getContentType());
            }
        } else {
            logger.debug("Can't read this robots.txt: {}  as it's status code is {}", robotsTxtUrl.getURL(), fetchResult.getStatusCode());
        }
    } catch (SocketException | UnknownHostException | SocketTimeoutException | NoHttpResponseException se) {
        // No logging here, as it just means that robots.txt doesn't exist on this server
        // which is perfectly ok
        logger.trace("robots.txt probably does not exist.", se);
    } catch (PageBiggerThanMaxSizeException pbtms) {
        logger.error("Error occurred while fetching (robots) url: {}, {}", robotsTxtUrl.getURL(), pbtms.getMessage());
    } catch (IOException e) {
        logger.error("Error occurred while fetching (robots) url: " + robotsTxtUrl.getURL(), e);
    } catch (InterruptedException | RuntimeException e) {
        // Respect the halt-on-error setting; otherwise log and fall through to
        // caching empty directives below.
        if (crawlConfig.isHaltOnError()) {
            throw e;
        } else {
            logger.error("Error occurred while fetching (robots) url: " + robotsTxtUrl.getURL(), e);
        }
    } finally {
        // Always release any unconsumed response body.
        if (fetchResult != null) {
            fetchResult.discardContentIfNotConsumed();
        }
    }
    if (directives == null) {
        // We still need to have this object to keep track of the time we fetched it
        directives = new HostDirectives(config);
    }
    synchronized (host2directivesCache) {
        // Bounded cache: when full, evict the host whose directives were
        // accessed least recently before inserting the new entry.
        if (host2directivesCache.size() == config.getCacheSize()) {
            String minHost = null;
            long minAccessTime = Long.MAX_VALUE;
            for (Map.Entry<String, HostDirectives> entry : host2directivesCache.entrySet()) {
                long entryAccessTime = entry.getValue().getLastAccessTime();
                if (entryAccessTime < minAccessTime) {
                    minAccessTime = entryAccessTime;
                    minHost = entry.getKey();
                }
            }
            host2directivesCache.remove(minHost);
        }
        host2directivesCache.put(host, directives);
    }
    return directives;
}
Also used : NoHttpResponseException(org.apache.http.NoHttpResponseException) SocketException(java.net.SocketException) UnknownHostException(java.net.UnknownHostException) WebURL(edu.uci.ics.crawler4j.url.WebURL) Page(edu.uci.ics.crawler4j.crawler.Page) IOException(java.io.IOException) SocketTimeoutException(java.net.SocketTimeoutException) PageBiggerThanMaxSizeException(edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException) PageFetchResult(edu.uci.ics.crawler4j.fetcher.PageFetchResult) HashMap(java.util.HashMap) Map(java.util.Map)

Example 4 with Page

Use of edu.uci.ics.crawler4j.crawler.Page in the project crawler4j by yasserg.

From the class Downloader, method processUrl:

/**
 * Downloads the given URL and logs a summary of the result: title and
 * text/HTML lengths for HTML pages, or a warning when the page could not
 * be fetched or parsed.
 */
public void processUrl(String url) {
    logger.debug("Processing: {}", url);
    Page page = download(url);
    if (page == null) {
        // download() already logged the underlying cause.
        logger.warn("Couldn't fetch the content of the page.");
        logger.debug("==============");
        return;
    }
    ParseData parsed = page.getParseData();
    if (parsed == null) {
        logger.warn("Couldn't parse the content of the page.");
        logger.debug("==============");
        return;
    }
    // Only HTML pages carry a title and text/html bodies worth summarizing.
    if (parsed instanceof HtmlParseData) {
        HtmlParseData html = (HtmlParseData) parsed;
        logger.debug("Title: {}", html.getTitle());
        logger.debug("Text length: {}", html.getText().length());
        logger.debug("Html length: {}", html.getHtml().length());
    }
    logger.debug("==============");
}
Also used : ParseData(edu.uci.ics.crawler4j.parser.ParseData) HtmlParseData(edu.uci.ics.crawler4j.parser.HtmlParseData) Page(edu.uci.ics.crawler4j.crawler.Page) HtmlParseData(edu.uci.ics.crawler4j.parser.HtmlParseData)

Example 5 with Page

Use of edu.uci.ics.crawler4j.crawler.Page in the project crawler4j by yasserg.

From the class PageFetcherHtmlTest, method testCustomPageFetcher:

@Test
public void testCustomPageFetcher() throws Exception {
    // PDF resource: HEAD reports application/pdf; GET would serve raw bytes.
    WireMock.stubFor(WireMock.head(WireMock.urlEqualTo("/some/invoice.pdf")).willReturn(WireMock.aResponse().withStatus(200).withHeader("Content-Type", "application/pdf")));
    WireMock.stubFor(WireMock.get(WireMock.urlEqualTo("/some/invoice.pdf")).willReturn(WireMock.aResponse().withStatus(200).withHeader("Content-Type", "application/pdf").withBody(new byte[] { 1, 2, 3, 4 })));
    // HTML resource: both HEAD and GET answer with text/html.
    WireMock.stubFor(WireMock.head(WireMock.urlEqualTo("/some/index.html")).willReturn(WireMock.aResponse().withStatus(200).withHeader("Content-Type", "text/html")));
    WireMock.stubFor(WireMock.get(WireMock.urlEqualTo("/some/index.html")).willReturn(WireMock.aResponse().withStatus(200).withHeader("Content-Type", "text/html").withHeader("Content-Length", "47").withBody("<html><body><h1>this is " + "html</h1></body></html>")));
    CrawlConfig config = new CrawlConfig();
    String base = "http://localhost:" + wireMockRule.port();
    WebURL target = new WebURL();
    target.setURL(base + "/some/index.html");
    PageFetcher fetcher = new PageFetcherHtmlOnly(config);
    fetcher.fetchPage(target).fetchContent(new Page(target), 47);
    // HTML: one HEAD to sniff the content type, then one GET for the body.
    WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/some/index.html")));
    WireMock.verify(1, WireMock.getRequestedFor(WireMock.urlEqualTo("/some/index.html")));
    target.setURL(base + "/some/invoice.pdf");
    fetcher = new PageFetcherHtmlOnly(config);
    fetcher.fetchPage(target).fetchContent(new Page(target), 4);
    // Non-HTML: HEAD only — the body must never be downloaded.
    WireMock.verify(1, WireMock.headRequestedFor(WireMock.urlEqualTo("/some/invoice.pdf")));
    WireMock.verify(0, WireMock.getRequestedFor(WireMock.urlEqualTo("/some/invoice.pdf")));
}
Also used : PageFetcher(edu.uci.ics.crawler4j.fetcher.PageFetcher) CrawlConfig(edu.uci.ics.crawler4j.crawler.CrawlConfig) WebURL(edu.uci.ics.crawler4j.url.WebURL) Page(edu.uci.ics.crawler4j.crawler.Page) Test(org.junit.Test)

Aggregations

Page (edu.uci.ics.crawler4j.crawler.Page)5 WebURL (edu.uci.ics.crawler4j.url.WebURL)4 CrawlConfig (edu.uci.ics.crawler4j.crawler.CrawlConfig)2 PageFetchResult (edu.uci.ics.crawler4j.fetcher.PageFetchResult)2 PageFetcher (edu.uci.ics.crawler4j.fetcher.PageFetcher)2 Test (org.junit.Test)2 PageBiggerThanMaxSizeException (edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException)1 HtmlParseData (edu.uci.ics.crawler4j.parser.HtmlParseData)1 ParseData (edu.uci.ics.crawler4j.parser.ParseData)1 IOException (java.io.IOException)1 SocketException (java.net.SocketException)1 SocketTimeoutException (java.net.SocketTimeoutException)1 UnknownHostException (java.net.UnknownHostException)1 HashMap (java.util.HashMap)1 Map (java.util.Map)1 NoHttpResponseException (org.apache.http.NoHttpResponseException)1