
Example 1 with RobotsTxt

Use of org.codelibs.fess.crawler.entity.RobotsTxt in project fess-crawler by codelibs.

From the class HcHttpClient, the method processRobotsTxt:

protected void processRobotsTxt(final String url) {
    if (StringUtil.isBlank(url)) {
        throw new CrawlerSystemException("url is null or empty.");
    }
    if (robotsTxtHelper == null || !robotsTxtHelper.isEnabled()) {
        // robots.txt support is disabled
        return;
    }
    // look up the current crawler context
    final CrawlerContext crawlerContext = CrawlingParameterUtil.getCrawlerContext();
    if (crawlerContext == null) {
        // no crawler context is available; nothing to check
        return;
    }
    // extract the scheme and authority ("http://host:port") from the URL
    final int idx = url.indexOf('/', url.indexOf("://") + 3);
    String hostUrl;
    if (idx >= 0) {
        hostUrl = url.substring(0, idx);
    } else {
        hostUrl = url;
    }
    final String robotTxtUrl = hostUrl + "/robots.txt";
    // skip if this robots.txt URL has already been processed
    if (crawlerContext.getRobotsTxtUrlSet().contains(robotTxtUrl)) {
        if (logger.isDebugEnabled()) {
            logger.debug(robotTxtUrl + " is already visited.");
        }
        return;
    }
    if (logger.isInfoEnabled()) {
        logger.info("Checking URL: " + robotTxtUrl);
    }
    // mark this robots.txt URL as visited
    crawlerContext.getRobotsTxtUrlSet().add(robotTxtUrl);
    final HttpGet httpGet = new HttpGet(robotTxtUrl);
    // copy the configured request headers onto the request
    for (final Header header : requestHeaderList) {
        httpGet.addHeader(header);
    }
    HttpEntity httpEntity = null;
    try {
        // fetch robots.txt
        final HttpResponse response = executeHttpClient(httpGet);
        httpEntity = response.getEntity();
        final int httpStatusCode = response.getStatusLine().getStatusCode();
        if (httpStatusCode == 200) {
            // enforce the configured maximum content length
            final Header contentLengthHeader = response.getFirstHeader("Content-Length");
            if (contentLengthHeader != null) {
                final String value = contentLengthHeader.getValue();
                final long contentLength = Long.parseLong(value);
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength("text/plain");
                    if (contentLength > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + contentLength + " byte) is over " + maxLength + " byte. The url is " + robotTxtUrl);
                    }
                }
            }
            if (httpEntity != null) {
                final RobotsTxt robotsTxt = robotsTxtHelper.parse(httpEntity.getContent());
                if (robotsTxt != null) {
                    final String[] sitemaps = robotsTxt.getSitemaps();
                    if (sitemaps.length > 0) {
                        crawlerContext.addSitemaps(sitemaps);
                    }
                    final RobotsTxt.Directive directive = robotsTxt.getMatchedDirective(userAgent);
                    if (directive != null) {
                        if (useRobotsTxtDisallows) {
                            for (String urlPattern : directive.getDisallows()) {
                                if (StringUtil.isNotBlank(urlPattern)) {
                                    urlPattern = convertRobotsTxtPathPattern(urlPattern);
                                    crawlerContext.getUrlFilter().addExclude(hostUrl + urlPattern);
                                }
                            }
                        }
                        if (useRobotsTxtAllows) {
                            for (String urlPattern : directive.getAllows()) {
                                if (StringUtil.isNotBlank(urlPattern)) {
                                    urlPattern = convertRobotsTxtPathPattern(urlPattern);
                                    crawlerContext.getUrlFilter().addInclude(hostUrl + urlPattern);
                                }
                            }
                        }
                    }
                }
            }
        }
    } catch (final CrawlerSystemException e) {
        httpGet.abort();
        throw e;
    } catch (final Exception e) {
        httpGet.abort();
        throw new CrawlingAccessException("Could not process " + robotTxtUrl + ". ", e);
    } finally {
        EntityUtils.consumeQuietly(httpEntity);
    }
}
Also used: HttpEntity(org.apache.http.HttpEntity) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) HttpGet(org.apache.http.client.methods.HttpGet) HttpResponse(org.apache.http.HttpResponse) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ParseException(java.text.ParseException) NoRouteToHostException(java.net.NoRouteToHostException) SocketException(java.net.SocketException) ConnectException(java.net.ConnectException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException) CrawlerContext(org.codelibs.fess.crawler.CrawlerContext) Header(org.apache.http.Header) BasicHeader(org.apache.http.message.BasicHeader) RobotsTxt(org.codelibs.fess.crawler.entity.RobotsTxt)
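
The disallow/allow handling above relies on convertRobotsTxtPathPattern, which is not part of this excerpt. A minimal hedged sketch of what such a conversion could do, as an illustrative assumption rather than the project's actual implementation: the robots.txt wildcard * becomes the regex .*, a trailing $ keeps its end-of-URL meaning, and a pattern without $ matches any suffix.

// Hedged sketch of a robots.txt-pattern-to-regex conversion; the real helper may differ.
protected String convertRobotsTxtPathPattern(final String path) {
    // escape literal dots first, then expand the robots.txt wildcard
    String newPath = path.replace(".", "\\.").replace("*", ".*");
    if (!newPath.startsWith("/")) {
        // a pattern that does not start at the root may match anywhere in the path
        newPath = ".*" + newPath;
    }
    if (newPath.endsWith("$")) {
        // '$' in robots.txt anchors the end of the URL, as in a regex
        return newPath;
    }
    // without '$', the rule applies to every URL that begins with the pattern
    return newPath + ".*";
}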

Example 2 with RobotsTxt

Use of org.codelibs.fess.crawler.entity.RobotsTxt in project fess-crawler by codelibs.

From the class RobotsTxtHelper, the method parse:

public RobotsTxt parse(final InputStream stream, final String charsetName) {
    if (!enabled) {
        return null;
    }
    try {
        // the underlying stream is closed by the caller, not here
        @SuppressWarnings("resource") final BufferedReader reader = new BufferedReader(new InputStreamReader(new BOMInputStream(stream), charsetName));
        String line;
        final RobotsTxt robotsTxt = new RobotsTxt();
        final List<Directive> currentDirectiveList = new ArrayList<>();
        boolean isGroupRecordStarted = false;
        while ((line = reader.readLine()) != null) {
            line = stripComment(line).trim();
            if (StringUtil.isEmpty(line)) {
                continue;
            }
            String value;
            if ((value = getValue(USER_AGENT_RECORD, line)) != null) {
                if (isGroupRecordStarted) {
                    currentDirectiveList.clear();
                    isGroupRecordStarted = false;
                }
                final String userAgent = value.toLowerCase(Locale.ENGLISH);
                Directive currentDirective = robotsTxt.getDirective(userAgent);
                if (currentDirective == null) {
                    currentDirective = new Directive(userAgent);
                    robotsTxt.addDirective(currentDirective);
                    currentDirectiveList.add(currentDirective);
                }
            } else {
                isGroupRecordStarted = true;
                if ((value = getValue(DISALLOW_RECORD, line)) != null) {
                    if (!currentDirectiveList.isEmpty() && value.length() > 0) {
                        for (final Directive directive : currentDirectiveList) {
                            directive.addDisallow(value);
                        }
                    }
                } else if ((value = getValue(ALLOW_RECORD, line)) != null) {
                    if (!currentDirectiveList.isEmpty() && value.length() > 0) {
                        for (final Directive directive : currentDirectiveList) {
                            directive.addAllow(value);
                        }
                    }
                } else if ((value = getValue(CRAWL_DELAY_RECORD, line)) != null) {
                    if (!currentDirectiveList.isEmpty()) {
                        try {
                            final int crawlDelay = Integer.parseInt(value);
                            for (final Directive directive : currentDirectiveList) {
                                directive.setCrawlDelay(Math.max(0, crawlDelay));
                            }
                        } catch (final NumberFormatException e) {
                            // ignore malformed Crawl-delay values
                        }
                    }
                } else if ((value = getValue(SITEMAP_RECORD, line)) != null) {
                    if (value.length() > 0) {
                        robotsTxt.addSitemap(value);
                    }
                }
            }
        }
        return robotsTxt;
    } catch (final Exception e) {
        throw new RobotsTxtException("Failed to parse robots.txt.", e);
    }
}
Also used: InputStreamReader(java.io.InputStreamReader) ArrayList(java.util.ArrayList) RobotsTxtException(org.codelibs.fess.crawler.exception.RobotsTxtException) BOMInputStream(org.apache.commons.io.input.BOMInputStream) RobotsTxt(org.codelibs.fess.crawler.entity.RobotsTxt) BufferedReader(java.io.BufferedReader) Directive(org.codelibs.fess.crawler.entity.RobotsTxt.Directive)
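
For context, a minimal usage sketch of this parser. It assumes RobotsTxtHelper lives in org.codelibs.fess.crawler.helper, can be instantiated directly, and is enabled by default (otherwise parse returns null); those are assumptions about configuration, not facts shown in this excerpt. The single-argument parse(InputStream) overload appears in the test below.

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;

import org.codelibs.fess.crawler.entity.RobotsTxt;
import org.codelibs.fess.crawler.helper.RobotsTxtHelper; // assumed package

public class RobotsTxtParseExample {
    public static void main(final String[] args) {
        final RobotsTxtHelper helper = new RobotsTxtHelper(); // assumes enabled by default
        final String content = "User-agent: *\n"
                + "Disallow: /private/\n"
                + "Crawl-delay: 5\n"
                + "Sitemap: http://www.example.com/sitemap.xml\n";
        final RobotsTxt robotsTxt = helper.parse(
                new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8)));
        System.out.println(robotsTxt.allows("/private/page.html", "AnyBot")); // false
        System.out.println(robotsTxt.allows("/public/page.html", "AnyBot"));  // true
        System.out.println(robotsTxt.getCrawlDelay("AnyBot"));                // 5
        System.out.println(robotsTxt.getSitemaps()[0]);                       // the sitemap URL
    }
}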

Example 3 with RobotsTxt

Use of org.codelibs.fess.crawler.entity.RobotsTxt in project fess-crawler by codelibs.

From the class RobotsTxtHelperTest, the method testParse:

public void testParse() {
    RobotsTxt robotsTxt;
    final InputStream in = RobotsTxtHelperTest.class.getResourceAsStream("robots.txt");
    try {
        robotsTxt = robotsTxtHelper.parse(in);
    } finally {
        CloseableUtil.closeQuietly(in);
    }
    for (String userAgent : new String[] { "FessCrawler", "FessCrawler/1.0", "Mozilla FessCrawler" }) {
        assertTrue(robotsTxt.allows("/aaa", userAgent));
        assertTrue(robotsTxt.allows("/private/", userAgent));
        assertTrue(robotsTxt.allows("/private/index.html", userAgent));
        assertTrue(robotsTxt.allows("/help/", userAgent));
        assertTrue(robotsTxt.allows("/help.html", userAgent));
        assertTrue(robotsTxt.allows("/help/faq.html", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/index.html", userAgent));
        assertEquals(0, robotsTxt.getCrawlDelay(userAgent));
    }
    for (String userAgent : new String[] { "BruteBot", "FOO BruteBot/1.0" }) {
        assertFalse(robotsTxt.allows("/aaa", userAgent));
        assertFalse(robotsTxt.allows("/private/", userAgent));
        assertFalse(robotsTxt.allows("/private/index.html", userAgent));
        assertFalse(robotsTxt.allows("/help/", userAgent));
        assertFalse(robotsTxt.allows("/help.html", userAgent));
        assertFalse(robotsTxt.allows("/help/faq.html", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/index.html", userAgent));
        assertEquals(1314000, robotsTxt.getCrawlDelay(userAgent));
    }
    for (String userAgent : new String[] { "GOOGLEBOT", "GoogleBot", "googlebot" }) {
        assertTrue(robotsTxt.allows("/aaa", userAgent));
        assertTrue(robotsTxt.allows("/private/", userAgent));
        assertTrue(robotsTxt.allows("/private/index.html", userAgent));
        assertTrue(robotsTxt.allows("/help/", userAgent));
        assertTrue(robotsTxt.allows("/help.html", userAgent));
        assertTrue(robotsTxt.allows("/help/faq.html", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/index.html", userAgent));
        assertEquals(1, robotsTxt.getCrawlDelay(userAgent));
    }
    for (String userAgent : new String[] { "UnknownBot", "", " ", null }) {
        assertTrue(robotsTxt.allows("/aaa", userAgent));
        assertFalse(robotsTxt.allows("/private/", userAgent));
        assertFalse(robotsTxt.allows("/private/index.html", userAgent));
        assertFalse(robotsTxt.allows("/help/", userAgent));
        assertFalse(robotsTxt.allows("/help.html", userAgent));
        assertTrue(robotsTxt.allows("/help/faq.html", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/index.html", userAgent));
        assertEquals(3, robotsTxt.getCrawlDelay(userAgent));
    }
    assertFalse(robotsTxt.allows("/aaa", "Crawler"));
    assertTrue(robotsTxt.allows("/bbb", "Crawler"));
    assertTrue(robotsTxt.allows("/ccc", "Crawler"));
    assertTrue(robotsTxt.allows("/ddd", "Crawler"));
    assertTrue(robotsTxt.allows("/aaa", "Crawler/1.0"));
    assertFalse(robotsTxt.allows("/bbb", "Crawler/1.0"));
    assertTrue(robotsTxt.allows("/ccc", "Crawler/1.0"));
    assertTrue(robotsTxt.allows("/ddd", "Crawler/1.0"));
    assertTrue(robotsTxt.allows("/aaa", "Crawler/2.0"));
    assertTrue(robotsTxt.allows("/bbb", "Crawler/2.0"));
    assertFalse(robotsTxt.allows("/ccc", "Crawler/2.0"));
    assertTrue(robotsTxt.allows("/ddd", "Crawler/2.0"));
    assertTrue(robotsTxt.allows("/aaa", "Hoge Crawler"));
    assertTrue(robotsTxt.allows("/bbb", "Hoge Crawler"));
    assertTrue(robotsTxt.allows("/ccc", "Hoge Crawler"));
    assertFalse(robotsTxt.allows("/ddd", "Hoge Crawler"));
    String[] sitemaps = robotsTxt.getSitemaps();
    assertEquals(2, sitemaps.length);
    assertEquals("http://www.example.com/sitmap.xml", sitemaps[0]);
    assertEquals("http://www.example.net/sitmap.xml", sitemaps[1]);
}
Also used: RobotsTxt(org.codelibs.fess.crawler.entity.RobotsTxt) InputStream(java.io.InputStream)
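
The Crawler / Crawler/1.0 / Crawler/2.0 assertions above show that the most specific user-agent group wins. A short hedged sketch of inspecting the winning group directly, using only methods that already appear in the examples on this page:

// Print the rules of the directive matched for a given user-agent.
// getMatchedDirective, getDisallows, and getAllows all appear above.
final RobotsTxt.Directive directive = robotsTxt.getMatchedDirective("Crawler/1.0");
if (directive != null) {
    for (final String path : directive.getDisallows()) {
        System.out.println("Disallow: " + path);
    }
    for (final String path : directive.getAllows()) {
        System.out.println("Allow: " + path);
    }
}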

Aggregations

RobotsTxt (org.codelibs.fess.crawler.entity.RobotsTxt): 3 usages
BufferedReader (java.io.BufferedReader): 1 usage
IOException (java.io.IOException): 1 usage
InputStream (java.io.InputStream): 1 usage
InputStreamReader (java.io.InputStreamReader): 1 usage
ConnectException (java.net.ConnectException): 1 usage
MalformedURLException (java.net.MalformedURLException): 1 usage
NoRouteToHostException (java.net.NoRouteToHostException): 1 usage
SocketException (java.net.SocketException): 1 usage
UnknownHostException (java.net.UnknownHostException): 1 usage
ParseException (java.text.ParseException): 1 usage
ArrayList (java.util.ArrayList): 1 usage
BOMInputStream (org.apache.commons.io.input.BOMInputStream): 1 usage
Header (org.apache.http.Header): 1 usage
HttpEntity (org.apache.http.HttpEntity): 1 usage
HttpResponse (org.apache.http.HttpResponse): 1 usage
HttpGet (org.apache.http.client.methods.HttpGet): 1 usage
BasicHeader (org.apache.http.message.BasicHeader): 1 usage
CrawlerContext (org.codelibs.fess.crawler.CrawlerContext): 1 usage
Directive (org.codelibs.fess.crawler.entity.RobotsTxt.Directive): 1 usage