Search in sources :

Example 1 with RobotsTxtException

use of org.codelibs.fess.crawler.exception.RobotsTxtException in project fess-crawler by codelibs.

the class RobotsTxtHelper method parse.

/**
 * Parses a robots.txt document read from the given stream.
 *
 * <p>Lines are processed one at a time after comment stripping and trimming;
 * empty lines are skipped. Consecutive {@code User-agent} records form a group,
 * and every following Allow / Disallow / Crawl-delay rule is applied to all
 * user-agents of that group. A {@code User-agent} record seen after at least
 * one rule starts a new group. Sitemap records are stored on the returned
 * object independently of any group, and neither they nor unrecognised
 * records terminate the group being defined.
 *
 * <p>When a user-agent appears in more than one group, the rules of all of
 * its groups are accumulated on the same {@link Directive}.
 *
 * <p>The stream is not closed by this method.
 *
 * @param stream      the robots.txt content
 * @param charsetName name of the charset used to decode {@code stream}
 * @return the parsed robots.txt, or {@code null} when this helper is disabled
 * @throws RobotsTxtException if reading or parsing fails for any reason
 */
public RobotsTxt parse(final InputStream stream, final String charsetName) {
    if (!enabled) {
        return null;
    }
    try {
        // The reader is intentionally left open: closing it would close the caller's stream.
        @SuppressWarnings("resource")
        final BufferedReader reader = new BufferedReader(new InputStreamReader(new BOMInputStream(stream), charsetName));
        final RobotsTxt robotsTxt = new RobotsTxt();
        // Directives of the user-agents named by the group currently being read.
        final List<Directive> currentDirectiveList = new ArrayList<>();
        // True once the current group has received at least one rule line.
        boolean isGroupRecordStarted = false;
        String line;
        while ((line = reader.readLine()) != null) {
            line = stripComment(line).trim();
            if (StringUtil.isEmpty(line)) {
                continue;
            }
            String value;
            if ((value = getValue(USER_AGENT_RECORD, line)) != null) {
                if (isGroupRecordStarted) {
                    // A user-agent after rule lines opens a new group.
                    currentDirectiveList.clear();
                    isGroupRecordStarted = false;
                }
                final String userAgent = value.toLowerCase(Locale.ENGLISH);
                Directive currentDirective = robotsTxt.getDirective(userAgent);
                if (currentDirective == null) {
                    currentDirective = new Directive(userAgent);
                    robotsTxt.addDirective(currentDirective);
                }
                // Track the directive even when it already existed, so that rules from a
                // repeated group for the same user-agent are merged instead of dropped.
                if (!currentDirectiveList.contains(currentDirective)) {
                    currentDirectiveList.add(currentDirective);
                }
            } else if ((value = getValue(DISALLOW_RECORD, line)) != null) {
                isGroupRecordStarted = true;
                if (value.length() > 0) {
                    for (final Directive directive : currentDirectiveList) {
                        directive.addDisallow(value);
                    }
                }
            } else if ((value = getValue(ALLOW_RECORD, line)) != null) {
                isGroupRecordStarted = true;
                if (value.length() > 0) {
                    for (final Directive directive : currentDirectiveList) {
                        directive.addAllow(value);
                    }
                }
            } else if ((value = getValue(CRAWL_DELAY_RECORD, line)) != null) {
                isGroupRecordStarted = true;
                if (!currentDirectiveList.isEmpty()) {
                    try {
                        // Negative delays are clamped to zero.
                        final int crawlDelay = Math.max(0, Integer.parseInt(value));
                        for (final Directive directive : currentDirectiveList) {
                            directive.setCrawlDelay(crawlDelay);
                        }
                    } catch (final NumberFormatException ignored) {
                        // A non-integer Crawl-delay is skipped rather than failing the whole file.
                    }
                }
            } else if ((value = getValue(SITEMAP_RECORD, line)) != null) {
                // Sitemap is not a group rule, so it must not close the current group.
                if (value.length() > 0) {
                    robotsTxt.addSitemap(value);
                }
            }
            // Any other record is ignored and likewise leaves the current group open.
        }
        return robotsTxt;
    } catch (final Exception e) {
        throw new RobotsTxtException("Failed to parse robots.txt.", e);
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) ArrayList(java.util.ArrayList) RobotsTxtException(org.codelibs.fess.crawler.exception.RobotsTxtException) BOMInputStream(org.apache.commons.io.input.BOMInputStream) RobotsTxt(org.codelibs.fess.crawler.entity.RobotsTxt) RobotsTxtException(org.codelibs.fess.crawler.exception.RobotsTxtException) BufferedReader(java.io.BufferedReader) Directive(org.codelibs.fess.crawler.entity.RobotsTxt.Directive)

Aggregations

BufferedReader (java.io.BufferedReader)1 InputStreamReader (java.io.InputStreamReader)1 ArrayList (java.util.ArrayList)1 BOMInputStream (org.apache.commons.io.input.BOMInputStream)1 RobotsTxt (org.codelibs.fess.crawler.entity.RobotsTxt)1 Directive (org.codelibs.fess.crawler.entity.RobotsTxt.Directive)1 RobotsTxtException (org.codelibs.fess.crawler.exception.RobotsTxtException)1