
Example 1 with BaseRobotRules

use of crawlercommons.robots.BaseRobotRules in project nutch by apache.

From the class HttpRobotRulesParser, method getRobotRulesSet.

/**
 * Get the rules from robots.txt which apply to the given {@code url}.
 * Robot rules are cached for a unique combination of host, protocol, and
 * port. If no rules are found in the cache, an HTTP request is sent to fetch
 * {@code protocol://host:port/robots.txt}. The robots.txt is then parsed and
 * the rules are cached to avoid re-fetching and re-parsing it.
 *
 * @param http
 *          The {@link Protocol} object
 * @param url
 *          URL
 * @param robotsTxtContent
 *          container to store responses when fetching the robots.txt file for
 *          debugging or archival purposes. Instead of a robots.txt file, it
 *          may include redirects or an error page (404, etc.). Response
 *          {@link Content} is appended to the passed list. If null is passed
 *          nothing is stored.
 *
 * @return robotRules A {@link BaseRobotRules} object for the rules
 */
@Override
public BaseRobotRules getRobotRulesSet(Protocol http, URL url, List<Content> robotsTxtContent) {
    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
        LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
    }
    String cacheKey = getCacheKey(url);
    BaseRobotRules robotRules = CACHE.get(cacheKey);
    if (robotRules != null) {
        // cached rule
        return robotRules;
    } else if (LOG.isTraceEnabled()) {
        LOG.trace("cache miss " + url);
    }
    boolean cacheRule = true;
    URL redir = null;
    if (isWhiteListed(url)) {
        // check in advance whether a host is whitelisted
        // (we do not need to fetch robots.txt)
        robotRules = EMPTY_RULES;
        LOG.info("Whitelisted host found for: {}", url);
        LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", url.getHost());
    } else {
        try {
            URL robotsUrl = new URL(url, "/robots.txt");
            Response response = ((HttpBase) http).getResponse(robotsUrl, new CrawlDatum(), true);
            if (robotsTxtContent != null) {
                addRobotsContent(robotsTxtContent, robotsUrl, response);
            }
            // try one level of redirection ?
            if (response.getCode() == 301 || response.getCode() == 302) {
                String redirection = response.getHeader("Location");
                if (redirection == null) {
                    // some versions of MS IIS are known to mangle this header
                    redirection = response.getHeader("location");
                }
                if (redirection != null) {
                    if (!redirection.startsWith("http")) {
                        // RFC says it should be absolute, but apparently it isn't
                        redir = new URL(url, redirection);
                    } else {
                        redir = new URL(redirection);
                    }
                    response = ((HttpBase) http).getResponse(redir, new CrawlDatum(), true);
                    if (robotsTxtContent != null) {
                        addRobotsContent(robotsTxtContent, redir, response);
                    }
                }
            }
            if (response.getCode() == 200)
                // found rules: parse them
                robotRules = parseRules(url.toString(), response.getContent(), response.getHeader("Content-Type"), agentNames);
            else if ((response.getCode() == 403) && (!allowForbidden))
                // use forbid all
                robotRules = FORBID_ALL_RULES;
            else if (response.getCode() >= 500) {
                // try again later to fetch robots.txt
                cacheRule = false;
                robotRules = EMPTY_RULES;
            } else
                // use default rules
                robotRules = EMPTY_RULES;
        } catch (Throwable t) {
            if (LOG.isInfoEnabled()) {
                LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
            }
            // try again later to fetch robots.txt
            cacheRule = false;
            robotRules = EMPTY_RULES;
        }
    }
    if (cacheRule) {
        // cache rules for host
        CACHE.put(cacheKey, robotRules);
        if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
            // cache also for the redirected host
            CACHE.put(getCacheKey(redir), robotRules);
        }
    }
    return robotRules;
}
Also used: Response (org.apache.nutch.net.protocols.Response), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), BaseRobotRules (crawlercommons.robots.BaseRobotRules), URL (java.net.URL)
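
The javadoc above states that rules are cached per combination of protocol, host, and port. As a rough illustration of what such a cache key can look like, here is a minimal, hypothetical helper (not the actual getCacheKey from Nutch, whose exact key format may differ):

import java.net.URL;

public class RobotsCacheKeyExample {

    // Hypothetical helper: derives a cache key from protocol, host and port,
    // mirroring the "unique combination" described in the javadoc above.
    static String cacheKey(URL url) {
        String protocol = url.getProtocol().toLowerCase(); // e.g. "http"
        String host = url.getHost().toLowerCase();         // e.g. "example.com"
        int port = url.getPort();
        if (port == -1) {
            port = url.getDefaultPort();                   // fall back to 80, 443, 21, ...
        }
        return protocol + ":" + host + ":" + port;
    }

    public static void main(String[] args) throws Exception {
        // Both URLs map to the same key, so the second lookup would hit the cache.
        System.out.println(cacheKey(new URL("http://example.com/a/page.html"))); // http:example.com:80
        System.out.println(cacheKey(new URL("http://example.com:80/other")));    // http:example.com:80
    }
}

With a key like this, all URLs of one host share a single cache entry, so robots.txt is fetched and parsed at most once per host (plus once per redirect target, as the code above shows).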

Example 2 with BaseRobotRules

use of crawlercommons.robots.BaseRobotRules in project nutch by apache.

From the class FtpRobotRulesParser, method getRobotRulesSet.

/**
 * For hosts whose robots rules are not yet cached, this method sends an FTP
 * request to the host of the given {@link URL}, fetches the robots file,
 * parses the rules, and caches the rules object to avoid re-work in the
 * future.
 *
 * @param ftp
 *          The {@link Protocol} object
 * @param url
 *          URL
 * @param robotsTxtContent
 *          container to store responses when fetching the robots.txt file for
 *          debugging or archival purposes. Instead of a robots.txt file, it
 *          may include redirects or an error page (404, etc.). Response
 *          {@link Content} is appended to the passed list. If null is passed
 *          nothing is stored.
 *
 * @return robotRules A {@link BaseRobotRules} object for the rules
 */
@Override
public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url, List<Content> robotsTxtContent) {
    // normalize protocol and host to lower case
    String protocol = url.getProtocol().toLowerCase();
    String host = url.getHost().toLowerCase();
    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
        LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
    }
    BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
    if (robotRules != null) {
        // cached rule
        return robotRules;
    } else if (LOG.isTraceEnabled()) {
        LOG.trace("cache miss " + url);
    }
    boolean cacheRule = true;
    if (isWhiteListed(url)) {
        // check in advance whether a host is whitelisted
        // (we do not need to fetch robots.txt)
        robotRules = EMPTY_RULES;
        LOG.info("Whitelisted host found for: {}", url);
        LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
    } else {
        try {
            Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
            ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl, new CrawlDatum());
            ProtocolStatus status = output.getStatus();
            if (robotsTxtContent != null) {
                robotsTxtContent.add(output.getContent());
            }
            if (status.getCode() == ProtocolStatus.SUCCESS) {
                robotRules = parseRules(url.toString(), output.getContent().getContent(), CONTENT_TYPE, agentNames);
            } else {
                // use default rules
                robotRules = EMPTY_RULES;
            }
        } catch (Throwable t) {
            if (LOG.isInfoEnabled()) {
                LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
            }
            // try again later to fetch robots.txt
            cacheRule = false;
            robotRules = EMPTY_RULES;
        }
    }
    if (cacheRule)
        // cache rules for host
        CACHE.put(protocol + ":" + host, robotRules);
    return robotRules;
}
Also used: ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), BaseRobotRules (crawlercommons.robots.BaseRobotRules), URL (java.net.URL), ProtocolStatus (org.apache.nutch.protocol.ProtocolStatus)
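
Both parsers above delegate the actual parsing to parseRules, which in Nutch wraps the crawler-commons robots parser. A minimal standalone sketch of that step, assuming the four-argument SimpleRobotRulesParser.parseContent(String, byte[], String, String) signature and a made-up agent name:

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import java.nio.charset.StandardCharsets;

public class ParseRulesExample {

    public static void main(String[] args) {
        byte[] robotsTxt = ("User-agent: mycrawler\n"
                + "Disallow: /private/\n"
                + "Crawl-delay: 5\n").getBytes(StandardCharsets.UTF_8);

        // crawler-commons parser; Nutch's parseRules is a thin wrapper around
        // a parser like this one (the exact wiring may differ between versions).
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "ftp://example.com/robots.txt", // URL the rules were fetched from
                robotsTxt,                      // raw robots.txt bytes
                "text/plain",                   // content type (CONTENT_TYPE above)
                "mycrawler");                   // agent name(s) to match

        System.out.println(rules.isAllowed("ftp://example.com/private/file.txt")); // false
        System.out.println(rules.isAllowed("ftp://example.com/public/file.txt"));  // true
        System.out.println(rules.getCrawlDelay()); // 5000 (crawler-commons reports milliseconds)
    }
}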

Example 3 with BaseRobotRules

use of crawlercommons.robots.BaseRobotRules in project nutch by apache.

From the class FetcherThread, method run.

@SuppressWarnings("fallthrough")
public void run() {
    // count threads
    activeThreads.incrementAndGet();
    Text url = new Text();
    FetchItem fit = null;
    try {
        // checking for the server to be running and fetcher.parse to be true
        if (parsing && NutchServer.getInstance().isRunning())
            reportToNutchServer = true;
        while (true) {
            // creating FetchNode for storing in FetchNodeDb
            if (reportToNutchServer)
                this.fetchNode = new FetchNode();
            else
                this.fetchNode = null;
            // check whether must be stopped
            if (isHalted()) {
                LOG.debug(getName() + " set to halted");
                fit = null;
                return;
            }
            fit = ((FetchItemQueues) fetchQueues).getFetchItem();
            if (fit != null) {
                URL u = fit.u;
                String temp_url = u.getProtocol() + "://" + u.getHost() + ":" + u.getPort() + u.getFile();
                url = new Text(temp_url);
            }
            if (fit == null) {
                if (feeder.isAlive() || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) {
                    LOG.debug(getName() + " spin-waiting ...");
                    // spin-wait.
                    ((AtomicInteger) spinWaiting).incrementAndGet();
                    try {
                        Thread.sleep(500);
                    } catch (Exception e) {
                    }
                    ((AtomicInteger) spinWaiting).decrementAndGet();
                    continue;
                } else {
                    // all done, finish this thread
                    LOG.info(getName() + " " + Thread.currentThread().getId() + " has no more work available");
                    return;
                }
            }
            lastRequestStart.set(System.currentTimeMillis());
            Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
            if (reprUrlWritable == null) {
                setReprUrl(url.toString());
            } else {
                setReprUrl(reprUrlWritable.toString());
            }
            try {
                // fetch the page
                redirecting = false;
                redirectCount = 0;
                // Publisher event
                if (activatePublisher) {
                    FetcherThreadEvent startEvent = new FetcherThreadEvent(PublishEventType.START, fit.getUrl().toString());
                    publisher.publish(startEvent, conf);
                }
                do {
                    if (LOG.isInfoEnabled()) {
                        LOG.info(getName() + " " + Thread.currentThread().getId() + " fetching " + url + " (queue crawl delay=" + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay + "ms)");
                    }
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("redirectCount=" + redirectCount);
                    }
                    redirecting = false;
                    Protocol protocol = this.protocolFactory.getProtocol(url.toString());
                    BaseRobotRules rules = protocol.getRobotRules(url, fit.datum, robotsTxtContent);
                    if (robotsTxtContent != null) {
                        outputRobotsTxt(robotsTxtContent);
                        robotsTxtContent.clear();
                    }
                    if (!rules.isAllowed(fit.u.toString())) {
                        // unblock
                        ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Denied by robots.txt: " + url);
                        }
                        output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                        context.getCounter("FetcherStatus", "robots_denied").increment(1);
                        continue;
                    }
                    if (rules.getCrawlDelay() > 0) {
                        if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                            // unblock
                            ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
                            LOG.debug("Crawl-Delay for " + url + " too long (" + rules.getCrawlDelay() + "), skipping");
                            output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                            context.getCounter("FetcherStatus", "robots_denied_maxcrawldelay").increment(1);
                            continue;
                        } else {
                            FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
                            fiq.crawlDelay = rules.getCrawlDelay();
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + url);
                            }
                        }
                    }
                    ProtocolOutput output = protocol.getProtocolOutput(url, fit.datum);
                    ProtocolStatus status = output.getStatus();
                    Content content = output.getContent();
                    ParseStatus pstatus = null;
                    // unblock queue
                    ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                    String urlString = url.toString();
                    // used for FetchNode
                    if (fetchNode != null) {
                        fetchNode.setStatus(status.getCode());
                        fetchNode.setFetchTime(System.currentTimeMillis());
                        fetchNode.setUrl(url);
                    }
                    // Publish fetch finish event
                    if (activatePublisher) {
                        FetcherThreadEvent endEvent = new FetcherThreadEvent(PublishEventType.END, fit.getUrl().toString());
                        endEvent.addEventData("status", status.getName());
                        publisher.publish(endEvent, conf);
                    }
                    context.getCounter("FetcherStatus", status.getName()).increment(1);
                    switch(status.getCode()) {
                        case ProtocolStatus.WOULDBLOCK:
                            // retry ?
                            ((FetchItemQueues) fetchQueues).addFetchItem(fit);
                            break;
                        // got a page
                        case ProtocolStatus.SUCCESS:
                            pstatus = output(url, fit.datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
                            updateStatus(content.getContent().length);
                            if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                                String newUrl = pstatus.getMessage();
                                int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                                Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR);
                                if (redirUrl != null) {
                                    fit = queueRedirect(redirUrl, fit);
                                }
                            }
                            break;
                        // redirect
                        case ProtocolStatus.MOVED:
                        case ProtocolStatus.TEMP_MOVED:
                            int code;
                            boolean temp;
                            if (status.getCode() == ProtocolStatus.MOVED) {
                                code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                                temp = false;
                            } else {
                                code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                                temp = true;
                            }
                            output(url, fit.datum, content, status, code);
                            String newUrl = status.getMessage();
                            Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR);
                            if (redirUrl != null) {
                                fit = queueRedirect(redirUrl, fit);
                            } else {
                                // stop redirecting
                                redirecting = false;
                            }
                            break;
                        case ProtocolStatus.EXCEPTION:
                            logError(url, status.getMessage());
                            int killedURLs = ((FetchItemQueues) fetchQueues).checkExceptionThreshold(fit.getQueueID());
                            if (killedURLs != 0)
                                context.getCounter("FetcherStatus", "AboveExceptionThresholdInQueue").increment(killedURLs);
                        // retry
                        case ProtocolStatus.RETRY:
                        case ProtocolStatus.BLOCKED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                            break;
                        // gone
                        case ProtocolStatus.GONE:
                        case ProtocolStatus.NOTFOUND:
                        case ProtocolStatus.ACCESS_DENIED:
                        case ProtocolStatus.ROBOTS_DENIED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
                            break;
                        case ProtocolStatus.NOTMODIFIED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
                            break;
                        default:
                            if (LOG.isWarnEnabled()) {
                                LOG.warn(getName() + " " + Thread.currentThread().getId() + " Unknown ProtocolStatus: " + status.getCode());
                            }
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                    }
                    if (redirecting && redirectCount > maxRedirect) {
                        ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                        if (LOG.isInfoEnabled()) {
                            LOG.info(getName() + " " + Thread.currentThread().getId() + "  - redirect count exceeded " + url);
                        }
                        output(url, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, CrawlDatum.STATUS_FETCH_GONE);
                    }
                } while (redirecting && (redirectCount <= maxRedirect));
            } catch (Throwable t) {
                // unexpected exception
                // unblock
                ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                logError(url, StringUtils.stringifyException(t));
                output(url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY);
            }
        }
    } catch (Throwable e) {
        if (LOG.isErrorEnabled()) {
            LOG.error("fetcher caught:" + e.toString());
        }
    } finally {
        if (fit != null)
            ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
        // count threads
        activeThreads.decrementAndGet();
        LOG.info(getName() + " " + Thread.currentThread().getId() + " -finishing thread " + getName() + ", activeThreads=" + activeThreads);
    }
}
Also used: ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput), Text (org.apache.hadoop.io.Text), ParseText (org.apache.nutch.parse.ParseText), URL (java.net.URL), ScoringFilterException (org.apache.nutch.scoring.ScoringFilterException), URLFilterException (org.apache.nutch.net.URLFilterException), MalformedURLException (java.net.MalformedURLException), IOException (java.io.IOException), ParseStatus (org.apache.nutch.parse.ParseStatus), AtomicInteger (java.util.concurrent.atomic.AtomicInteger), Content (org.apache.nutch.protocol.Content), Protocol (org.apache.nutch.protocol.Protocol), BaseRobotRules (crawlercommons.robots.BaseRobotRules), ProtocolStatus (org.apache.nutch.protocol.ProtocolStatus)
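
Before fetching, FetcherThread consults the BaseRobotRules object twice: isAllowed() decides whether the URL may be fetched at all, and getCrawlDelay() is compared against maxCrawlDelay. The sketch below reproduces that gate in isolation; the class and method names are hypothetical, and SimpleRobotRules in ALLOW_ALL / ALLOW_NONE mode stands in for a parsed rule set:

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;

public class RobotsGateExample {

    // Hypothetical gate mirroring the two robots checks in FetcherThread.run().
    static boolean mayFetch(BaseRobotRules rules, String url, long maxCrawlDelayMs) {
        if (!rules.isAllowed(url)) {
            System.out.println("Denied by robots.txt: " + url);
            return false;
        }
        long delay = rules.getCrawlDelay(); // negative when unset
        if (delay > 0 && maxCrawlDelayMs >= 0 && delay > maxCrawlDelayMs) {
            System.out.println("Crawl-Delay " + delay + "ms too long for " + url + ", skipping");
            return false;
        }
        return true;
    }

    public static void main(String[] args) {
        // ALLOW_NONE simulates a "forbid all" rule set, e.g. after a 403 on robots.txt.
        BaseRobotRules forbidAll = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
        BaseRobotRules allowAll = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);

        System.out.println(mayFetch(forbidAll, "http://example.com/page", 30000)); // false
        System.out.println(mayFetch(allowAll, "http://example.com/page", 30000));  // true
    }
}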

Example 4 with BaseRobotRules

use of crawlercommons.robots.BaseRobotRules in project nutch by apache.

From the class RobotRulesParser, method run.

@Override
public int run(String[] args) {
    if (args.length < 2) {
        String[] help = {
                "Usage: RobotRulesParser [ -Dproperty=... ] <robots-file-or-url> <url-file> [<agent-names>]",
                "",
                "<robots-file-or-url>\tlocal file or URL parsed as robots.txt file",
                "\tIf <robots-file-or-url> starts with a protocol specification",
                "\t(`http', `https', `ftp' or `file'), robots.txt it is fetched",
                "\tusing the specified protocol. Otherwise, a local file is assumed.",
                "",
                "<url-file>\tlocal file with URLs (one per line), for every URL",
                "\tthe path part (including the query) is checked whether",
                "\tit is allowed by the robots.txt rules.  Other parts of the URLs",
                "\t(mainly the host) are ignored.",
                "",
                "<agent-names>\tcomma-separated list of agent names",
                "\tused to select rules from the robots.txt file.",
                "\tIf no agent name is given the property http.agent.name is used.",
                "\tIf http.agent.name is empty, robots.txt is checked for rules",
                "\tassigned to the user agent `*' (meaning any other).",
                "",
                "Important properties:",
                " -D fetcher.store.robotstxt=true",
                "\toutput content and HTTP meta data of fetched robots.txt (if not a local file)",
                " -D http.agent.name=...\tsame as argument <agent-names>",
                " -D http.robots.agents=...\tadditional agent names",
                " -D http.robot.rules.whitelist=..." };
        for (String s : help) {
            System.err.println(s);
        }
        return -1;
    }
    Protocol protocol = null;
    URL robotsTxtUrl = null;
    if (args[0].matches("^(?:https?|ftp|file)://?.*")) {
        try {
            robotsTxtUrl = new URL(args[0]);
        } catch (MalformedURLException e) {
            LOG.warn("Not a valid URL, assuming local file: {}", args[0]);
        }
        ProtocolFactory factory = new ProtocolFactory(conf);
        try {
            protocol = factory.getProtocol(robotsTxtUrl.toString());
        } catch (ProtocolNotFound e) {
            LOG.error("No protocol found for {}: {}", args[0], StringUtils.stringifyException(e));
            return -1;
        }
    }
    if (robotsTxtUrl == null) {
        // try as local file
        File robotsFile = new File(args[0]);
        if (!robotsFile.exists()) {
            LOG.error("File does not exist: {}", args[0]);
            return -1;
        } else {
            try {
                robotsTxtUrl = robotsFile.toURI().toURL();
            } catch (MalformedURLException e) {
            }
        }
    }
    File urlFile = new File(args[1]);
    if (args.length > 2) {
        // set agent name from command-line in configuration and update parser
        String agents = args[2];
        conf.set("http.agent.name", agents);
        setConf(conf);
    }
    List<Content> robotsTxtContent = null;
    if (getConf().getBoolean("fetcher.store.robotstxt", false)) {
        robotsTxtContent = new LinkedList<>();
    }
    try {
        BaseRobotRules rules = getRobotRulesSet(protocol, robotsTxtUrl, robotsTxtContent);
        if (robotsTxtContent != null) {
            for (Content robotsTxt : robotsTxtContent) {
                LOG.info("fetched robots.txt {}:", robotsTxt.getUrl());
                LOG.info(robotsTxt.toString());
            }
        }
        System.out.println("Testing robots.txt for agent names: " + agentNames);
        LineNumberReader testsIn = new LineNumberReader(new FileReader(urlFile));
        String testPath;
        testPath = testsIn.readLine();
        while (testPath != null) {
            testPath = testPath.trim();
            try {
                // testPath can be just a path or a complete URL
                URL url = new URL(testPath);
                String status;
                if (isWhiteListed(url)) {
                    status = "whitelisted";
                } else if (rules.isAllowed(testPath)) {
                    status = "allowed";
                } else {
                    status = "not allowed";
                }
                System.out.println(status + ":\t" + testPath);
            } catch (MalformedURLException e) {
                LOG.warn("Not a valid URL: {}", testPath);
            }
            testPath = testsIn.readLine();
        }
        testsIn.close();
    } catch (IOException e) {
        LOG.error("Failed to run: " + StringUtils.stringifyException(e));
        return -1;
    }
    return 0;
}
Also used: MalformedURLException (java.net.MalformedURLException), IOException (java.io.IOException), URL (java.net.URL), LineNumberReader (java.io.LineNumberReader), FileReader (java.io.FileReader), File (java.io.File), BaseRobotRules (crawlercommons.robots.BaseRobotRules)
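
The tool above fetches (or reads) a robots.txt and then checks every URL from <url-file> against the parsed rules. A stripped-down sketch of the same check using crawler-commons directly; the file names and the agent name are assumptions, and whitelisting is omitted:

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;

public class RobotsCheckExample {

    public static void main(String[] args) throws Exception {
        // Assumed local input files: a robots.txt and a list of test URLs, one per line.
        byte[] robotsTxt = Files.readAllBytes(Paths.get("robots.txt"));
        List<String> testUrls = Files.readAllLines(Paths.get("urls.txt"));

        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "http://example.com/robots.txt", robotsTxt, "text/plain", "mycrawler");

        for (String testUrl : testUrls) {
            String status = rules.isAllowed(testUrl.trim()) ? "allowed" : "not allowed";
            System.out.println(status + ":\t" + testUrl.trim());
        }
    }
}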

Aggregations

BaseRobotRules (crawlercommons.robots.BaseRobotRules)4 URL (java.net.URL)4 IOException (java.io.IOException)2 MalformedURLException (java.net.MalformedURLException)2 Text (org.apache.hadoop.io.Text)2 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)2 ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput)2 ProtocolStatus (org.apache.nutch.protocol.ProtocolStatus)2 File (java.io.File)1 FileReader (java.io.FileReader)1 LineNumberReader (java.io.LineNumberReader)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 URLFilterException (org.apache.nutch.net.URLFilterException)1 Response (org.apache.nutch.net.protocols.Response)1 ParseStatus (org.apache.nutch.parse.ParseStatus)1 ParseText (org.apache.nutch.parse.ParseText)1 Content (org.apache.nutch.protocol.Content)1 Protocol (org.apache.nutch.protocol.Protocol)1 ScoringFilterException (org.apache.nutch.scoring.ScoringFilterException)1