Search in sources :

Example 1 with ProtocolOutput

use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

the class TestProtocolHttp method fetchPage.

/**
 * Fetches the specified <code>page</code> from the local Jetty server and
 * checks whether the HTTP response status code matches the expected
 * code. Also uses JSP pages for redirection.
 *
 * @param page
 *          Page to be fetched.
 * @param expectedCode
 *          HTTP response status code expected while fetching the page.
 */
// Requests `page` over HTTP on `port` and asserts that the response status
// code equals `expectedCode`; for most pages it also asserts the content type.
private void fetchPage(String page, int expectedCode) throws Exception {
    // The host component is the empty string; `page` is used as the file part.
    URL url = new URL("http", "", port, page);
    CrawlDatum crawlDatum = new CrawlDatum();
    // The same URL is requested twice: getResponse() supplies the status code,
    // getProtocolOutput() supplies the Content object checked below.
    Response response = http.getResponse(url, crawlDatum, true);
    ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()), crawlDatum);
    Content content = out.getContent();
    assertEquals("HTTP Status Code for " + url, expectedCode, response.getCode());
    // The content-type assertion is skipped for the three pages named here.
    if (page.compareTo("/nonexists.html") != 0 && page.compareTo("/brokenpage.jsp") != 0 && page.compareTo("/redirection") != 0) {
        assertEquals("ContentType " + url, "text/html", content.getContentType());
        // NOTE(review): the closing braces of the if-block and of the method
        // are missing from this excerpt.
Also used : Response(org.apache.nutch.net.protocols.Response) ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) URL(java.net.URL)

Example 2 with ProtocolOutput

use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

the class File method main.

/**
 * Quick way to run this class. Useful for debugging.
 */
// Command-line entry point: parses the options described in `usage`, fetches
// the given URL through File.getProtocolOutput() and prints the status and
// selected metadata to stderr (and optionally the content to stdout).
public static void main(String[] args) throws Exception {
    // Integer.MIN_VALUE serves as the "option not given" marker.
    int maxContentLength = Integer.MIN_VALUE;
    boolean dumpContent = false;
    String urlString = null;
    String usage = "Usage: File [-maxContentLength L] [-dumpContent] url";
    // NOTE(review): the body of this check (presumably printing `usage` and
    // exiting) is missing from this excerpt.
    if (args.length == 0) {
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-maxContentLength")) {
            // Consumes the following argument as the value.
            // NOTE(review): args[++i] is not bounds-checked here; a trailing
            // "-maxContentLength" would index past the end of args.
            maxContentLength = Integer.parseInt(args[++i]);
        } else if (args[i].equals("-dumpContent")) {
            dumpContent = true;
        } else if (i != args.length - 1) {
            // NOTE(review): handling of an unrecognized non-final argument is
            // missing from this excerpt.
        } else
            // The last argument is taken as the URL.
            urlString = args[i];
    File file = new File();
    // NOTE(review): the statement guarded by the following condition is
    // missing from this excerpt.
    if (// set maxContentLength
    maxContentLength != Integer.MIN_VALUE)
    // set log level
    // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
    ProtocolOutput output = file.getProtocolOutput(new Text(urlString), new CrawlDatum());
    Content content = output.getContent();
    // Diagnostics go to stderr so that stdout carries only the dumped content.
    System.err.println("URL: " + content.getUrl());
    System.err.println("Status: " + output.getStatus());
    System.err.println("Content-Type: " + content.getContentType());
    System.err.println("Content-Length: " + content.getMetadata().get(Response.CONTENT_LENGTH));
    System.err.println("Last-Modified: " + content.getMetadata().get(Response.LAST_MODIFIED));
    // "Location" is printed only when that metadata entry is present.
    String redirectLocation = content.getMetadata().get("Location");
    if (redirectLocation != null) {
        System.err.println("Location: " + redirectLocation);
    if (dumpContent) {
        // NOTE(review): new String(byte[]) decodes with the platform default
        // charset; confirm that is acceptable for dumped content.
        System.out.print(new String(content.getContent()));
    file = null;
    // NOTE(review): closing braces are missing throughout this excerpt.
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text)

Example 3 with ProtocolOutput

use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

the class FtpRobotRulesParser method getRobotRulesSet.

/**
 * For hosts whose robots rules have not been cached yet, sends an FTP
 * request to the host corresponding to the {@link URL} passed, fetches the
 * robots file, parses the rules and caches the rules object to avoid
 * re-work in the future.
 *
 * @param ftp
 *          The {@link Protocol} object
 * @param url
 *          URL
 * @param robotsTxtContent
 *          container to store responses when fetching the robots.txt file for
 *          debugging or archival purposes. Instead of a robots.txt file, it
 *          may include redirects or an error page (404, etc.). Response
 *          {@link Content} is appended to the passed list. If null is passed
 *          nothing is stored.
 * @return robotRules A {@link BaseRobotRules} object for the rules
 */
// Returns the robots rules for the host of `url`, using CACHE (keyed by
// "<protocol>:<host>", both lower-cased) and fetching /robots.txt through the
// Ftp protocol object on a cache miss.
public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url, List<Content> robotsTxtContent) {
    // normalize protocol to lower case
    // NOTE(review): toLowerCase() without an explicit Locale is
    // locale-sensitive; confirm whether Locale.ROOT is wanted here.
    String protocol = url.getProtocol().toLowerCase();
    // normalize host to lower case
    String host = url.getHost().toLowerCase();
    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
        LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
    BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
    if (robotRules != null) {
        // cached rule
        return robotRules;
    } else if (LOG.isTraceEnabled()) {
        LOG.trace("cache miss " + url);
    // Cleared in the catch block below so that a failed fetch is not cached.
    boolean cacheRule = true;
    if (isWhiteListed(url)) {
        // check in advance whether a host is whitelisted
        // (we do not need to fetch robots.txt)
        // NOTE(review): the text after the assignment on the next line looks
        // like two logging calls whose call prefix was lost in extraction.
        robotRules = EMPTY_RULES;"Whitelisted host found for: {}", url);"Ignoring robots.txt for all URLs from whitelisted host: {}", host);
    } else {
        try {
            // robots.txt is resolved against the root of the given URL.
            Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
            // The Protocol argument is cast to Ftp unconditionally.
            ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl, new CrawlDatum());
            ProtocolStatus status = output.getStatus();
            // NOTE(review): the body of this null check is missing from this
            // excerpt.
            if (robotsTxtContent != null) {
            if (status.getCode() == ProtocolStatus.SUCCESS) {
                robotRules = parseRules(url.toString(), output.getContent().getContent(), CONTENT_TYPE, agentNames);
            } else {
                // use default rules
                robotRules = EMPTY_RULES;
        } catch (Throwable t) {
            if (LOG.isInfoEnabled()) {
      // NOTE(review): the logging call prefix of the next line was lost.
      "Couldn't get robots.txt for " + url + ": " + t.toString());
            // try again later to fetch robots.txt
            cacheRule = false;
            robotRules = EMPTY_RULES;
    if (cacheRule)
        // cache rules for host
        CACHE.put(protocol + ":" + host, robotRules);
    return robotRules;
    // NOTE(review): closing braces are missing throughout this excerpt.
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) BaseRobotRules(crawlercommons.robots.BaseRobotRules) URL(java.net.URL) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)

Example 4 with ProtocolOutput

use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

the class FetcherThread method run.

// Fetcher thread main loop: repeatedly takes a FetchItem from fetchQueues,
// consults the robots rules for its URL, fetches it through the matching
// Protocol and dispatches on the resulting ProtocolStatus code.
// NOTE(review): this excerpt is missing closing braces and several
// statements; the comments below describe only what is visible.
public void run() {
    // count threads
    Text url = new Text();
    FetchItem fit = null;
    try {
        // checking for the server to be running and fetcher.parse to be true
        if (parsing && NutchServer.getInstance().isRunning())
            reportToNutchServer = true;
        while (true) {
            // creating FetchNode for storing in FetchNodeDb
            // NOTE(review): an `else` appears to be missing between the two
            // assignments below; as shown, fetchNode is always reset to null.
            if (reportToNutchServer)
                this.fetchNode = new FetchNode();
                this.fetchNode = null;
            // check whether must be stopped
            if (isHalted()) {
                LOG.debug(getName() + " set to halted");
                fit = null;
            fit = ((FetchItemQueues) fetchQueues).getFetchItem();
            if (fit != null) {
                URL u = fit.u;
                // NOTE(review): java.net.URL.getPort() returns -1 when the URL
                // has no explicit port, which would put ":-1" into temp_url;
                // confirm that this is intended.
                String temp_url = u.getProtocol() + "://" + u.getHost() + ":" + u.getPort() + u.getFile();
                url = new Text(temp_url);
            if (fit == null) {
                // No item available: keep waiting while the feeder is alive or
                // items remain queued.
                if (feeder.isAlive() || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) {
                    LOG.debug(getName() + " spin-waiting ...");
                    // spin-wait.
                    ((AtomicInteger) spinWaiting).incrementAndGet();
                    // NOTE(review): the body of this try block (presumably the
                    // wait itself) is missing from this excerpt.
                    try {
                    } catch (Exception e) {
                    ((AtomicInteger) spinWaiting).decrementAndGet();
                } else {
                    // all done, finish this thread
           // NOTE(review): the logging call prefix of the next line was lost.
           + " " + Thread.currentThread().getId() + " has no more work available");
            Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
            // NOTE(review): both branches of this check are empty in this
            // excerpt.
            if (reprUrlWritable == null) {
            } else {
            try {
                // fetch the page
                redirecting = false;
                redirectCount = 0;
                // Publisher event
                if (activatePublisher) {
                    FetcherThreadEvent startEvent = new FetcherThreadEvent(PublishEventType.START, fit.getUrl().toString());
                    publisher.publish(startEvent, conf);
                // One iteration per fetch attempt; repeats while a redirect is
                // being followed and redirectCount <= maxRedirect.
                do {
                    if (LOG.isInfoEnabled()) {
               // NOTE(review): the logging call prefix of the next line was lost.
               + " " + Thread.currentThread().getId() + " fetching " + url + " (queue crawl delay=" + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay + "ms)");
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("redirectCount=" + redirectCount);
                    redirecting = false;
                    Protocol protocol = this.protocolFactory.getProtocol(url.toString());
                    BaseRobotRules rules = protocol.getRobotRules(url, fit.datum, robotsTxtContent);
                    // NOTE(review): the body of this null check is missing.
                    if (robotsTxtContent != null) {
                    // URL disallowed by robots rules: release the queue slot
                    // and record the item as gone.
                    if (!rules.isAllowed(fit.u.toString())) {
                        // unblock
                        ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Denied by robots.txt: " + url);
                        output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                        context.getCounter("FetcherStatus", "robots_denied").increment(1);
                    if (rules.getCrawlDelay() > 0) {
                        // A negative maxCrawlDelay disables the upper limit.
                        if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                            // unblock
                            ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
                            LOG.debug("Crawl-Delay for " + url + " too long (" + rules.getCrawlDelay() + "), skipping");
                            output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                            context.getCounter("FetcherStatus", "robots_denied_maxcrawldelay").increment(1);
                        } else {
                            // Adopt the robots.txt crawl delay for this queue.
                            FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
                            fiq.crawlDelay = rules.getCrawlDelay();
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + url);
                    ProtocolOutput output = protocol.getProtocolOutput(url, fit.datum);
                    ProtocolStatus status = output.getStatus();
                    Content content = output.getContent();
                    ParseStatus pstatus = null;
                    // unblock queue
                    ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                    String urlString = url.toString();
                    // used for FetchNode
                    // NOTE(review): the body of this check is missing.
                    if (fetchNode != null) {
                    // Publish fetch finish event
                    if (activatePublisher) {
                        FetcherThreadEvent endEvent = new FetcherThreadEvent(PublishEventType.END, fit.getUrl().toString());
                        endEvent.addEventData("status", status.getName());
                        publisher.publish(endEvent, conf);
                    // One counter per protocol status name.
                    context.getCounter("FetcherStatus", status.getName()).increment(1);
                    // NOTE(review): no `break` statements, no `default` label
                    // and no label on the "got a page" case are visible in
                    // this excerpt; verify against the full source before
                    // relying on the control flow shown here.
                    switch(status.getCode()) {
                        case ProtocolStatus.WOULDBLOCK:
                            // retry ?
                            ((FetchItemQueues) fetchQueues).addFetchItem(fit);
                        case // got a page
                            pstatus = output(url, fit.datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
                            // A successful parse may itself report a redirect.
                            if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                                String newUrl = pstatus.getMessage();
                                int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                                Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR);
                                if (redirUrl != null) {
                                    fit = queueRedirect(redirUrl, fit);
                        // redirect
                        case ProtocolStatus.MOVED:
                        case ProtocolStatus.TEMP_MOVED:
                            int code;
                            boolean temp;
                            // MOVED maps to a permanent redirect status, anything
                            // else reaching here to a temporary one.
                            if (status.getCode() == ProtocolStatus.MOVED) {
                                code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                                temp = false;
                            } else {
                                code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                                temp = true;
                            output(url, fit.datum, content, status, code);
                            String newUrl = status.getMessage();
                            Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR);
                            if (redirUrl != null) {
                                fit = queueRedirect(redirUrl, fit);
                            } else {
                                // stop redirecting
                                redirecting = false;
                        case ProtocolStatus.EXCEPTION:
                            logError(url, status.getMessage());
                            // A non-zero result is added to the counter below.
                            int killedURLs = ((FetchItemQueues) fetchQueues).checkExceptionThreshold(fit.getQueueID());
                            if (killedURLs != 0)
                                context.getCounter("FetcherStatus", "AboveExceptionThresholdInQueue").increment(killedURLs);
                        // retry
                        case ProtocolStatus.RETRY:
                        case ProtocolStatus.BLOCKED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                        // gone
                        case ProtocolStatus.GONE:
                        case ProtocolStatus.NOTFOUND:
                        case ProtocolStatus.ACCESS_DENIED:
                        case ProtocolStatus.ROBOTS_DENIED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
                        case ProtocolStatus.NOTMODIFIED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
                            if (LOG.isWarnEnabled()) {
                                LOG.warn(getName() + " " + Thread.currentThread().getId() + " Unknown ProtocolStatus: " + status.getCode());
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                    // Redirect limit exceeded: release the queue slot and record
                    // the item as gone.
                    if (redirecting && redirectCount > maxRedirect) {
                        ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                        if (LOG.isInfoEnabled()) {
                   // NOTE(review): the logging call prefix of the next line was lost.
                   + " " + Thread.currentThread().getId() + "  - redirect count exceeded " + url);
                        output(url, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, CrawlDatum.STATUS_FETCH_GONE);
                } while (redirecting && (redirectCount <= maxRedirect));
            } catch (Throwable t) {
                // unexpected exception
                // unblock
                ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                logError(url, StringUtils.stringifyException(t));
                output(url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY);
    } catch (Throwable e) {
        if (LOG.isErrorEnabled()) {
            LOG.error("fetcher caught:" + e.toString());
    } finally {
        // Release any item still held by this thread before it ends.
        if (fit != null)
            ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
        // count threads
        // NOTE(review): the text after decrementAndGet() on the next line looks
        // like a logging call whose prefix was lost in extraction.
        activeThreads.decrementAndGet(); + " " + Thread.currentThread().getId() + " -finishing thread " + getName() + ", activeThreads=" + activeThreads);
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) URL(java.net.URL) ScoringFilterException(org.apache.nutch.scoring.ScoringFilterException) URLFilterException(org.apache.nutch.net.URLFilterException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) ParseStatus(org.apache.nutch.parse.ParseStatus) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Content(org.apache.nutch.protocol.Content) Protocol(org.apache.nutch.protocol.Protocol) BaseRobotRules(crawlercommons.robots.BaseRobotRules) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)

Example 5 with ProtocolOutput

use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

the class ParserChecker method run.

// Command-line driver: parses the options described in `usage`, fetches the
// URL through the matching Protocol (following redirects up to the configured
// limit), parses the content and runs the scoring filters before and after
// parsing. Returns 0 on success and -1 on the failure paths shown below.
// NOTE(review): this excerpt is missing closing braces and several
// statements; the comments below describe only what is visible.
public int run(String[] args) throws Exception {
    boolean dumpText = false;
    boolean force = false;
    String contentType = null;
    String url = null;
    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";
    if (args.length == 0) {
        return (-1);
    // used to simulate the metadata propagated from injection
    HashMap<String, String> metadata = new HashMap<>();
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-forceAs")) {
            force = true;
            // Consumes the following argument as the forced MIME type.
            contentType = args[++i];
        } else if (args[i].equals("-dumpText")) {
            dumpText = true;
        } else if (args[i].equals("-md")) {
            // "-md key=value": split at the first '='; without '=' the whole
            // argument is the key and the value stays null.
            String k = null, v = null;
            String nextOne = args[++i];
            int firstEquals = nextOne.indexOf("=");
            if (firstEquals != -1) {
                k = nextOne.substring(0, firstEquals);
                v = nextOne.substring(firstEquals + 1);
            } else
                k = nextOne;
            metadata.put(k, v);
        } else if (i != args.length - 1) {
            // NOTE(review): handling of an unrecognized non-final argument is
            // missing from this excerpt.
        } else {
            // The last argument is the URL, converted with URLUtil.toASCII.
            url = URLUtil.toASCII(args[i]);
    // NOTE(review): the logging call prefix on the next line was lost.
    if (LOG.isInfoEnabled()) {"fetching: " + url);
    CrawlDatum cd = new CrawlDatum();
    // Copy the -md entries into the CrawlDatum metadata; null values become "".
    Iterator<String> iter = metadata.keySet().iterator();
    while (iter.hasNext()) {
        // NOTE(review): the right-hand side of this assignment was lost
        // (presumably the iterator's next element).
        String key =;
        String value = metadata.get(key);
        if (value == null)
            value = "";
        cd.getMetaData().put(new Text(key), new Text(value));
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Text turl = new Text(url);
    ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
    // if the configuration permits, handle redirects until we either run
    // out of allowed redirects or we stop getting redirect statuses.
    // With the default of 0 no redirect is followed.
    int maxRedirects = conf.getInt("http.redirect.max", 0);
    int numRedirects = 0;
    // NOTE(review): no increment of numRedirects is visible in this excerpt.
    while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
        // NOTE(review): the text after the assignment on the next line looks
        // like a logging call whose prefix was lost in extraction.
        String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]);"Handling redirect to " + newURL);
        protocol = factory.getProtocol(newURL);
        // turl now refers to the redirect target; `url` keeps the original.
        turl = new Text(newURL);
        output = protocol.getProtocolOutput(turl, cd);
    if (!output.getStatus().isSuccess()) {
        System.err.println("Fetch failed with protocol status: " + output.getStatus());
        if (output.getStatus().isRedirect()) {
            System.err.println("Redirect(s) not handled due to configuration.");
            System.err.println("Max Redirects to handle per config: " + maxRedirects);
            System.err.println("Number of Redirects handled: " + numRedirects);
        return (-1);
    Content content = output.getContent();
    if (content == null) {
        LOG.error("No content for " + url);
        return (-1);
    // NOTE(review): the `force` branch is empty in this excerpt; otherwise the
    // content type reported by the fetched content is used.
    if (force) {
    } else {
        contentType = content.getContentType();
    if (contentType == null) {
        LOG.error("Failed to determine content type!");
        return (-1);
    if (ParseSegment.isTruncated(content)) {
        LOG.warn("Content is truncated, parse may fail!");
    ScoringFilters scfilters = new ScoringFilters(conf);
    // call the scoring filters
    // A failure here is logged as a warning and does not abort the run.
    try {
        scfilters.passScoreBeforeParsing(turl, cd, content);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e + ")");
    ParseResult parseResult = new ParseUtil(conf).parse(content);
    if (parseResult == null) {
        LOG.error("Parsing content failed!");
        return (-1);
    // calculate the signature
    // NOTE(review): the lookup here uses new Text(url) while the lookup below
    // uses turl; after a redirect the two keys differ. Confirm intended.
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parseResult.get(new Text(url)));
    // NOTE(review): the logging call prefixes on the next line were lost.
    if (LOG.isInfoEnabled()) {"parsing: " + url);"contentType: " + contentType);"signature: " + StringUtil.toHexString(signature));
    Parse parse = parseResult.get(turl);
    if (parse == null) {
        // List the keys that are present to help diagnose the missing entry.
        LOG.error("Failed to get parse from parse result");
        LOG.error("Available parses in parse result (by URL key):");
        for (Map.Entry<Text, Parse> entry : parseResult) {
            LOG.error("  " + entry.getKey());
        LOG.error("Parse result does not contain a parse for URL to be checked:");
        LOG.error("  " + turl);
        return -1;
    // call the scoring filters
    // A failure here is logged as a warning and does not abort the run.
    try {
        scfilters.passScoreAfterParsing(turl, content, parse);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e + ")");
    for (Map.Entry<Text, Parse> entry : parseResult) {
        // NOTE(review): the output statements of this loop were lost in
        // extraction; only a string fragment remains on the next line.
        parse = entry.getValue();"---------\nUrl\n---------------\n");
        if (dumpText) {
    return 0;
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) HashMap(java.util.HashMap) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) Content(org.apache.nutch.protocol.Content) ScoringFilters(org.apache.nutch.scoring.ScoringFilters) Protocol(org.apache.nutch.protocol.Protocol) HashMap(java.util.HashMap) Map(java.util.Map)


ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput)12 Text ( Content (org.apache.nutch.protocol.Content)7 URL ( CrawlDatum (org.apache.nutch.crawl.CrawlDatum)6 ProtocolStatus (org.apache.nutch.protocol.ProtocolStatus)5 Protocol (org.apache.nutch.protocol.Protocol)4 ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory)3 BaseRobotRules (crawlercommons.robots.BaseRobotRules)2 IOException ( MalformedURLException ( HashMap (java.util.HashMap)2 Map (java.util.Map)2 Response ( ScoringFilters (org.apache.nutch.scoring.ScoringFilters)2 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 IntWritable ( Inlinks (org.apache.nutch.crawl.Inlinks)1 URLFilterException ( Parse (org.apache.nutch.parse.Parse)1