Use of org.apache.nutch.protocol.ProtocolStatus in project nutch by apache: class CrawlDatum, method evaluate().
public boolean evaluate(Expression expr, String url) {
  if (expr != null && url != null) {
    // Create a context and add data
    JexlContext jcontext = new MapContext();
    // https://issues.apache.org/jira/browse/NUTCH-2229
    jcontext.set("url", url);
    jcontext.set("status", getStatusName(getStatus()));
    jcontext.set("fetchTime", (long) (getFetchTime()));
    jcontext.set("modifiedTime", (long) (getModifiedTime()));
    jcontext.set("retries", getRetriesSinceFetch());
    jcontext.set("interval", Integer.valueOf(getFetchInterval()));
    jcontext.set("score", getScore());
    jcontext.set("signature", StringUtil.toHexString(getSignature()));
    // Set metadata variables
    for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
      Object value = entry.getValue();
      Text tkey = (Text) entry.getKey();
      if (value instanceof FloatWritable) {
        FloatWritable fvalue = (FloatWritable) value;
        jcontext.set(tkey.toString(), fvalue.get());
      }
      if (value instanceof IntWritable) {
        IntWritable ivalue = (IntWritable) value;
        jcontext.set(tkey.toString(), ivalue.get());
      }
      if (value instanceof Text) {
        Text tvalue = (Text) value;
        jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString());
      }
      if (value instanceof ProtocolStatus) {
        ProtocolStatus pvalue = (ProtocolStatus) value;
        jcontext.set(tkey.toString().replace("-", "_"), pvalue.toString());
      }
    }
    try {
      if (Boolean.TRUE.equals(expr.evaluate(jcontext))) {
        return true;
      }
    } catch (Exception e) {
      // Treat evaluation errors as a non-match and fall through to false
    }
  }
  return false;
}
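A minimal caller sketch (not part of the Nutch sources) showing how such an expression might be built and passed in. It assumes the Expression parameter above is the Commons JEXL 2 type (org.apache.commons.jexl2.Expression), which matches the expr.evaluate(jcontext) call; the class name, expression text and URL are illustrative only.

import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine;
import org.apache.nutch.crawl.CrawlDatum;

// Hypothetical caller, not part of Nutch itself.
public class CrawlDatumFilterSketch {
  public static boolean keep(CrawlDatum datum, String url) {
    JexlEngine jexl = new JexlEngine();
    // Variable names match those bound into the JexlContext above
    // (url, status, fetchTime, retries, interval, score, signature, metadata keys).
    Expression expr = jexl.createExpression("status == 'db_fetched' && score > 0.5");
    return datum.evaluate(expr, url);
  }
}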
Use of org.apache.nutch.protocol.ProtocolStatus in project nutch by apache: class UpdateHostDbMapper, method map().
/**
 * Mapper ingesting records from the HostDB, CrawlDB and plaintext host
 * scores file. Statistics and scores are passed on.
 *
 * @param key record {@link org.apache.hadoop.io.Text} key
 * @param value associated {@link org.apache.hadoop.io.Writable} object
 * @param context {@link org.apache.hadoop.mapreduce.Mapper.Context} for
 *          writing custom counters and output.
 */
@Override
public void map(Text key, Writable value, Context context)
    throws IOException, InterruptedException {
  // Get the key!
  String keyStr = key.toString();
  // Check if we process records from the CrawlDB
  if (key instanceof Text && value instanceof CrawlDatum) {
    // Get the normalized and filtered host of this URL
    buffer = filterNormalize(URLUtil.getHost(keyStr));
    // Filtered out?
    if (buffer == null) {
      context.getCounter("UpdateHostDb", "filtered_records").increment(1);
      LOG.info("UpdateHostDb: " + URLUtil.getHost(keyStr) + " crawldatum has been filtered");
      return;
    }
    // Set the host of this URL
    host.set(buffer);
    crawlDatum = (CrawlDatum) value;
    hostDatum = new HostDatum();
    // Do not resolve homepages when the root URL is unfetched
    if (crawlDatum.getStatus() != CrawlDatum.STATUS_DB_UNFETCHED) {
      // Get the protocol
      String protocol = URLUtil.getProtocol(keyStr);
      // Get the proposed homepage URL
      String homepage = protocol + "://" + buffer + "/";
      // Check if the current key equals the homepage URL
      if (keyStr.equals(homepage)) {
        // Check if this is a redirect to the real home page
        if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
            || crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
          // Obtain the repr url for this redirect via the ProtocolStatus from the metadata
          ProtocolStatus z = (ProtocolStatus) crawlDatum.getMetaData()
              .get(Nutch.WRITABLE_PROTO_STATUS_KEY);
          // Get the protocol status' arguments
          args = z.getArgs();
          // ..and the possible redirect URL
          reprUrl = args[0];
          // Am I a redirect?
          if (reprUrl != null) {
            LOG.info("UpdateHostDb: homepage: " + keyStr + " redirects to: " + args[0]);
            context.write(host, new NutchWritable(hostDatum));
            hostDatum.setHomepageUrl(reprUrl);
          } else {
            LOG.info("UpdateHostDb: homepage: " + keyStr + " redirects to: " + args[0]
                + " but has been filtered out");
          }
        } else {
          hostDatum.setHomepageUrl(homepage);
          context.write(host, new NutchWritable(hostDatum));
          LOG.info("UpdateHostDb: homepage: " + homepage);
        }
      }
    }
    // Always emit crawl datum
    context.write(host, new NutchWritable(crawlDatum));
  }
  // Check if we got a record from the HostDB
  if (key instanceof Text && value instanceof HostDatum) {
    buffer = filterNormalize(keyStr);
    // Filtered out?
    if (buffer == null) {
      context.getCounter("UpdateHostDb", "filtered_records").increment(1);
      LOG.info("UpdateHostDb: {} hostdatum has been filtered", keyStr);
      return;
    }
    // Get a HostDatum
    hostDatum = (HostDatum) value;
    key.set(buffer);
    // Reset db_* statistics; we're aggregating them from the CrawlDB anyway
    if (readingCrawlDb) {
      hostDatum.resetStatistics();
    }
    context.write(key, new NutchWritable(hostDatum));
  }
  // Check if we got a record with host scores
  if (key instanceof Text && value instanceof Text) {
    buffer = filterNormalize(keyStr);
    // Filtered out?
    if (buffer == null) {
      context.getCounter("UpdateHostDb", "filtered_records").increment(1);
      LOG.info("UpdateHostDb: {} score has been filtered", keyStr);
      return;
    }
    key.set(buffer);
    context.write(key,
        new NutchWritable(new FloatWritable(Float.parseFloat(value.toString()))));
  }
}
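The three instanceof branches exist because the same mapper receives CrawlDatum records from the CrawlDB, HostDatum records from the HostDB and plain Text scores from a text file. A rough driver sketch under that assumption; the class name, paths, input formats and remaining job wiring are placeholders and not the actual UpdateHostDb job configuration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;

// Hypothetical driver fragment illustrating why map() branches on the value type.
public class UpdateHostDbDriverSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "UpdateHostDb");
    // CrawlDb and HostDb parts are SequenceFiles of <Text, CrawlDatum> and
    // <Text, HostDatum>; the plaintext score file is read as <Text, Text>.
    MultipleInputs.addInputPath(job, new Path("crawldb/current"),
        SequenceFileInputFormat.class, UpdateHostDbMapper.class);
    MultipleInputs.addInputPath(job, new Path("hostdb/current"),
        SequenceFileInputFormat.class, UpdateHostDbMapper.class);
    MultipleInputs.addInputPath(job, new Path("host-scores.txt"),
        KeyValueTextInputFormat.class, UpdateHostDbMapper.class);
    // Remaining job settings (output types, reducer, output path) omitted in this sketch.
  }
}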
Use of org.apache.nutch.protocol.ProtocolStatus in project nutch by apache: class FetcherThread, method run().
@SuppressWarnings("fallthrough")
public void run() {
  // count threads
  activeThreads.incrementAndGet();
  FetchItem fit = null;
  try {
    // checking for the server to be running and fetcher.parse to be true
    if (parsing && NutchServer.getInstance().isRunning())
      reportToNutchServer = true;
    while (true) {
      // creating FetchNode for storing in FetchNodeDb
      if (reportToNutchServer)
        this.fetchNode = new FetchNode();
      else
        this.fetchNode = null;
      // check whether must be stopped
      if (isHalted()) {
        LOG.debug("{} set to halted", getName());
        fit = null;
        return;
      }
      fit = fetchQueues.getFetchItem();
      if (fit == null) {
        if (feeder.isAlive() || fetchQueues.getTotalSize() > 0) {
          LOG.debug("{} spin-waiting ...", getName());
          // spin-wait.
          spinWaiting.incrementAndGet();
          try {
            Thread.sleep(500);
          } catch (Exception e) {
            // interrupted sleep is harmless; just re-check the queue
          }
          spinWaiting.decrementAndGet();
          continue;
        } else {
          // all done, finish this thread
          LOG.info("{} {} has no more work available", getName(),
              Thread.currentThread().getId());
          return;
        }
      }
      lastRequestStart.set(System.currentTimeMillis());
      Text reprUrlWritable = (Text) fit.datum.getMetaData()
          .get(Nutch.WRITABLE_REPR_URL_KEY);
      if (reprUrlWritable == null) {
        setReprUrl(fit.url.toString());
      } else {
        setReprUrl(reprUrlWritable.toString());
      }
      try {
        // fetch the page
        redirecting = false;
        redirectCount = 0;
        // Publisher event
        if (activatePublisher) {
          FetcherThreadEvent startEvent = new FetcherThreadEvent(
              PublishEventType.START, fit.getUrl().toString());
          publisher.publish(startEvent, conf);
        }
        do {
          if (LOG.isInfoEnabled()) {
            LOG.info("{} {} fetching {} (queue crawl delay={}ms)", getName(),
                Thread.currentThread().getId(), fit.url,
                fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay);
          }
          LOG.debug("redirectCount={}", redirectCount);
          redirecting = false;
          Protocol protocol = this.protocolFactory.getProtocol(fit.u);
          BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum,
              robotsTxtContent);
          if (robotsTxtContent != null) {
            outputRobotsTxt(robotsTxtContent);
            robotsTxtContent.clear();
          }
          if (rules.isDeferVisits()) {
            LOG.info("Defer visits for queue {} : {}", fit.queueID, fit.url);
            // retry the fetch item
            if (fetchQueues.timelimitExceeded()) {
              fetchQueues.finishFetchItem(fit, true);
            } else {
              fetchQueues.addFetchItem(fit);
            }
            // but check whether it's time to cancel the queue
            int killedURLs = fetchQueues.checkExceptionThreshold(fit.getQueueID(),
                this.robotsDeferVisitsRetries + 1, this.robotsDeferVisitsDelay);
            if (killedURLs != 0) {
              context.getCounter("FetcherStatus", "robots_defer_visits_dropped")
                  .increment(killedURLs);
            }
            continue;
          }
          if (!rules.isAllowed(fit.url.toString())) {
            // unblock
            fetchQueues.finishFetchItem(fit, true);
            LOG.info("Denied by robots.txt: {}", fit.url);
            output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED,
                CrawlDatum.STATUS_FETCH_GONE);
            context.getCounter("FetcherStatus", "robots_denied").increment(1);
            continue;
          }
          if (rules.getCrawlDelay() > 0) {
            if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
              // unblock
              fetchQueues.finishFetchItem(fit, true);
              LOG.info("Crawl-Delay for {} too long ({} ms), skipping", fit.url,
                  rules.getCrawlDelay());
              output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED,
                  CrawlDatum.STATUS_FETCH_GONE);
              context.getCounter("FetcherStatus", "robots_denied_maxcrawldelay")
                  .increment(1);
              continue;
            } else {
              FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
              long crawlDelay = rules.getCrawlDelay();
              if (crawlDelay < minCrawlDelay) {
                LOG.info("Crawl-Delay for {} too short ({} ms), adjusting to {} ms",
                    fit.url, rules.getCrawlDelay(), minCrawlDelay);
                crawlDelay = minCrawlDelay;
              }
              fiq.crawlDelay = crawlDelay;
              LOG.debug("Crawl delay for queue: {} is set to {} as per robots.txt. url: {}",
                  fit.queueID, fiq.crawlDelay, fit.url);
            }
          }
          ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum);
          ProtocolStatus status = output.getStatus();
          Content content = output.getContent();
          ParseStatus pstatus = null;
          // unblock queue
          fetchQueues.finishFetchItem(fit);
          // used for FetchNode
          if (fetchNode != null) {
            fetchNode.setStatus(status.getCode());
            fetchNode.setFetchTime(System.currentTimeMillis());
            fetchNode.setUrl(fit.url);
          }
          // Publish fetch finish event
          if (activatePublisher) {
            FetcherThreadEvent endEvent = new FetcherThreadEvent(
                PublishEventType.END, fit.getUrl().toString());
            endEvent.addEventData("status", status.getName());
            publisher.publish(endEvent, conf);
          }
          context.getCounter("FetcherStatus", status.getName()).increment(1);
          switch (status.getCode()) {
            case ProtocolStatus.WOULDBLOCK:
              // retry ?
              fetchQueues.addFetchItem(fit);
              break;
            case ProtocolStatus.SUCCESS: // got a page
              pstatus = output(fit.url, fit.datum, content, status,
                  CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
              updateStatus(content.getContent().length);
              if (pstatus != null && pstatus.isSuccess()
                  && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                String newUrl = pstatus.getMessage();
                int refreshTime = Integer.parseInt(pstatus.getArgs()[1]);
                Text redirUrl = handleRedirect(fit, newUrl,
                    refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR);
                if (redirUrl != null) {
                  fit = queueRedirect(redirUrl, fit);
                }
              }
              break;
            // redirect
            case ProtocolStatus.MOVED:
            case ProtocolStatus.TEMP_MOVED:
              int code;
              boolean temp;
              if (status.getCode() == ProtocolStatus.MOVED) {
                code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                temp = false;
              } else {
                code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                temp = true;
              }
              output(fit.url, fit.datum, content, status, code);
              String newUrl = status.getMessage();
              Text redirUrl = handleRedirect(fit, newUrl, temp, Fetcher.PROTOCOL_REDIR);
              if (redirUrl != null) {
                fit = queueRedirect(redirUrl, fit);
              } else {
                // stop redirecting
                redirecting = false;
              }
              break;
            case ProtocolStatus.EXCEPTION:
              logError(fit.url, status.getMessage());
              int killedURLs = fetchQueues.checkExceptionThreshold(fit.getQueueID());
              if (killedURLs != 0)
                context.getCounter("FetcherStatus", "AboveExceptionThresholdInQueue")
                    .increment(killedURLs);
              // retry
            case ProtocolStatus.RETRY:
            case ProtocolStatus.BLOCKED:
              output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
              break;
            // gone
            case ProtocolStatus.GONE:
            case ProtocolStatus.NOTFOUND:
            case ProtocolStatus.ACCESS_DENIED:
            case ProtocolStatus.ROBOTS_DENIED:
              output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
              break;
            case ProtocolStatus.NOTMODIFIED:
              output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
              break;
            default:
              if (LOG.isWarnEnabled()) {
                LOG.warn("{} {} Unknown ProtocolStatus: {}", getName(),
                    Thread.currentThread().getId(), status.getCode());
              }
              output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
          }
          if (redirecting && redirectCount > maxRedirect) {
            fetchQueues.finishFetchItem(fit);
            context.getCounter("FetcherStatus", "redirect_count_exceeded").increment(1);
            if (LOG.isInfoEnabled()) {
              LOG.info("{} {} - redirect count exceeded {} ({})", getName(),
                  Thread.currentThread().getId(), fit.url,
                  maxRedirectExceededSkip ? "skipped" : "linked");
            }
            if (maxRedirectExceededSkip) {
              // skip redirect target when redirect count is exceeded
            } else {
              Text newUrl = new Text(status.getMessage());
              CrawlDatum newDatum = createRedirDatum(newUrl, fit, CrawlDatum.STATUS_LINKED);
              output(newUrl, newDatum, null, null, CrawlDatum.STATUS_LINKED);
            }
          }
        } while (redirecting && (redirectCount <= maxRedirect));
      } catch (Throwable t) {
        // unexpected exception
        // unblock
        fetchQueues.finishFetchItem(fit);
        String message;
        if (LOG.isDebugEnabled()) {
          message = StringUtils.stringifyException(t);
        } else if (logUtil.logShort(t)) {
          message = t.getClass().getName();
        } else {
          message = StringUtils.stringifyException(t);
        }
        logError(fit.url, message);
        output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED,
            CrawlDatum.STATUS_FETCH_RETRY);
      }
    }
  } catch (Throwable e) {
    if (LOG.isErrorEnabled()) {
      LOG.error("fetcher caught:", e);
    }
  } finally {
    if (fit != null) {
      fetchQueues.finishFetchItem(fit);
    }
    // count threads
    activeThreads.decrementAndGet();
    LOG.info("{} {} -finishing thread {}, activeThreads={}", getName(),
        Thread.currentThread().getId(), getName(), activeThreads);
  }
}
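For context, the ProtocolStatus that UpdateHostDbMapper (above) and SegmentHandler (below) read back from Nutch.WRITABLE_PROTO_STATUS_KEY is assumed to be written on the fetcher side roughly as in the following sketch; the class and method names are hypothetical, and the real write happens inside FetcherThread's output handling, which is not shown in this excerpt.

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.ProtocolStatus;

// Hypothetical helper illustrating the assumed write side of the metadata key.
class ProtocolStatusMetadata {
  static void record(CrawlDatum datum, ProtocolStatus status) {
    // ProtocolStatus implements Writable, so it can be stored directly in the
    // CrawlDatum's metadata map under the well-known key read back elsewhere.
    datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, status);
  }
}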
Use of org.apache.nutch.protocol.ProtocolStatus in project nutch by apache: class SegmentHandler, method handle().
@Override
public void handle(Request req, HttpServletResponse res, String target,
    int dispatch) throws IOException, ServletException {
  try {
    String uri = req.getUri().toString();
    LOG.info("URI: " + uri);
    addMyHeader(res, "URI", uri);
    Text url = new Text(uri);
    CrawlDatum cd = seg.getCrawlDatum(url);
    if (cd != null) {
      addMyHeader(res, "Res", "found");
      LOG.info("-got " + cd.toString());
      ProtocolStatus ps = (ProtocolStatus) cd.getMetaData()
          .get(Nutch.WRITABLE_PROTO_STATUS_KEY);
      if (ps != null) {
        Integer trCode = protoCodes.get(ps.getCode());
        if (trCode != null) {
          res.setStatus(trCode.intValue());
        } else {
          res.setStatus(HttpServletResponse.SC_OK);
        }
        addMyHeader(res, "ProtocolStatus", ps.toString());
      } else {
        res.setStatus(HttpServletResponse.SC_OK);
      }
      Content c = seg.getContent(url);
      if (c == null) {
        // missing content
        req.setHandled(true);
        res.addHeader("X-Handled-By", getClass().getSimpleName());
        return;
      }
      byte[] data = c.getContent();
      LOG.debug("-data len=" + data.length);
      Metadata meta = c.getMetadata();
      String[] names = meta.names();
      LOG.debug("- " + names.length + " meta");
      for (int i = 0; i < names.length; i++) {
        boolean my = true;
        char ch = names[i].charAt(0);
        if (Character.isLetter(ch) && Character.isUpperCase(ch)) {
          // pretty good chance it's a standard header
          my = false;
        }
        String[] values = meta.getValues(names[i]);
        for (int k = 0; k < values.length; k++) {
          if (my) {
            addMyHeader(res, names[i], values[k]);
          } else {
            res.addHeader(names[i], values[k]);
          }
        }
      }
      req.setHandled(true);
      res.addHeader("X-Handled-By", getClass().getSimpleName());
      res.setContentType(meta.get(Metadata.CONTENT_TYPE));
      res.setContentLength(data.length);
      OutputStream os = res.getOutputStream();
      os.write(data, 0, data.length);
      res.flushBuffer();
    } else {
      addMyHeader(res, "Res", "not found");
      LOG.info(" -not found " + url);
    }
  } catch (Exception e) {
    e.printStackTrace();
    LOG.warn(StringUtils.stringifyException(e));
    addMyHeader(res, "Res", "Exception: " + StringUtils.stringifyException(e));
  }
}
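protoCodes is the handler's lookup from ProtocolStatus codes to HTTP response codes. Below is a hypothetical initialization, kept to status codes that appear in the FetcherThread switch above; the actual mapping in SegmentHandler may differ.

import java.util.HashMap;
import java.util.Map;
import javax.servlet.http.HttpServletResponse;
import org.apache.nutch.protocol.ProtocolStatus;

// Hypothetical lookup table translating ProtocolStatus codes to servlet status codes.
class ProtoCodesSketch {
  static final Map<Integer, Integer> protoCodes = new HashMap<>();
  static {
    protoCodes.put(ProtocolStatus.SUCCESS, HttpServletResponse.SC_OK);
    protoCodes.put(ProtocolStatus.NOTMODIFIED, HttpServletResponse.SC_NOT_MODIFIED);
    protoCodes.put(ProtocolStatus.NOTFOUND, HttpServletResponse.SC_NOT_FOUND);
    protoCodes.put(ProtocolStatus.GONE, HttpServletResponse.SC_GONE);
    protoCodes.put(ProtocolStatus.MOVED, HttpServletResponse.SC_MOVED_PERMANENTLY);
    protoCodes.put(ProtocolStatus.TEMP_MOVED, HttpServletResponse.SC_MOVED_TEMPORARILY);
    protoCodes.put(ProtocolStatus.ACCESS_DENIED, HttpServletResponse.SC_FORBIDDEN);
    protoCodes.put(ProtocolStatus.ROBOTS_DENIED, HttpServletResponse.SC_FORBIDDEN);
    protoCodes.put(ProtocolStatus.EXCEPTION, HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
  }
}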
Use of org.apache.nutch.protocol.ProtocolStatus in project nutch by apache: class CrawlDatum, method execute().
public boolean execute(JexlScript expr, String url) {
  if (expr != null && url != null) {
    // Create a context and add data
    JexlContext jcontext = new MapContext();
    // https://issues.apache.org/jira/browse/NUTCH-2229
    jcontext.set("url", url);
    jcontext.set("status", getStatusName(getStatus()));
    jcontext.set("fetchTime", (long) (getFetchTime()));
    jcontext.set("modifiedTime", (long) (getModifiedTime()));
    jcontext.set("retries", getRetriesSinceFetch());
    jcontext.set("interval", Integer.valueOf(getFetchInterval()));
    jcontext.set("score", getScore());
    jcontext.set("signature", StringUtil.toHexString(getSignature()));
    // Set metadata variables
    for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
      Object value = entry.getValue();
      Text tkey = (Text) entry.getKey();
      if (value instanceof FloatWritable) {
        FloatWritable fvalue = (FloatWritable) value;
        jcontext.set(tkey.toString(), fvalue.get());
      }
      if (value instanceof IntWritable) {
        IntWritable ivalue = (IntWritable) value;
        jcontext.set(tkey.toString(), ivalue.get());
      }
      if (value instanceof Text) {
        Text tvalue = (Text) value;
        jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString());
      }
      if (value instanceof ProtocolStatus) {
        ProtocolStatus pvalue = (ProtocolStatus) value;
        jcontext.set(tkey.toString().replace("-", "_"), pvalue.toString());
      }
    }
    try {
      if (Boolean.TRUE.equals(expr.execute(jcontext))) {
        return true;
      }
    } catch (Exception e) {
      // Treat script errors as a non-match and fall through to false
    }
  }
  return false;
}
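A minimal caller sketch (again not part of the Nutch sources) for the JexlScript variant, assuming Commons JEXL 3 (org.apache.commons.jexl3), which matches the expr.execute(jcontext) call above; the class name and script text are illustrative only.

import org.apache.commons.jexl3.JexlBuilder;
import org.apache.commons.jexl3.JexlEngine;
import org.apache.commons.jexl3.JexlScript;
import org.apache.nutch.crawl.CrawlDatum;

// Hypothetical caller, not part of Nutch itself.
public class CrawlDatumScriptSketch {
  public static boolean keep(CrawlDatum datum, String url) {
    JexlEngine jexl = new JexlBuilder().silent(true).strict(true).create();
    // The script can use the same variables the method binds above; here:
    // keep anything unfetched or retried fewer than three times.
    JexlScript script = jexl.createScript("status == 'db_unfetched' || retries < 3");
    return datum.execute(script, url);
  }
}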