Example 1 with ProtocolOutput

Use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

From the class TestProtocolHttp, method fetchPage:

/**
 * Fetches the specified <code>page</code> from the local Jetty server and
 * checks whether the HTTP response status code matches the expected code.
 * JSP pages are used to test redirection.
 *
 * @param page
 *          Page to be fetched.
 * @param expectedCode
 *          HTTP response status code expected while fetching the page.
 */
private void fetchPage(String page, int expectedCode) throws Exception {
    URL url = new URL("http", "127.0.0.1", port, page);
    CrawlDatum crawlDatum = new CrawlDatum();
    Response response = http.getResponse(url, crawlDatum, true);
    ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()), crawlDatum);
    Content content = out.getContent();
    assertEquals("HTTP Status Code for " + url, expectedCode, response.getCode());
    if (page.compareTo("/nonexists.html") != 0 && page.compareTo("/brokenpage.jsp") != 0 && page.compareTo("/redirection") != 0) {
        assertEquals("ContentType " + url, "text/html", content.getContentType());
    }
}
Also used: Response(org.apache.nutch.net.protocols.Response) ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) URL(java.net.URL)
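
A ProtocolOutput also carries a ProtocolStatus alongside the Content. A minimal sketch of a stricter assertion (hypothetical helper name; assumes the same http and port test fixtures as above):

private void assertFetchSucceeded(String page) throws Exception {
    URL url = new URL("http", "127.0.0.1", port, page);
    ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()), new CrawlDatum());
    // ProtocolOutput bundles the fetched Content with a ProtocolStatus
    assertEquals("ProtocolStatus for " + url, ProtocolStatus.SUCCESS, out.getStatus().getCode());
    assertNotNull("Content for " + url, out.getContent());
}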

Example 2 with ProtocolOutput

Use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

From the class File, method main:

/**
 * A quick way to run this class; useful for debugging.
 */
public static void main(String[] args) throws Exception {
    int maxContentLength = Integer.MIN_VALUE;
    boolean dumpContent = false;
    String urlString = null;
    String usage = "Usage: File [-maxContentLength L] [-dumpContent] url";
    if (args.length == 0) {
        System.err.println(usage);
        System.exit(-1);
    }
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-maxContentLength")) {
            maxContentLength = Integer.parseInt(args[++i]);
        } else if (args[i].equals("-dumpContent")) {
            dumpContent = true;
        } else if (i != args.length - 1) {
            System.err.println(usage);
            System.exit(-1);
        } else
            urlString = args[i];
    }
    File file = new File();
    file.setConf(NutchConfiguration.create());
    // set maxContentLength
    if (maxContentLength != Integer.MIN_VALUE)
        file.setMaxContentLength(maxContentLength);
    // set log level
    // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
    ProtocolOutput output = file.getProtocolOutput(new Text(urlString), new CrawlDatum());
    Content content = output.getContent();
    System.err.println("URL: " + content.getUrl());
    System.err.println("Status: " + output.getStatus());
    System.err.println("Content-Type: " + content.getContentType());
    System.err.println("Content-Length: " + content.getMetadata().get(Response.CONTENT_LENGTH));
    System.err.println("Last-Modified: " + content.getMetadata().get(Response.LAST_MODIFIED));
    String redirectLocation = content.getMetadata().get("Location");
    if (redirectLocation != null) {
        System.err.println("Location: " + redirectLocation);
    }
    if (dumpContent) {
        System.out.print(new String(content.getContent()));
    }
    file = null;
}
Also used: ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text)
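
The tool prints a fixed set of headers from the Content metadata. A sketch that dumps every metadata entry instead (assuming Nutch's Metadata.names() accessor):

// sketch: print all protocol metadata rather than cherry-picking headers
for (String name : content.getMetadata().names()) {
    System.err.println(name + ": " + content.getMetadata().get(name));
}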

Example 3 with ProtocolOutput

Use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

From the class FtpRobotRulesParser, method getRobotRulesSet:

/**
 * For hosts whose robots rules are not yet cached, sends an FTP request to
 * the host corresponding to the {@link URL} passed, fetches the robots.txt
 * file, parses the rules, and caches the resulting rules object to avoid
 * re-work in the future.
 *
 * @param ftp
 *          The {@link Protocol} object
 * @param url
 *          URL
 * @param robotsTxtContent
 *          container to store responses when fetching the robots.txt file for
 *          debugging or archival purposes. Instead of a robots.txt file, it
 *          may include redirects or an error page (404, etc.). Response
 *          {@link Content} is appended to the passed list. If null is passed
 *          nothing is stored.
 *
 * @return a {@link BaseRobotRules} object holding the parsed rules
 */
@Override
public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url, List<Content> robotsTxtContent) {
    // normalize protocol and host to lower case
    String protocol = url.getProtocol().toLowerCase();
    String host = url.getHost().toLowerCase();
    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
        LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
    }
    BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
    if (robotRules != null) {
        // cached rule
        return robotRules;
    } else if (LOG.isTraceEnabled()) {
        LOG.trace("cache miss " + url);
    }
    boolean cacheRule = true;
    if (isWhiteListed(url)) {
        // check in advance whether a host is whitelisted
        // (we do not need to fetch robots.txt)
        robotRules = EMPTY_RULES;
        LOG.info("Whitelisted host found for: {}", url);
        LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
    } else {
        try {
            Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
            ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl, new CrawlDatum());
            ProtocolStatus status = output.getStatus();
            if (robotsTxtContent != null) {
                robotsTxtContent.add(output.getContent());
            }
            if (status.getCode() == ProtocolStatus.SUCCESS) {
                robotRules = parseRules(url.toString(), output.getContent().getContent(), CONTENT_TYPE, agentNames);
            } else {
                // use default rules
                robotRules = EMPTY_RULES;
            }
        } catch (Throwable t) {
            if (LOG.isInfoEnabled()) {
                LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
            }
            // try again later to fetch robots.txt
            cacheRule = false;
            robotRules = EMPTY_RULES;
        }
    }
    if (cacheRule)
        // cache rules for host
        CACHE.put(protocol + ":" + host, robotRules);
    return robotRules;
}
Also used: ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) BaseRobotRules(crawlercommons.robots.BaseRobotRules) URL(java.net.URL) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)
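
CACHE is keyed by protocol plus host, so all paths on a host share one rules object. A simplified sketch of the pattern (the real cache field is inherited from RobotRulesParser; the concrete map type here is an assumption):

// simplified sketch of the protocol:host keyed robots rules cache
// (uses java.util.Map and java.util.concurrent.ConcurrentHashMap)
private static final Map<String, BaseRobotRules> CACHE = new ConcurrentHashMap<>();

private static String cacheKey(URL url) {
    // lower-case both parts so differently cased URLs share one entry
    return url.getProtocol().toLowerCase() + ":" + url.getHost().toLowerCase();
}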

Example 4 with ProtocolOutput

Use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

From the class FetcherThread, method run:

@SuppressWarnings("fallthrough")
public void run() {
    // count threads
    activeThreads.incrementAndGet();
    Text url = new Text();
    FetchItem fit = null;
    try {
        // checking for the server to be running and fetcher.parse to be true
        if (parsing && NutchServer.getInstance().isRunning())
            reportToNutchServer = true;
        while (true) {
            // creating FetchNode for storing in FetchNodeDb
            if (reportToNutchServer)
                this.fetchNode = new FetchNode();
            else
                this.fetchNode = null;
            // check whether must be stopped
            if (isHalted()) {
                LOG.debug(getName() + " set to halted");
                fit = null;
                return;
            }
            fit = ((FetchItemQueues) fetchQueues).getFetchItem();
            if (fit != null) {
                URL u = fit.u;
                String temp_url = u.getProtocol() + "://" + u.getHost() + ":" + u.getPort() + u.getFile();
                url = new Text(temp_url);
            }
            if (fit == null) {
                if (feeder.isAlive() || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) {
                    LOG.debug(getName() + " spin-waiting ...");
                    // spin-wait.
                    ((AtomicInteger) spinWaiting).incrementAndGet();
                    try {
                        Thread.sleep(500);
                    } catch (Exception e) {
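                        // sleep interrupted: safe to ignore during spin-wait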
                    }
                    ((AtomicInteger) spinWaiting).decrementAndGet();
                    continue;
                } else {
                    // all done, finish this thread
                    LOG.info(getName() + " " + Thread.currentThread().getId() + " has no more work available");
                    return;
                }
            }
            lastRequestStart.set(System.currentTimeMillis());
            Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
            if (reprUrlWritable == null) {
                setReprUrl(url.toString());
            } else {
                setReprUrl(reprUrlWritable.toString());
            }
            try {
                // fetch the page
                redirecting = false;
                redirectCount = 0;
                // Publisher event
                if (activatePublisher) {
                    FetcherThreadEvent startEvent = new FetcherThreadEvent(PublishEventType.START, fit.getUrl().toString());
                    publisher.publish(startEvent, conf);
                }
                do {
                    if (LOG.isInfoEnabled()) {
                        LOG.info(getName() + " " + Thread.currentThread().getId() + " fetching " + url + " (queue crawl delay=" + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay + "ms)");
                    }
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("redirectCount=" + redirectCount);
                    }
                    redirecting = false;
                    Protocol protocol = this.protocolFactory.getProtocol(url.toString());
                    BaseRobotRules rules = protocol.getRobotRules(url, fit.datum, robotsTxtContent);
                    if (robotsTxtContent != null) {
                        outputRobotsTxt(robotsTxtContent);
                        robotsTxtContent.clear();
                    }
                    if (!rules.isAllowed(fit.u.toString())) {
                        // unblock
                        ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Denied by robots.txt: " + url);
                        }
                        output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                        context.getCounter("FetcherStatus", "robots_denied").increment(1);
                        continue;
                    }
                    if (rules.getCrawlDelay() > 0) {
                        if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                            // unblock
                            ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
                            LOG.debug("Crawl-Delay for " + url + " too long (" + rules.getCrawlDelay() + "), skipping");
                            output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                            context.getCounter("FetcherStatus", "robots_denied_maxcrawldelay").increment(1);
                            continue;
                        } else {
                            FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
                            fiq.crawlDelay = rules.getCrawlDelay();
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + url);
                            }
                        }
                    }
                    ProtocolOutput output = protocol.getProtocolOutput(url, fit.datum);
                    ProtocolStatus status = output.getStatus();
                    Content content = output.getContent();
                    ParseStatus pstatus = null;
                    // unblock queue
                    ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                    String urlString = url.toString();
                    // used for FetchNode
                    if (fetchNode != null) {
                        fetchNode.setStatus(status.getCode());
                        fetchNode.setFetchTime(System.currentTimeMillis());
                        fetchNode.setUrl(url);
                    }
                    // Publish fetch finish event
                    if (activatePublisher) {
                        FetcherThreadEvent endEvent = new FetcherThreadEvent(PublishEventType.END, fit.getUrl().toString());
                        endEvent.addEventData("status", status.getName());
                        publisher.publish(endEvent, conf);
                    }
                    context.getCounter("FetcherStatus", status.getName()).increment(1);
                    switch(status.getCode()) {
                        case ProtocolStatus.WOULDBLOCK:
                            // retry ?
                            ((FetchItemQueues) fetchQueues).addFetchItem(fit);
                            break;
                        // got a page
                        case ProtocolStatus.SUCCESS:
                            pstatus = output(url, fit.datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
                            updateStatus(content.getContent().length);
                            if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                                String newUrl = pstatus.getMessage();
                                int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                                Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR);
                                if (redirUrl != null) {
                                    fit = queueRedirect(redirUrl, fit);
                                }
                            }
                            break;
                        // redirect
                        case ProtocolStatus.MOVED:
                        case ProtocolStatus.TEMP_MOVED:
                            int code;
                            boolean temp;
                            if (status.getCode() == ProtocolStatus.MOVED) {
                                code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                                temp = false;
                            } else {
                                code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                                temp = true;
                            }
                            output(url, fit.datum, content, status, code);
                            String newUrl = status.getMessage();
                            Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR);
                            if (redirUrl != null) {
                                fit = queueRedirect(redirUrl, fit);
                            } else {
                                // stop redirecting
                                redirecting = false;
                            }
                            break;
                        case ProtocolStatus.EXCEPTION:
                            logError(url, status.getMessage());
                            int killedURLs = ((FetchItemQueues) fetchQueues).checkExceptionThreshold(fit.getQueueID());
                            if (killedURLs != 0)
                                context.getCounter("FetcherStatus", "AboveExceptionThresholdInQueue").increment(killedURLs);
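                        // intentional fall-through: EXCEPTION is also retried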
                        // retry
                        case ProtocolStatus.RETRY:
                        case ProtocolStatus.BLOCKED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                            break;
                        // gone
                        case ProtocolStatus.GONE:
                        case ProtocolStatus.NOTFOUND:
                        case ProtocolStatus.ACCESS_DENIED:
                        case ProtocolStatus.ROBOTS_DENIED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
                            break;
                        case ProtocolStatus.NOTMODIFIED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
                            break;
                        default:
                            if (LOG.isWarnEnabled()) {
                                LOG.warn(getName() + " " + Thread.currentThread().getId() + " Unknown ProtocolStatus: " + status.getCode());
                            }
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                    }
                    if (redirecting && redirectCount > maxRedirect) {
                        ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                        if (LOG.isInfoEnabled()) {
                            LOG.info(getName() + " " + Thread.currentThread().getId() + "  - redirect count exceeded " + url);
                        }
                        output(url, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, CrawlDatum.STATUS_FETCH_GONE);
                    }
                } while (redirecting && (redirectCount <= maxRedirect));
            } catch (Throwable t) {
                // unexpected exception
                // unblock
                ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                logError(url, StringUtils.stringifyException(t));
                output(url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY);
            }
        }
    } catch (Throwable e) {
        if (LOG.isErrorEnabled()) {
            LOG.error("fetcher caught:" + e.toString());
        }
    } finally {
        if (fit != null)
            ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
        // count threads
        activeThreads.decrementAndGet();
        LOG.info(getName() + " " + Thread.currentThread().getId() + " -finishing thread " + getName() + ", activeThreads=" + activeThreads);
    }
}
Also used: ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) URL(java.net.URL) ScoringFilterException(org.apache.nutch.scoring.ScoringFilterException) URLFilterException(org.apache.nutch.net.URLFilterException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) ParseStatus(org.apache.nutch.parse.ParseStatus) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Content(org.apache.nutch.protocol.Content) Protocol(org.apache.nutch.protocol.Protocol) BaseRobotRules(crawlercommons.robots.BaseRobotRules) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)
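
The switch above maps each ProtocolStatus code to a CrawlDatum fetch status before calling output(). A condensed sketch of that mapping with the same constants (illustrative only; redirect handling, counters, and the WOULDBLOCK re-queue are omitted):

// condensed sketch of the status mapping in FetcherThread.run
static byte toFetchStatus(int protocolStatusCode) {
    switch (protocolStatusCode) {
        case ProtocolStatus.SUCCESS:
            return CrawlDatum.STATUS_FETCH_SUCCESS;
        case ProtocolStatus.MOVED:
            return CrawlDatum.STATUS_FETCH_REDIR_PERM;
        case ProtocolStatus.TEMP_MOVED:
            return CrawlDatum.STATUS_FETCH_REDIR_TEMP;
        case ProtocolStatus.EXCEPTION:
        case ProtocolStatus.RETRY:
        case ProtocolStatus.BLOCKED:
            return CrawlDatum.STATUS_FETCH_RETRY;
        case ProtocolStatus.GONE:
        case ProtocolStatus.NOTFOUND:
        case ProtocolStatus.ACCESS_DENIED:
        case ProtocolStatus.ROBOTS_DENIED:
            return CrawlDatum.STATUS_FETCH_GONE;
        case ProtocolStatus.NOTMODIFIED:
            return CrawlDatum.STATUS_FETCH_NOTMODIFIED;
        default:
            // unknown codes are retried, mirroring the default branch above
            return CrawlDatum.STATUS_FETCH_RETRY;
    }
}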

Example 5 with ProtocolOutput

Use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.

From the class ParserChecker, method run:

public int run(String[] args) throws Exception {
    boolean dumpText = false;
    boolean force = false;
    String contentType = null;
    String url = null;
    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";
    if (args.length == 0) {
        LOG.error(usage);
        return (-1);
    }
    // used to simulate the metadata propagated from injection
    HashMap<String, String> metadata = new HashMap<>();
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-forceAs")) {
            force = true;
            contentType = args[++i];
        } else if (args[i].equals("-dumpText")) {
            dumpText = true;
        } else if (args[i].equals("-md")) {
            String k = null, v = null;
            String nextOne = args[++i];
            int firstEquals = nextOne.indexOf("=");
            if (firstEquals != -1) {
                k = nextOne.substring(0, firstEquals);
                v = nextOne.substring(firstEquals + 1);
            } else
                k = nextOne;
            metadata.put(k, v);
        } else if (i != args.length - 1) {
            LOG.error(usage);
            System.exit(-1);
        } else {
            url = URLUtil.toASCII(args[i]);
        }
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("fetching: " + url);
    }
    CrawlDatum cd = new CrawlDatum();
    Iterator<String> iter = metadata.keySet().iterator();
    while (iter.hasNext()) {
        String key = iter.next();
        String value = metadata.get(key);
        if (value == null)
            value = "";
        cd.getMetaData().put(new Text(key), new Text(value));
    }
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Text turl = new Text(url);
    ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
    // if the configuration permits, handle redirects until we either run
    // out of allowed redirects or we stop getting redirect statuses.
    int maxRedirects = conf.getInt("http.redirect.max", 0);
    int numRedirects = 0;
    while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
        String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]);
        LOG.info("Handling redirect to " + newURL);
        protocol = factory.getProtocol(newURL);
        turl = new Text(newURL);
        output = protocol.getProtocolOutput(turl, cd);
        numRedirects++;
    }
    if (!output.getStatus().isSuccess()) {
        System.err.println("Fetch failed with protocol status: " + output.getStatus());
        if (output.getStatus().isRedirect()) {
            System.err.println("Redirect(s) not handled due to configuration.");
            System.err.println("Max Redirects to handle per config: " + maxRedirects);
            System.err.println("Number of Redirects handled: " + numRedirects);
        }
        return (-1);
    }
    Content content = output.getContent();
    if (content == null) {
        LOG.error("No content for " + url);
        return (-1);
    }
    if (force) {
        content.setContentType(contentType);
    } else {
        contentType = content.getContentType();
    }
    if (contentType == null) {
        LOG.error("Failed to determine content type!");
        return (-1);
    }
    if (ParseSegment.isTruncated(content)) {
        LOG.warn("Content is truncated, parse may fail!");
    }
    ScoringFilters scfilters = new ScoringFilters(conf);
    // call the scoring filters
    try {
        scfilters.passScoreBeforeParsing(turl, cd, content);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e + ")");
            LOG.warn(StringUtils.stringifyException(e));
        }
    }
    ParseResult parseResult = new ParseUtil(conf).parse(content);
    if (parseResult == null) {
        LOG.error("Parsing content failed!");
        return (-1);
    }
    // calculate the signature
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parseResult.get(new Text(url)));
    if (LOG.isInfoEnabled()) {
        LOG.info("parsing: " + url);
        LOG.info("contentType: " + contentType);
        LOG.info("signature: " + StringUtil.toHexString(signature));
    }
    Parse parse = parseResult.get(turl);
    if (parse == null) {
        LOG.error("Failed to get parse from parse result");
        LOG.error("Available parses in parse result (by URL key):");
        for (Map.Entry<Text, Parse> entry : parseResult) {
            LOG.error("  " + entry.getKey());
        }
        LOG.error("Parse result does not contain a parse for URL to be checked:");
        LOG.error("  " + turl);
        return -1;
    }
    // call the scoring filters
    try {
        scfilters.passScoreAfterParsing(turl, content, parse);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e + ")");
            LOG.warn(StringUtils.stringifyException(e));
        }
    }
    for (Map.Entry<Text, Parse> entry : parseResult) {
        parse = entry.getValue();
        LOG.info("---------\nUrl\n---------------\n");
        System.out.print(entry.getKey());
        LOG.info("\n---------\nParseData\n---------\n");
        System.out.print(parse.getData().toString());
        if (dumpText) {
            LOG.info("---------\nParseText\n---------\n");
            System.out.print(parse.getText());
        }
    }
    return 0;
}
Also used: ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) HashMap(java.util.HashMap) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) Content(org.apache.nutch.protocol.Content) ScoringFilters(org.apache.nutch.scoring.ScoringFilters) Protocol(org.apache.nutch.protocol.Protocol) HashMap(java.util.HashMap) Map(java.util.Map)
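
The redirect loop in this example is a reusable pattern for chasing protocol-level redirects with ProtocolOutput. The same logic as a standalone helper (hypothetical name; same Nutch APIs as above):

// hypothetical helper mirroring the redirect handling in ParserChecker.run
static ProtocolOutput fetchFollowingRedirects(ProtocolFactory factory, String url,
        CrawlDatum datum, int maxRedirects) throws Exception {
    ProtocolOutput output = factory.getProtocol(url).getProtocolOutput(new Text(url), datum);
    int numRedirects = 0;
    while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
        // the redirect target is the first argument of the ProtocolStatus
        url = URLUtil.toASCII(output.getStatus().getArgs()[0]);
        output = factory.getProtocol(url).getProtocolOutput(new Text(url), datum);
        numRedirects++;
    }
    return output;
}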

Aggregations

ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput) 12
Text (org.apache.hadoop.io.Text) 11
Content (org.apache.nutch.protocol.Content) 7
URL (java.net.URL) 6
CrawlDatum (org.apache.nutch.crawl.CrawlDatum) 6
ProtocolStatus (org.apache.nutch.protocol.ProtocolStatus) 5
Protocol (org.apache.nutch.protocol.Protocol) 4
ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory) 3
BaseRobotRules (crawlercommons.robots.BaseRobotRules) 2
IOException (java.io.IOException) 2
MalformedURLException (java.net.MalformedURLException) 2
HashMap (java.util.HashMap) 2
Map (java.util.Map) 2
Response (org.apache.nutch.net.protocols.Response) 2
ScoringFilters (org.apache.nutch.scoring.ScoringFilters) 2
AtomicInteger (java.util.concurrent.atomic.AtomicInteger) 1
IntWritable (org.apache.hadoop.io.IntWritable) 1
Inlinks (org.apache.nutch.crawl.Inlinks) 1
URLFilterException (org.apache.nutch.net.URLFilterException) 1
Parse (org.apache.nutch.parse.Parse) 1