Example 11 with ParseStatus

Use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

The class JSParseFilter, method filter.

public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags metaTags, DocumentFragment doc) {
    Parse parse = parseResult.get(content.getUrl());
    String url = content.getBaseUrl();
    ArrayList<Outlink> outlinks = new ArrayList<>();
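    // recursively walk the DOM, collecting outlinks found in JavaScript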
    walk(doc, parse, metaTags, url, outlinks);
    if (outlinks.size() > 0) {
        Outlink[] old = parse.getData().getOutlinks();
        String title = parse.getData().getTitle();
        List<Outlink> list = Arrays.asList(old);
        outlinks.addAll(list);
        ParseStatus status = parse.getData().getStatus();
        String text = parse.getText();
        Outlink[] newlinks = outlinks.toArray(new Outlink[0]);
        ParseData parseData = new ParseData(status, title, newlinks, parse.getData().getContentMeta(), parse.getData().getParseMeta());
        // replace original parse obj with new one
        parseResult.put(content.getUrl(), new ParseText(text), parseData);
    }
    return parseResult;
}
Also used: Outlink (org.apache.nutch.parse.Outlink), ParseStatus (org.apache.nutch.parse.ParseStatus), ParseData (org.apache.nutch.parse.ParseData), Parse (org.apache.nutch.parse.Parse), ArrayList (java.util.ArrayList), ParseText (org.apache.nutch.parse.ParseText)
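
The filter does not mutate the existing ParseData in place; it rebuilds one around the merged outlink set and swaps it into the ParseResult under the same URL. A minimal sketch of that merge-and-replace step as a standalone helper (class and method names here are illustrative, not part of Nutch):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;

public class OutlinkMergeSketch {

    /** Return a copy of data whose outlinks also include the extra links. */
    static ParseData withExtraOutlinks(ParseData data, List<Outlink> extra) {
        List<Outlink> merged = new ArrayList<>(extra);
        merged.addAll(Arrays.asList(data.getOutlinks()));
        return new ParseData(data.getStatus(), data.getTitle(),
                merged.toArray(new Outlink[0]),
                data.getContentMeta(), data.getParseMeta());
    }
}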

Example 12 with ParseStatus

Use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

The class TestIndexingFilters, method testNonExistingIndexingFilter.

/**
 * Test behaviour when a defined filter does not exist.
 *
 * @throws IndexingException
 */
@Test
public void testNonExistingIndexingFilter() throws IndexingException {
    Configuration conf = NutchConfiguration.create();
    conf.addResource("nutch-default.xml");
    conf.addResource("crawl-tests.xml");
    String class1 = "NonExistingFilter";
    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new NutchDocument(),
        new ParseImpl("text",
            new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())),
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
}
Also used: ParseStatus (org.apache.nutch.parse.ParseStatus), NutchConfiguration (org.apache.nutch.util.NutchConfiguration), Configuration (org.apache.hadoop.conf.Configuration), ParseData (org.apache.nutch.parse.ParseData), Metadata (org.apache.nutch.metadata.Metadata), ParseImpl (org.apache.nutch.parse.ParseImpl), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), Inlinks (org.apache.nutch.crawl.Inlinks), Test (org.junit.Test)
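
The test passes only if IndexingFilters skips a class name it cannot resolve instead of failing; a variant of the same call that makes that expectation explicit (the assertion is an assumption about the intended behaviour, not part of the original test):

NutchDocument doc = filters.filter(new NutchDocument(),
        new ParseImpl("text", new ParseData(new ParseStatus(), "title",
                new Outlink[0], new Metadata())),
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
// the unknown filter should simply be skipped, so a document still comes back
Assert.assertNotNull(doc);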

Example 13 with ParseStatus

Use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

The class FetcherThread, method output.

private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status, int outlinkDepth) throws InterruptedException {
    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null)
        datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
    ParseResult parseResult = null;
    if (content != null) {
        Metadata metadata = content.getMetadata();
        // store the guessed content type in the crawldatum
        if (content.getContentType() != null)
            datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));
        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
        try {
            scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
            }
        }
        /*
         * Note: Fetcher will only follow meta-redirects coming from the
         * original URL.
         */
        if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
            if (!skipTruncated || !ParseSegment.isTruncated(content)) {
                try {
                    parseResult = this.parseUtil.parse(content);
                } catch (Exception e) {
                    LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + StringUtils.stringifyException(e));
                }
            }
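            // parsing failed or was skipped for truncated content: fall back to
            // a signature computed from the raw content and an empty parse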
            if (parseResult == null) {
                byte[] signature = SignatureFactory.getSignature(conf).calculate(content, new ParseStatus().getEmptyParse(conf));
                datum.setSignature(signature);
            }
        }
        /*
         * Store status code in content so we can read this value during
         * parsing (as a separate job) and decide to parse or not.
         */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
    }
    try {
        context.write(key, new NutchWritable(datum));
        if (content != null && storingContent)
            context.write(key, new NutchWritable(content));
        if (parseResult != null) {
            for (Entry<Text, Parse> entry : parseResult) {
                Text url = entry.getKey();
                Parse parse = entry.getValue();
                ParseStatus parseStatus = parse.getData().getStatus();
                ParseData parseData = parse.getData();
                if (!parseStatus.isSuccess()) {
                    LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + parseStatus);
                    parse = parseStatus.getEmptyParse(conf);
                }
                // Calculate page signature. For non-parsing fetchers this will
                // be done in ParseSegment
                byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
                // Ensure segment name and score are in parseData metadata
                parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
                parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
                // Pass fetch time to content meta
                parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
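                // store the signature on the datum only for the original URL, not
                // for subdocuments that parse plugins added to the ParseResult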
                if (url.equals(key))
                    datum.setSignature(signature);
                try {
                    scfilters.passScoreAfterParsing(url, content, parse);
                } catch (Exception e) {
                    if (LOG.isWarnEnabled()) {
                        LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
                    }
                }
                String origin = null;
                // collect outlinks for subsequent db update
                Outlink[] links = parseData.getOutlinks();
                int outlinksToStore = Math.min(maxOutlinks, links.length);
                if (ignoreExternalLinks || ignoreInternalLinks) {
                    URL originURL = new URL(url.toString());
                    // based on domain?
                    if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
                        origin = URLUtil.getDomainName(originURL).toLowerCase();
                    } else {
                        // use host
                        origin = originURL.getHost().toLowerCase();
                    }
                }
                // used by fetchNode
                if (fetchNode != null) {
                    fetchNode.setOutlinks(links);
                    fetchNode.setTitle(parseData.getTitle());
                    FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
                }
                int validCount = 0;
                // Process all outlinks, normalize, filter and deduplicate
                List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
                HashSet<String> outlinks = new HashSet<>(outlinksToStore);
                for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                    String toUrl = links[i].getToUrl();
                    toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFiltersForOutlinks, urlExemptionFilters, normalizersForOutlinks);
                    if (toUrl == null) {
                        continue;
                    }
                    validCount++;
                    links[i].setUrl(toUrl);
                    outlinkList.add(links[i]);
                    outlinks.add(toUrl);
                }
                // Publish fetch report event
                if (activatePublisher) {
                    FetcherThreadEvent reportEvent = new FetcherThreadEvent(PublishEventType.REPORT, url.toString());
                    reportEvent.addOutlinksToEventData(outlinkList);
                    reportEvent.addEventData(Nutch.FETCH_EVENT_TITLE, parseData.getTitle());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTTYPE, parseData.getContentMeta().get("content-type"));
                    reportEvent.addEventData(Nutch.FETCH_EVENT_SCORE, datum.getScore());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_FETCHTIME, datum.getFetchTime());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTLANG, parseData.getContentMeta().get("content-language"));
                    publisher.publish(reportEvent, conf);
                }
                // Only process depth N outlinks
                if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
                    FetchItem ft = FetchItem.create(url, null, queueMode);
                    FetchItemQueue queue = ((FetchItemQueues) fetchQueues).getFetchItemQueue(ft.queueID);
                    queue.alreadyFetched.add(url.toString().hashCode());
                    context.getCounter("FetcherOutlinks", "outlinks_detected").increment(outlinks.size());
                    // Counter to limit num outlinks to follow per page
                    int outlinkCounter = 0;
                    // Calculate a variable number of outlinks by depth using the
                    // divisor: maxOutlinksByDepth = floor(divisor / (depth + 1) * maxLinks)
                    int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);
                    String followUrl;
                    // Walk over the outlinks and add as new FetchItem to the queues
                    Iterator<String> iter = outlinks.iterator();
                    while (iter.hasNext() && outlinkCounter < maxOutlinksByDepth) {
                        followUrl = iter.next();
                        // Check whether we'll follow external outlinks
                        if (outlinksIgnoreExternal) {
                            if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
                                continue;
                            }
                        }
                        // Already followed?
                        int urlHashCode = followUrl.hashCode();
                        if (queue.alreadyFetched.contains(urlHashCode)) {
                            continue;
                        }
                        queue.alreadyFetched.add(urlHashCode);
                        // Create new FetchItem with depth incremented
                        FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
                        context.getCounter("FetcherOutlinks", "outlinks_following").increment(1);
                        ((FetchItemQueues) fetchQueues).addFetchItem(fit);
                        outlinkCounter++;
                    }
                }
                // Overwrite the outlinks in ParseData with the normalized and
                // filtered set
                parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));
                context.write(url, new NutchWritable(new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
            }
        }
    } catch (IOException e) {
        if (LOG.isErrorEnabled()) {
            LOG.error("fetcher caught:" + e.toString());
        }
    }
    // return parse status if it exists
    if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
            context.getCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()]).increment(1);
            return p.getData().getStatus();
        }
    }
    return null;
}
Also used: Parse (org.apache.nutch.parse.Parse), Metadata (org.apache.nutch.metadata.Metadata), ArrayList (java.util.ArrayList), URL (java.net.URL), ParseText (org.apache.nutch.parse.ParseText), ParseStatus (org.apache.nutch.parse.ParseStatus), HashSet (java.util.HashSet), Outlink (org.apache.nutch.parse.Outlink), ParseResult (org.apache.nutch.parse.ParseResult), NutchWritable (org.apache.nutch.crawl.NutchWritable), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), IOException (java.io.IOException), ScoringFilterException (org.apache.nutch.scoring.ScoringFilterException), URLFilterException (org.apache.nutch.net.URLFilterException), MalformedURLException (java.net.MalformedURLException), ParseData (org.apache.nutch.parse.ParseData), ParseImpl (org.apache.nutch.parse.ParseImpl)
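
Both branches above funnel into SignatureFactory, which can be exercised on its own; a minimal sketch assuming only a fetched Content and a Parse are at hand (the helper name is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.StringUtil;

public class SignatureSketch {

    /** Compute the hex page signature the same way output() does above. */
    static String hexSignature(Content content, Parse parse) {
        Configuration conf = NutchConfiguration.create();
        byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
        return StringUtil.toHexString(signature);
    }
}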

Example 14 with ParseStatus

Use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

The class FetcherThread, method run.

@SuppressWarnings("fallthrough")
public void run() {
    // count threads
    activeThreads.incrementAndGet();
    Text url = new Text();
    FetchItem fit = null;
    try {
        // checking for the server to be running and fetcher.parse to be true
        if (parsing && NutchServer.getInstance().isRunning())
            reportToNutchServer = true;
        while (true) {
            // creating FetchNode for storing in FetchNodeDb
            if (reportToNutchServer)
                this.fetchNode = new FetchNode();
            else
                this.fetchNode = null;
            // check whether must be stopped
            if (isHalted()) {
                LOG.debug(getName() + " set to halted");
                fit = null;
                return;
            }
            fit = ((FetchItemQueues) fetchQueues).getFetchItem();
            if (fit != null) {
                URL u = fit.u;
                String tempUrl = u.getProtocol() + "://" + u.getHost() + ":" + u.getPort() + u.getFile();
                url = new Text(tempUrl);
            }
            if (fit == null) {
                if (feeder.isAlive() || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) {
                    LOG.debug(getName() + " spin-waiting ...");
                    // spin-wait.
                    ((AtomicInteger) spinWaiting).incrementAndGet();
                    try {
                        Thread.sleep(500);
                    } catch (Exception e) {
                        // ignore interrupts while spin-waiting
                    }
                    ((AtomicInteger) spinWaiting).decrementAndGet();
                    continue;
                } else {
                    // all done, finish this thread
                    LOG.info(getName() + " " + Thread.currentThread().getId() + " has no more work available");
                    return;
                }
            }
            lastRequestStart.set(System.currentTimeMillis());
            Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
            if (reprUrlWritable == null) {
                setReprUrl(url.toString());
            } else {
                setReprUrl(reprUrlWritable.toString());
            }
            try {
                // fetch the page
                redirecting = false;
                redirectCount = 0;
                // Publisher event
                if (activatePublisher) {
                    FetcherThreadEvent startEvent = new FetcherThreadEvent(PublishEventType.START, fit.getUrl().toString());
                    publisher.publish(startEvent, conf);
                }
                do {
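                    // redirect loop: refetch until the page stops redirecting or
                    // redirectCount exceeds maxRedirect (see the while condition below)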
                    if (LOG.isInfoEnabled()) {
                        LOG.info(getName() + " " + Thread.currentThread().getId() + " fetching " + url + " (queue crawl delay=" + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay + "ms)");
                    }
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("redirectCount=" + redirectCount);
                    }
                    redirecting = false;
                    Protocol protocol = this.protocolFactory.getProtocol(url.toString());
                    BaseRobotRules rules = protocol.getRobotRules(url, fit.datum, robotsTxtContent);
                    if (robotsTxtContent != null) {
                        outputRobotsTxt(robotsTxtContent);
                        robotsTxtContent.clear();
                    }
                    if (!rules.isAllowed(fit.u.toString())) {
                        // unblock
                        ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Denied by robots.txt: " + url);
                        }
                        output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                        context.getCounter("FetcherStatus", "robots_denied").increment(1);
                        continue;
                    }
                    if (rules.getCrawlDelay() > 0) {
                        if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
                            // unblock
                            ((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
                            LOG.debug("Crawl-Delay for " + url + " too long (" + rules.getCrawlDelay() + "), skipping");
                            output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
                            context.getCounter("FetcherStatus", "robots_denied_maxcrawldelay").increment(1);
                            continue;
                        } else {
                            FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
                            fiq.crawlDelay = rules.getCrawlDelay();
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + url);
                            }
                        }
                    }
                    ProtocolOutput output = protocol.getProtocolOutput(url, fit.datum);
                    ProtocolStatus status = output.getStatus();
                    Content content = output.getContent();
                    ParseStatus pstatus = null;
                    // unblock queue
                    ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                    String urlString = url.toString();
                    // used for FetchNode
                    if (fetchNode != null) {
                        fetchNode.setStatus(status.getCode());
                        fetchNode.setFetchTime(System.currentTimeMillis());
                        fetchNode.setUrl(url);
                    }
                    // Publish fetch finish event
                    if (activatePublisher) {
                        FetcherThreadEvent endEvent = new FetcherThreadEvent(PublishEventType.END, fit.getUrl().toString());
                        endEvent.addEventData("status", status.getName());
                        publisher.publish(endEvent, conf);
                    }
                    context.getCounter("FetcherStatus", status.getName()).increment(1);
                    switch(status.getCode()) {
                        case ProtocolStatus.WOULDBLOCK:
                            // retry ?
                            ((FetchItemQueues) fetchQueues).addFetchItem(fit);
                            break;
                        // got a page
                        case ProtocolStatus.SUCCESS:
                            pstatus = output(url, fit.datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
                            updateStatus(content.getContent().length);
                            if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                                String newUrl = pstatus.getMessage();
                                int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
                                Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR);
                                if (redirUrl != null) {
                                    fit = queueRedirect(redirUrl, fit);
                                }
                            }
                            break;
                        // redirect
                        case ProtocolStatus.MOVED:
                        case ProtocolStatus.TEMP_MOVED:
                            int code;
                            boolean temp;
                            if (status.getCode() == ProtocolStatus.MOVED) {
                                code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                                temp = false;
                            } else {
                                code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                                temp = true;
                            }
                            output(url, fit.datum, content, status, code);
                            String newUrl = status.getMessage();
                            Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR);
                            if (redirUrl != null) {
                                fit = queueRedirect(redirUrl, fit);
                            } else {
                                // stop redirecting
                                redirecting = false;
                            }
                            break;
                        case ProtocolStatus.EXCEPTION:
                            logError(url, status.getMessage());
                            int killedURLs = ((FetchItemQueues) fetchQueues).checkExceptionThreshold(fit.getQueueID());
                            if (killedURLs != 0)
                                context.getCounter("FetcherStatus", "AboveExceptionThresholdInQueue").increment(killedURLs);
                        // falls through: exceptions are retried
                        case ProtocolStatus.RETRY:
                        case ProtocolStatus.BLOCKED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                            break;
                        // gone
                        case ProtocolStatus.GONE:
                        case ProtocolStatus.NOTFOUND:
                        case ProtocolStatus.ACCESS_DENIED:
                        case ProtocolStatus.ROBOTS_DENIED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
                            break;
                        case ProtocolStatus.NOTMODIFIED:
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
                            break;
                        default:
                            if (LOG.isWarnEnabled()) {
                                LOG.warn(getName() + " " + Thread.currentThread().getId() + " Unknown ProtocolStatus: " + status.getCode());
                            }
                            output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
                    }
                    if (redirecting && redirectCount > maxRedirect) {
                        ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                        if (LOG.isInfoEnabled()) {
                            LOG.info(getName() + " " + Thread.currentThread().getId() + "  - redirect count exceeded " + url);
                        }
                        output(url, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, CrawlDatum.STATUS_FETCH_GONE);
                    }
                } while (redirecting && (redirectCount <= maxRedirect));
            } catch (Throwable t) {
                // unexpected exception
                // unblock
                ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
                logError(url, StringUtils.stringifyException(t));
                output(url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY);
            }
        }
    } catch (Throwable e) {
        if (LOG.isErrorEnabled()) {
            LOG.error("fetcher caught:" + e.toString());
        }
    } finally {
        if (fit != null)
            ((FetchItemQueues) fetchQueues).finishFetchItem(fit);
        // count threads
        activeThreads.decrementAndGet();
        LOG.info(getName() + " " + Thread.currentThread().getId() + " - finishing thread " + getName() + ", activeThreads=" + activeThreads);
    }
}
Also used: ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput), Text (org.apache.hadoop.io.Text), ParseText (org.apache.nutch.parse.ParseText), URL (java.net.URL), ScoringFilterException (org.apache.nutch.scoring.ScoringFilterException), URLFilterException (org.apache.nutch.net.URLFilterException), MalformedURLException (java.net.MalformedURLException), IOException (java.io.IOException), ParseStatus (org.apache.nutch.parse.ParseStatus), AtomicInteger (java.util.concurrent.atomic.AtomicInteger), Content (org.apache.nutch.protocol.Content), Protocol (org.apache.nutch.protocol.Protocol), BaseRobotRules (crawlercommons.robots.BaseRobotRules), ProtocolStatus (org.apache.nutch.protocol.ProtocolStatus)
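
The switch above is essentially a mapping from protocol outcome to the CrawlDatum fetch status passed into output(); condensed here as a reference sketch (the helper name is illustrative, the EXCEPTION case mirrors the deliberate fall-through to RETRY in the original, and WOULDBLOCK is omitted because it requeues the item instead of producing output):

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.ProtocolStatus;

public class StatusMappingSketch {

    /** Condensed view of the status switch in FetcherThread.run(). */
    static byte fetchStatusFor(int protocolCode) {
        switch (protocolCode) {
            case ProtocolStatus.SUCCESS:
                return CrawlDatum.STATUS_FETCH_SUCCESS;
            case ProtocolStatus.MOVED:
                return CrawlDatum.STATUS_FETCH_REDIR_PERM;
            case ProtocolStatus.TEMP_MOVED:
                return CrawlDatum.STATUS_FETCH_REDIR_TEMP;
            case ProtocolStatus.EXCEPTION: // falls through to retry, as above
            case ProtocolStatus.RETRY:
            case ProtocolStatus.BLOCKED:
                return CrawlDatum.STATUS_FETCH_RETRY;
            case ProtocolStatus.GONE:
            case ProtocolStatus.NOTFOUND:
            case ProtocolStatus.ACCESS_DENIED:
            case ProtocolStatus.ROBOTS_DENIED:
                return CrawlDatum.STATUS_FETCH_GONE;
            case ProtocolStatus.NOTMODIFIED:
                return CrawlDatum.STATUS_FETCH_NOTMODIFIED;
            default: // unknown status codes are retried, as above
                return CrawlDatum.STATUS_FETCH_RETRY;
        }
    }
}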

Example 15 with ParseStatus

Use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

The class TestLinksIndexingFilter, method testFilterOutlinks.

@Test
public void testFilterOutlinks() throws Exception {
    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
    filter.setConf(conf);
    Outlink[] outlinks = generateOutlinks();
    NutchDocument doc = filter.filter(new NutchDocument(),
        new ParseImpl("text", new ParseData(new ParseStatus(), "title", outlinks, metadata)),
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
    Assert.assertEquals("Filter outlinks, allow only those from a different host", outlinks[0].getToUrl(), doc.getFieldValue("outlinks"));
}
Also used: Outlink (org.apache.nutch.parse.Outlink), ParseStatus (org.apache.nutch.parse.ParseStatus), NutchDocument (org.apache.nutch.indexer.NutchDocument), ParseData (org.apache.nutch.parse.ParseData), ParseImpl (org.apache.nutch.parse.ParseImpl), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), Inlinks (org.apache.nutch.crawl.Inlinks), Test (org.junit.Test)
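
generateOutlinks() is defined elsewhere in TestLinksIndexingFilter and is not part of this excerpt; a hypothetical stand-in consistent with the assertions above (one link on a different host first, one same-host link second):

import org.apache.nutch.parse.Outlink;

// hypothetical stand-in for the helper used in the test above
static Outlink[] generateOutlinks() throws Exception {
    return new Outlink[] {
        // different host than http://www.example.com/, so the filter keeps it
        new Outlink("http://www.test.com/", "external"),
        // same host as the document URL, so it is filtered out
        new Outlink("http://www.example.com/page.html", "internal")
    };
}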

Aggregations

ParseStatus (org.apache.nutch.parse.ParseStatus): 25 usages
ParseData (org.apache.nutch.parse.ParseData): 23 usages
ParseImpl (org.apache.nutch.parse.ParseImpl): 21 usages
Outlink (org.apache.nutch.parse.Outlink): 16 usages
Text (org.apache.hadoop.io.Text): 15 usages
CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 14 usages
Inlinks (org.apache.nutch.crawl.Inlinks): 13 usages
Test (org.junit.Test): 11 usages
NutchDocument (org.apache.nutch.indexer.NutchDocument): 10 usages
Metadata (org.apache.nutch.metadata.Metadata): 9 usages
URL (java.net.URL): 8 usages
IOException (java.io.IOException): 6 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 5 usages
MalformedURLException (java.net.MalformedURLException): 5 usages
ArrayList (java.util.ArrayList): 5 usages
Configuration (org.apache.hadoop.conf.Configuration): 5 usages
Parse (org.apache.nutch.parse.Parse): 5 usages
ParseText (org.apache.nutch.parse.ParseText): 5 usages
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 5 usages
Inlink (org.apache.nutch.crawl.Inlink): 4 usages