
Example 31 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

The class SegmentHandler, method handle.

@Override
public void handle(Request req, HttpServletResponse res, String target, int dispatch) throws IOException, ServletException {
    try {
        String uri = req.getUri().toString();
        LOG.info("URI: " + uri);
        addMyHeader(res, "URI", uri);
        Text url = new Text(uri);
        CrawlDatum cd = seg.getCrawlDatum(url);
        if (cd != null) {
            addMyHeader(res, "Res", "found");
            LOG.info("-got " + cd.toString());
            ProtocolStatus ps = (ProtocolStatus) cd.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
            if (ps != null) {
                Integer trCode = protoCodes.get(ps.getCode());
                if (trCode != null) {
                    res.setStatus(trCode.intValue());
                } else {
                    res.setStatus(HttpServletResponse.SC_OK);
                }
                addMyHeader(res, "ProtocolStatus", ps.toString());
            } else {
                res.setStatus(HttpServletResponse.SC_OK);
            }
            Content c = seg.getContent(url);
            if (c == null) {
                // missing content
                req.setHandled(true);
                res.addHeader("X-Handled-By", getClass().getSimpleName());
                return;
            }
            byte[] data = c.getContent();
            LOG.debug("-data len=" + data.length);
            Metadata meta = c.getMetadata();
            String[] names = meta.names();
            LOG.debug("- " + names.length + " meta");
            for (int i = 0; i < names.length; i++) {
                boolean my = true;
                char ch = names[i].charAt(0);
                if (Character.isLetter(ch) && Character.isUpperCase(ch)) {
                    // pretty good chance it's a standard header
                    my = false;
                }
                String[] values = meta.getValues(names[i]);
                for (int k = 0; k < values.length; k++) {
                    if (my) {
                        addMyHeader(res, names[i], values[k]);
                    } else {
                        res.addHeader(names[i], values[k]);
                    }
                }
            }
            req.setHandled(true);
            res.addHeader("X-Handled-By", getClass().getSimpleName());
            res.setContentType(meta.get(Metadata.CONTENT_TYPE));
            res.setContentLength(data.length);
            OutputStream os = res.getOutputStream();
            os.write(data, 0, data.length);
            res.flushBuffer();
        } else {
            addMyHeader(res, "Res", "not found");
            LOG.info(" -not found " + url);
        }
    } catch (Exception e) {
        e.printStackTrace();
        LOG.warn(StringUtils.stringifyException(e));
        addMyHeader(res, "Res", "Exception: " + StringUtils.stringifyException(e));
    }
}
Also used : OutputStream(java.io.OutputStream) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ServletException(javax.servlet.ServletException) IOException(java.io.IOException) Content(org.apache.nutch.protocol.Content) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)
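
For context, here is a minimal sketch (class name and values are illustrative, assuming the standard Nutch CrawlDatum and ProtocolStatus APIs used above) of how a datum carrying a protocol status under Nutch.WRITABLE_PROTO_STATUS_KEY can be built and read back; this is the shape SegmentHandler relies on when it maps the status code onto an HTTP response status via its protoCodes table.

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.ProtocolStatus;

public class CrawlDatumMetadataSketch {
    public static void main(String[] args) {
        // A fetch-success datum with a one-day (86400 s) fetch interval.
        CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_FETCH_SUCCESS, 86400);
        datum.setFetchTime(System.currentTimeMillis());
        // Store the protocol status under the same key SegmentHandler reads back;
        // its code is what gets translated into an HTTP status in handle().
        datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, ProtocolStatus.STATUS_SUCCESS);

        ProtocolStatus ps =
                (ProtocolStatus) datum.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
        System.out.println("protocol status code: " + ps.getCode());
        System.out.println("datum: " + datum);
    }
}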

Example 32 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

The class FetcherThread, method output.

private ParseStatus output(Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status, int outlinkDepth) throws InterruptedException {
    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null)
        datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);
    ParseResult parseResult = null;
    if (content != null) {
        Metadata metadata = content.getMetadata();
        // store the guessed content type in the crawldatum
        if (content.getContentType() != null)
            datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));
        // add segment to metadata
        metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
        // add score to content metadata so that ParseSegment can pick it up.
        try {
            scfilters.passScoreBeforeParsing(key, datum, content);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
            }
        }
        /*
       * Note: Fetcher will only follow meta-redirects coming from the
       * original URL.
       */
        if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
            if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
                try {
                    parseResult = this.parseUtil.parse(content);
                } catch (Exception e) {
                    LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + StringUtils.stringifyException(e));
                }
            }
            if (parseResult == null) {
                byte[] signature = SignatureFactory.getSignature(conf).calculate(content, new ParseStatus().getEmptyParse(conf));
                datum.setSignature(signature);
            }
        }
        /*
       * Store status code in content So we can read this value during parsing
       * (as a separate job) and decide to parse or not.
       */
        content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
    }
    try {
        context.write(key, new NutchWritable(datum));
        if (content != null && storingContent)
            context.write(key, new NutchWritable(content));
        if (parseResult != null) {
            for (Entry<Text, Parse> entry : parseResult) {
                Text url = entry.getKey();
                Parse parse = entry.getValue();
                ParseStatus parseStatus = parse.getData().getStatus();
                ParseData parseData = parse.getData();
                if (!parseStatus.isSuccess()) {
                    LOG.warn(getName() + " " + Thread.currentThread().getId() + " Error parsing: " + key + ": " + parseStatus);
                    parse = parseStatus.getEmptyParse(conf);
                }
                // Calculate page signature. For non-parsing fetchers this will
                // be done in ParseSegment
                byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
                // Ensure segment name and score are in parseData metadata
                parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
                parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
                // Pass fetch time to content meta
                parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
                if (url.equals(key))
                    datum.setSignature(signature);
                try {
                    scfilters.passScoreAfterParsing(url, content, parse);
                } catch (Exception e) {
                    if (LOG.isWarnEnabled()) {
                        LOG.warn(getName() + " " + Thread.currentThread().getId() + " Couldn't pass score, url " + key + " (" + e + ")");
                    }
                }
                String origin = null;
                // collect outlinks for subsequent db update
                Outlink[] links = parseData.getOutlinks();
                int outlinksToStore = Math.min(maxOutlinks, links.length);
                if (ignoreExternalLinks || ignoreInternalLinks) {
                    URL originURL = new URL(url.toString());
                    // based on domain?
                    if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
                        origin = URLUtil.getDomainName(originURL).toLowerCase();
                    } else {
                        // use host
                        origin = originURL.getHost().toLowerCase();
                    }
                }
                // used by fetchNode
                if (fetchNode != null) {
                    fetchNode.setOutlinks(links);
                    fetchNode.setTitle(parseData.getTitle());
                    FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
                }
                int validCount = 0;
                // Process all outlinks, normalize, filter and deduplicate
                List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
                HashSet<String> outlinks = new HashSet<>(outlinksToStore);
                for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
                    String toUrl = links[i].getToUrl();
                    toUrl = ParseOutputFormat.filterNormalize(url.toString(), toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFiltersForOutlinks, urlExemptionFilters, normalizersForOutlinks);
                    if (toUrl == null) {
                        continue;
                    }
                    validCount++;
                    links[i].setUrl(toUrl);
                    outlinkList.add(links[i]);
                    outlinks.add(toUrl);
                }
                // Publish fetch report event
                if (activatePublisher) {
                    FetcherThreadEvent reportEvent = new FetcherThreadEvent(PublishEventType.REPORT, url.toString());
                    reportEvent.addOutlinksToEventData(outlinkList);
                    reportEvent.addEventData(Nutch.FETCH_EVENT_TITLE, parseData.getTitle());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTTYPE, parseData.getContentMeta().get("content-type"));
                    reportEvent.addEventData(Nutch.FETCH_EVENT_SCORE, datum.getScore());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_FETCHTIME, datum.getFetchTime());
                    reportEvent.addEventData(Nutch.FETCH_EVENT_CONTENTLANG, parseData.getContentMeta().get("content-language"));
                    publisher.publish(reportEvent, conf);
                }
                // Only process depth N outlinks
                if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
                    FetchItem ft = FetchItem.create(url, null, queueMode);
                    FetchItemQueue queue = ((FetchItemQueues) fetchQueues).getFetchItemQueue(ft.queueID);
                    queue.alreadyFetched.add(url.toString().hashCode());
                    context.getCounter("FetcherOutlinks", "outlinks_detected").increment(outlinks.size());
                    // Counter to limit num outlinks to follow per page
                    int outlinkCounter = 0;
                    // Calculate variable number of outlinks by depth using the
                    // divisor (outlinks = Math.floor(divisor / depth * num.links))
                    int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);
                    String followUrl;
                    // Walk over the outlinks and add as new FetchItem to the queues
                    Iterator<String> iter = outlinks.iterator();
                    while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
                        followUrl = iter.next();
                        // Check whether we'll follow external outlinks
                        if (outlinksIgnoreExternal) {
                            if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
                                continue;
                            }
                        }
                        // Already followed?
                        int urlHashCode = followUrl.hashCode();
                        if (queue.alreadyFetched.contains(urlHashCode)) {
                            continue;
                        }
                        queue.alreadyFetched.add(urlHashCode);
                        // Create new FetchItem with depth incremented
                        FetchItem fit = FetchItem.create(new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1);
                        context.getCounter("FetcherOutlinks", "outlinks_following").increment(1);
                        ((FetchItemQueues) fetchQueues).addFetchItem(fit);
                        outlinkCounter++;
                    }
                }
                // Overwrite the outlinks in ParseData with the normalized and
                // filtered set
                parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));
                context.write(url, new NutchWritable(new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
            }
        }
    } catch (IOException e) {
        if (LOG.isErrorEnabled()) {
            LOG.error("fetcher caught:" + e.toString());
        }
    }
    // return parse status if it exists
    if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
            context.getCounter("ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()]).increment(1);
            return p.getData().getStatus();
        }
    }
    return null;
}
Also used : Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) ArrayList(java.util.ArrayList) URL(java.net.URL) ParseText(org.apache.nutch.parse.ParseText) ParseStatus(org.apache.nutch.parse.ParseStatus) HashSet(java.util.HashSet) Outlink(org.apache.nutch.parse.Outlink) ParseResult(org.apache.nutch.parse.ParseResult) NutchWritable(org.apache.nutch.crawl.NutchWritable) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) IOException(java.io.IOException) ScoringFilterException(org.apache.nutch.scoring.ScoringFilterException) URLFilterException(org.apache.nutch.net.URLFilterException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl)
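
The depth-divisor calculation above (the maxOutlinksByDepth line) is easy to misread because the divisor is divided by (depth + 1) using integer arithmetic before the multiplication. The following standalone sketch (not part of FetcherThread; names and values are illustrative) reproduces just that arithmetic and prints the resulting caps for a few depths.

public class OutlinkDepthCapSketch {
    // Same arithmetic as the maxOutlinksByDepth line in FetcherThread.output():
    // note the integer division of divisor / (depth + 1) before multiplying.
    static int maxOutlinksByDepth(int divisor, int depth, int numLinks) {
        return (int) Math.floor(divisor / (depth + 1) * numLinks);
    }

    public static void main(String[] args) {
        int divisor = 2;   // hypothetical outlinksDepthDivisor
        int numLinks = 4;  // hypothetical maxOutlinkDepthNumLinks
        for (int depth = 0; depth < 4; depth++) {
            System.out.println("depth " + depth + " -> cap "
                    + maxOutlinksByDepth(divisor, depth, numLinks));
        }
        // Prints: depth 0 -> 8, depth 1 -> 4, depth 2 -> 0, depth 3 -> 0,
        // i.e. the cap collapses to zero once (depth + 1) exceeds the divisor.
    }
}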

Example 33 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

The class FetcherThread, method handleRedirect.

private Text handleRedirect(Text url, CrawlDatum datum, String urlString, String newUrl, boolean temp, String redirType) throws MalformedURLException, URLFilterException, InterruptedException {
    newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
    newUrl = urlFilters.filter(newUrl);
    if (newUrl == null || newUrl.equals(urlString)) {
        LOG.debug(" - {} redirect skipped: {}", redirType, (newUrl != null ? "to same url" : "filtered"));
        return null;
    }
    if (ignoreAlsoRedirects && (ignoreExternalLinks || ignoreInternalLinks)) {
        try {
            URL origUrl = new URL(urlString);
            URL redirUrl = new URL(newUrl);
            if (ignoreExternalLinks) {
                String origHostOrDomain, newHostOrDomain;
                if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
                    origHostOrDomain = URLUtil.getDomainName(origUrl).toLowerCase();
                    newHostOrDomain = URLUtil.getDomainName(redirUrl).toLowerCase();
                } else {
                    // byHost
                    origHostOrDomain = origUrl.getHost().toLowerCase();
                    newHostOrDomain = redirUrl.getHost().toLowerCase();
                }
                if (!origHostOrDomain.equals(newHostOrDomain)) {
                    LOG.debug(" - ignoring redirect {} from {} to {} because external links are ignored", redirType, urlString, newUrl);
                    return null;
                }
            }
            if (ignoreInternalLinks) {
                String origHost = origUrl.getHost().toLowerCase();
                String newHost = redirUrl.getHost().toLowerCase();
                if (origHost.equals(newHost)) {
                    LOG.debug(" - ignoring redirect {} from {} to {} because internal links are ignored", redirType, urlString, newUrl);
                    return null;
                }
            }
        } catch (MalformedURLException e) {
            return null;
        }
    }
    reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
    url = new Text(newUrl);
    if (maxRedirect > 0) {
        redirecting = true;
        redirectCount++;
        LOG.debug(" - {} redirect to {} (fetching now)", redirType, url);
        return url;
    } else {
        CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED, datum.getFetchInterval(), datum.getScore());
        // transfer existing metadata
        newDatum.getMetaData().putAll(datum.getMetaData());
        try {
            scfilters.initialScore(url, newDatum);
        } catch (ScoringFilterException e) {
            e.printStackTrace();
        }
        if (reprUrl != null) {
            newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
        }
        output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
        LOG.debug(" - {} redirect to {} (fetching later)", redirType, url);
        return null;
    }
}
Also used : MalformedURLException(java.net.MalformedURLException) ScoringFilterException(org.apache.nutch.scoring.ScoringFilterException) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ParseText(org.apache.nutch.parse.ParseText) URL(java.net.URL)
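
A small sketch of the "byhost" versus "bydomain" scope test that handleRedirect applies to a redirect target (the class and method names here are illustrative; it reuses the same URLUtil.getDomainName helper seen above):

import java.net.MalformedURLException;
import java.net.URL;
import org.apache.nutch.util.URLUtil;

public class RedirectScopeSketch {
    static boolean sameScope(String fromUrl, String toUrl, boolean byDomain)
            throws MalformedURLException {
        URL orig = new URL(fromUrl);
        URL redir = new URL(toUrl);
        String a = byDomain ? URLUtil.getDomainName(orig).toLowerCase()
                            : orig.getHost().toLowerCase();
        String b = byDomain ? URLUtil.getDomainName(redir).toLowerCase()
                            : redir.getHost().toLowerCase();
        return a.equals(b);
    }

    public static void main(String[] args) throws MalformedURLException {
        // www.example.org -> cdn.example.org differs by host but stays
        // within the same domain, so the outcome depends on the mode.
        System.out.println(sameScope("http://www.example.org/a",
                "http://cdn.example.org/b", false)); // false (byhost)
        System.out.println(sameScope("http://www.example.org/a",
                "http://cdn.example.org/b", true));  // true  (bydomain)
    }
}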

Example 34 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

The class UpdateHostDbMapper, method map.

/**
 * Mapper ingesting records from the HostDB, CrawlDB and plaintext host
 * scores file. Statistics and scores are passed on.
 *
 * @param key the record key: a URL (CrawlDB), a host name (HostDB), or a host from the scores file
 * @param value the corresponding CrawlDatum, HostDatum, or Text score for the key
 * @param context the mapper context used to emit host/NutchWritable pairs
 */
public void map(Text key, Writable value, Context context) throws IOException, InterruptedException {
    // Get the key!
    String keyStr = key.toString();
    // Check if we process records from the CrawlDB
    if (key instanceof Text && value instanceof CrawlDatum) {
        // Get the normalized and filtered host of this URL
        buffer = filterNormalize(URLUtil.getHost(keyStr));
        // Filtered out?
        if (buffer == null) {
            context.getCounter("UpdateHostDb", "filtered_records").increment(1);
            LOG.info("UpdateHostDb: " + URLUtil.getHost(keyStr) + " crawldatum has been filtered");
            return;
        }
        // Set the host of this URL
        host.set(buffer);
        crawlDatum = (CrawlDatum) value;
        hostDatum = new HostDatum();
        // Do not resolve homepages when the root URL is unfetched
        if (crawlDatum.getStatus() != CrawlDatum.STATUS_DB_UNFETCHED) {
            // Get the protocol
            String protocol = URLUtil.getProtocol(keyStr);
            // Get the proposed homepage URL
            String homepage = protocol + "://" + buffer + "/";
            // Check if the current key equals the homepage URL
            if (keyStr.equals(homepage)) {
                // Check if this is a redirect to the real home page
                if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM || crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
                    // Obtain the repr url for this redirect via protocolstatus from the metadata
                    ProtocolStatus z = (ProtocolStatus) crawlDatum.getMetaData().get(Nutch.WRITABLE_PROTO_STATUS_KEY);
                    // Get the protocol status' arguments
                    args = z.getArgs();
                    // ..and the possible redirect URL
                    reprUrl = args[0];
                    // Did the protocol status carry a redirect target?
                    if (reprUrl != null) {
                        LOG.info("UpdateHostDb: homepage: " + keyStr + " redirects to: " + args[0]);
                        context.write(host, new NutchWritable(hostDatum));
                        hostDatum.setHomepageUrl(reprUrl);
                    } else {
                        LOG.info("UpdateHostDb: homepage: " + keyStr + " redirects to: " + args[0] + " but has been filtered out");
                    }
                } else {
                    hostDatum.setHomepageUrl(homepage);
                    context.write(host, new NutchWritable(hostDatum));
                    LOG.info("UpdateHostDb: homepage: " + homepage);
                }
            }
        }
        // Always emit crawl datum
        context.write(host, new NutchWritable(crawlDatum));
    }
    // Check if we got a record from the hostdb
    if (key instanceof Text && value instanceof HostDatum) {
        buffer = filterNormalize(keyStr);
        // Filtered out?
        if (buffer == null) {
            context.getCounter("UpdateHostDb", "filtered_records").increment(1);
            LOG.info("UpdateHostDb: " + key.toString() + " hostdatum has been filtered");
            return;
        }
        // Get a HostDatum
        hostDatum = (HostDatum) value;
        key.set(buffer);
        // Reset statistics; we're aggregating them from the CrawlDB anyway
        if (readingCrawlDb) {
            hostDatum.resetStatistics();
        }
        context.write(key, new NutchWritable(hostDatum));
    }
    // Check if we got a record with host scores
    if (key instanceof Text && value instanceof Text) {
        buffer = filterNormalize(keyStr);
        // Filtered out?
        if (buffer == null) {
            context.getCounter("UpdateHostDb", "filtered_records").increment(1);
            LOG.info("UpdateHostDb: " + key.toString() + " score has been filtered");
            return;
        }
        key.set(buffer);
        context.write(key, new NutchWritable(new FloatWritable(Float.parseFloat(value.toString()))));
    }
}
Also used : FloatWritable(org.apache.hadoop.io.FloatWritable) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) NutchWritable(org.apache.nutch.crawl.NutchWritable) Text(org.apache.hadoop.io.Text) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)
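
The homepage branch in the mapper hinges on one comparison: a CrawlDB key is treated as the host's homepage when it equals protocol + "://" + host + "/". A standalone sketch of just that check (class name is illustrative; the filterNormalize step applied to the host in the real mapper is omitted here):

import org.apache.nutch.util.URLUtil;

public class HomepageKeySketch {
    static boolean isHomepageKey(String keyStr) {
        String host = URLUtil.getHost(keyStr);
        String protocol = URLUtil.getProtocol(keyStr);
        if (host == null || protocol == null) {
            return false;
        }
        String homepage = protocol + "://" + host + "/";
        return keyStr.equals(homepage);
    }

    public static void main(String[] args) {
        System.out.println(isHomepageKey("https://example.org/"));          // true
        System.out.println(isHomepageKey("https://example.org/some/page")); // false
    }
}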

Example 35 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

The class ParserChecker, method run.

public int run(String[] args) throws Exception {
    boolean dumpText = false;
    boolean force = false;
    String contentType = null;
    String url = null;
    String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";
    if (args.length == 0) {
        LOG.error(usage);
        return (-1);
    }
    // used to simulate the metadata propagated from injection
    HashMap<String, String> metadata = new HashMap<>();
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-forceAs")) {
            force = true;
            contentType = args[++i];
        } else if (args[i].equals("-dumpText")) {
            dumpText = true;
        } else if (args[i].equals("-md")) {
            String k = null, v = null;
            String nextOne = args[++i];
            int firstEquals = nextOne.indexOf("=");
            if (firstEquals != -1) {
                k = nextOne.substring(0, firstEquals);
                v = nextOne.substring(firstEquals + 1);
            } else
                k = nextOne;
            metadata.put(k, v);
        } else if (i != args.length - 1) {
            LOG.error(usage);
            System.exit(-1);
        } else {
            url = URLUtil.toASCII(args[i]);
        }
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("fetching: " + url);
    }
    CrawlDatum cd = new CrawlDatum();
    Iterator<String> iter = metadata.keySet().iterator();
    while (iter.hasNext()) {
        String key = iter.next();
        String value = metadata.get(key);
        if (value == null)
            value = "";
        cd.getMetaData().put(new Text(key), new Text(value));
    }
    ProtocolFactory factory = new ProtocolFactory(conf);
    Protocol protocol = factory.getProtocol(url);
    Text turl = new Text(url);
    ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
    // if the configuration permits, handle redirects until we either run
    // out of allowed redirects or we stop getting redirect statuses.
    int maxRedirects = conf.getInt("http.redirect.max", 0);
    int numRedirects = 0;
    while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
        String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]);
        LOG.info("Handling redirect to " + newURL);
        protocol = factory.getProtocol(newURL);
        turl = new Text(newURL);
        output = protocol.getProtocolOutput(turl, cd);
        numRedirects++;
    }
    if (!output.getStatus().isSuccess()) {
        System.err.println("Fetch failed with protocol status: " + output.getStatus());
        if (output.getStatus().isRedirect()) {
            System.err.println("Redirect(s) not handled due to configuration.");
            System.err.println("Max Redirects to handle per config: " + maxRedirects);
            System.err.println("Number of Redirects handled: " + numRedirects);
        }
        return (-1);
    }
    Content content = output.getContent();
    if (content == null) {
        LOG.error("No content for " + url);
        return (-1);
    }
    if (force) {
        content.setContentType(contentType);
    } else {
        contentType = content.getContentType();
    }
    if (contentType == null) {
        LOG.error("Failed to determine content type!");
        return (-1);
    }
    if (ParseSegment.isTruncated(content)) {
        LOG.warn("Content is truncated, parse may fail!");
    }
    ScoringFilters scfilters = new ScoringFilters(conf);
    // call the scoring filters
    try {
        scfilters.passScoreBeforeParsing(turl, cd, content);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e + ")");
            LOG.warn(StringUtils.stringifyException(e));
        }
    }
    ParseResult parseResult = new ParseUtil(conf).parse(content);
    if (parseResult == null) {
        LOG.error("Parsing content failed!");
        return (-1);
    }
    // calculate the signature
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parseResult.get(new Text(url)));
    if (LOG.isInfoEnabled()) {
        LOG.info("parsing: " + url);
        LOG.info("contentType: " + contentType);
        LOG.info("signature: " + StringUtil.toHexString(signature));
    }
    Parse parse = parseResult.get(turl);
    if (parse == null) {
        LOG.error("Failed to get parse from parse result");
        LOG.error("Available parses in parse result (by URL key):");
        for (Map.Entry<Text, Parse> entry : parseResult) {
            LOG.error("  " + entry.getKey());
        }
        LOG.error("Parse result does not contain a parse for URL to be checked:");
        LOG.error("  " + turl);
        return -1;
    }
    // call the scoring filters
    try {
        scfilters.passScoreAfterParsing(turl, content, parse);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e + ")");
            LOG.warn(StringUtils.stringifyException(e));
        }
    }
    for (Map.Entry<Text, Parse> entry : parseResult) {
        parse = entry.getValue();
        LOG.info("---------\nUrl\n---------------\n");
        System.out.print(entry.getKey());
        LOG.info("\n---------\nParseData\n---------\n");
        System.out.print(parse.getData().toString());
        if (dumpText) {
            LOG.info("---------\nParseText\n---------\n");
            System.out.print(parse.getText());
        }
    }
    return 0;
}
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) HashMap(java.util.HashMap) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) Content(org.apache.nutch.protocol.Content) ScoringFilters(org.apache.nutch.scoring.ScoringFilters) Protocol(org.apache.nutch.protocol.Protocol) HashMap(java.util.HashMap) Map(java.util.Map)
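
ParserChecker is normally run through the Nutch launcher script. Assuming a standard Nutch installation, an invocation along these lines exercises the -md and -dumpText handling shown above (the URL and metadata key are placeholders):

bin/nutch parsechecker -dumpText -md myKey=myValue https://example.org/

Each -md flag contributes one key=value pair to the simulated injection metadata; a key given without "=" is stored with an empty value when it is copied into the CrawlDatum.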

Aggregations

CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 66
Text (org.apache.hadoop.io.Text): 60
Test (org.junit.Test): 31
Inlinks (org.apache.nutch.crawl.Inlinks): 25
Configuration (org.apache.hadoop.conf.Configuration): 24
ParseData (org.apache.nutch.parse.ParseData): 22
ParseImpl (org.apache.nutch.parse.ParseImpl): 21
NutchDocument (org.apache.nutch.indexer.NutchDocument): 20
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 20
Content (org.apache.nutch.protocol.Content): 19
Parse (org.apache.nutch.parse.Parse): 15
Metadata (org.apache.nutch.metadata.Metadata): 14
ParseStatus (org.apache.nutch.parse.ParseStatus): 14
ParseUtil (org.apache.nutch.parse.ParseUtil): 13
Protocol (org.apache.nutch.protocol.Protocol): 13
ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory): 13
URL (java.net.URL): 11
Outlink (org.apache.nutch.parse.Outlink): 11
IOException (java.io.IOException): 7
ArrayList (java.util.ArrayList): 5