Search in sources:

Example 51 with Content

Use of org.apache.nutch.protocol.Content in project nutch by apache.

The `process` method of the class ParserChecker.

/**
 * Fetches the given URL, follows redirects if configured, parses the fetched
 * content, and appends the parse result (key, parse data, and optionally the
 * extracted text) to {@code output}.
 *
 * <p>Redirect following is bounded by the {@code http.redirect.max} property
 * (a value of 0 is treated as 3). When {@code checkRobotsTxt} is set, a fetch
 * disallowed by robots.txt yields a {@code null} protocol output.
 *
 * @param url    the URL to check; normalized first when normalizers are configured
 * @param output buffer receiving the parse result for each parsed record
 * @return 0 on success (including "no content"), -1 on fetch or parse failure
 * @throws Exception if fetching or parsing fails with an unexpected error
 */
protected int process(String url, StringBuilder output) throws Exception {
    if (normalizers != null) {
        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
    }
    // parameterized logging for consistency with the rest of this method
    LOG.info("fetching: {}", url);
    CrawlDatum datum = new CrawlDatum();
    // copy user-supplied metadata into the crawl datum; null values become ""
    Iterator<String> iter = metadata.keySet().iterator();
    while (iter.hasNext()) {
        String key = iter.next();
        String value = metadata.get(key);
        if (value == null)
            value = "";
        datum.getMetaData().put(new Text(key), new Text(value));
    }
    int maxRedirects = getConf().getInt("http.redirect.max", 3);
    if (followRedirects) {
        if (maxRedirects == 0) {
            // a configured limit of 0 would make followRedirects a no-op;
            // fall back to the default of 3 instead
            LOG.info("Following max. 3 redirects (ignored http.redirect.max == 0)");
            maxRedirects = 3;
        } else {
            LOG.info("Following max. {} redirects", maxRedirects);
        }
    }
    ProtocolOutput protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
    Text turl = new Text(url);
    // Following redirects and not reached maxRedirects?
    int numRedirects = 0;
    while (protocolOutput != null && !protocolOutput.getStatus().isSuccess()
            && followRedirects && protocolOutput.getStatus().isRedirect()
            && maxRedirects >= numRedirects) {
        // the redirect target URL is the first status argument
        String[] stuff = protocolOutput.getStatus().getArgs();
        url = stuff[0];
        LOG.info("Follow redirect to {}", url);
        if (normalizers != null) {
            url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
        }
        turl.set(url);
        // try again
        protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
        numRedirects++;
    }
    // Guard against a null protocol output before dereferencing it below.
    // Previously only the checkRobotsTxt case was checked, which left an
    // NPE path when protocolOutput was null for any other reason.
    if (protocolOutput == null) {
        if (checkRobotsTxt) {
            System.err.println("Fetch disallowed by robots.txt");
        } else {
            System.err.println("Fetch failed: no protocol output for " + url);
        }
        return -1;
    }
    if (!protocolOutput.getStatus().isSuccess()) {
        System.err.println("Fetch failed with protocol status: " + protocolOutput.getStatus());
        if (protocolOutput.getStatus().isRedirect()) {
            System.err.println("Redirect(s) not handled due to configuration.");
            System.err.println("Max Redirects to handle per config: " + maxRedirects);
            System.err.println("Number of Redirects handled: " + numRedirects);
        }
        return -1;
    }
    Content content = protocolOutput.getContent();
    if (content == null) {
        output.append("No content for " + url + "\n");
        return 0;
    }
    // determine (or force) the content type used for parser selection
    String contentType;
    if (forceAsContentType != null) {
        content.setContentType(forceAsContentType);
        contentType = forceAsContentType;
    } else {
        contentType = content.getContentType();
    }
    if (contentType == null) {
        LOG.error("Failed to determine content type!");
        return -1;
    }
    // store the guessed content type in the crawldatum
    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));
    if (ParseSegment.isTruncated(content)) {
        LOG.warn("Content is truncated, parse may fail!");
    }
    // call the scoring filters
    try {
        scfilters.passScoreBeforeParsing(turl, datum, content);
    } catch (Exception e) {
        // scoring failures are non-fatal for a checker run; log and continue
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score before parsing, url {} ({})", turl, e);
            LOG.warn(StringUtils.stringifyException(e));
        }
    }
    ParseResult parseResult = new ParseUtil(getConf()).parse(content);
    if (parseResult == null) {
        LOG.error("Parsing content failed!");
        return -1;
    }
    // calculate the signature
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parseResult.get(new Text(url)));
    if (LOG.isInfoEnabled()) {
        LOG.info("parsing: {}", url);
        LOG.info("contentType: {}", contentType);
        LOG.info("signature: {}", StringUtil.toHexString(signature));
    }
    // a ParseResult may hold multiple records (e.g. sub-documents); emit each
    for (Map.Entry<Text, Parse> entry : parseResult) {
        turl = entry.getKey();
        Parse parse = entry.getValue();
        // call the scoring filters
        try {
            scfilters.passScoreAfterParsing(turl, content, parse);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Couldn't pass score after parsing, url {} ({})", turl, e);
                LOG.warn(StringUtils.stringifyException(e));
            }
        }
        output.append(turl).append("\n");
        output.append(parse.getData()).append("\n");
        if (dumpText) {
            output.append(parse.getText());
        }
    }
    return 0;
}
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Content(org.apache.nutch.protocol.Content) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

Content (org.apache.nutch.protocol.Content)51 Text (org.apache.hadoop.io.Text)30 Parse (org.apache.nutch.parse.Parse)29 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)27 Configuration (org.apache.hadoop.conf.Configuration)23 Metadata (org.apache.nutch.metadata.Metadata)23 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)22 ParseUtil (org.apache.nutch.parse.ParseUtil)20 Test (org.junit.Test)19 Protocol (org.apache.nutch.protocol.Protocol)17 ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory)16 ParseData (org.apache.nutch.parse.ParseData)8 ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput)8 ParseResult (org.apache.nutch.parse.ParseResult)7 URL (java.net.URL)6 File (java.io.File)5 FileInputStream (java.io.FileInputStream)5 IOException (java.io.IOException)5 Outlink (org.apache.nutch.parse.Outlink)5 HashMap (java.util.HashMap)4