Search in sources :

Example 11 with ProtocolOutput

Use of org.apache.nutch.protocol.ProtocolOutput in the Apache Nutch project.

From the class IndexingFiltersChecker, method process.

/**
 * Fetches {@code url}, parses the fetched content, runs the scoring and
 * indexing filters on it and appends a textual report to {@code output}.
 *
 * <p>Steps, in order: normalize the URL (when normalizers are configured),
 * copy the configured metadata into a fresh {@link CrawlDatum}, fetch the
 * URL (following up to three redirects when {@code followRedirects} is set),
 * parse the content, compute the signature, run the indexing filters, dump
 * every field of the resulting document and, when the configuration property
 * {@code doIndex} is true, send the document to the index writers.
 *
 * @param url    the URL to check
 * @param output receives the report: either a one-line failure message or
 *               one {@code name :\tvalue} line per document field value
 *               (values cut to 100 characters unless {@code dumpText} is
 *               set), followed by an empty line
 * @return {@code -1} when the content type is unknown or the parse result
 *         holds no parse for the URL; {@code 0} in every other case,
 *         including a failed fetch, missing content and a document that was
 *         discarded by an indexing filter
 * @throws Exception if fetching, parsing, signature calculation or index
 *                   writing fails
 */
protected int process(String url, StringBuilder output) throws Exception {
    if (normalizers != null) {
        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
    }
    LOG.info("fetching: {}", url);

    // Seed the datum with the configured metadata; null values become "".
    CrawlDatum datum = new CrawlDatum();
    for (Map.Entry<String, String> meta : metadata.entrySet()) {
        String value = meta.getValue();
        datum.getMetaData().put(new Text(meta.getKey()), new Text(value == null ? "" : value));
    }

    IndexingFilters indexers = new IndexingFilters(getConf());
    int maxRedirects = 3;
    ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
    // Single Text view of the URL, kept in sync with 'url' across redirects.
    Text turl = new Text(url);

    // Following redirects and not reached maxRedirects?
    while (!protocolOutput.getStatus().isSuccess() && followRedirects
            && protocolOutput.getStatus().isRedirect() && maxRedirects != 0) {
        String[] redirectArgs = protocolOutput.getStatus().getArgs();
        if (redirectArgs == null || redirectArgs.length == 0) {
            // A redirect without a target cannot be followed; fall through to
            // the "Fetch failed" report below instead of failing on the index.
            LOG.warn("Redirect status without target for url {}", url);
            break;
        }
        url = redirectArgs[0];
        if (normalizers != null) {
            url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
        }
        turl.set(url);
        // try again
        protocolOutput = getProtocolOutput(url, datum);
        maxRedirects--;
    }

    if (!protocolOutput.getStatus().isSuccess()) {
        output.append("Fetch failed with protocol status: " + protocolOutput.getStatus() + "\n");
        return 0;
    }

    Content content = protocolOutput.getContent();
    if (content == null) {
        output.append("No content for " + url + "\n");
        return 0;
    }

    String contentType = content.getContentType();
    if (contentType == null) {
        LOG.error("Failed to determine content type for url {}", url);
        return -1;
    }

    // store the guessed content type in the crawldatum
    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));

    if (ParseSegment.isTruncated(content)) {
        LOG.warn("Content is truncated, parse may fail!");
    }

    ScoringFilters scfilters = new ScoringFilters(getConf());
    // call the scoring filters; a scoring failure must not abort the check
    try {
        scfilters.passScoreBeforeParsing(turl, datum, content);
    } catch (Exception e) {
        LOG.warn("Couldn't pass score, url {} ({})", url, e);
    }

    LOG.info("parsing: {}", url);
    LOG.info("contentType: {}", contentType);

    ParseResult parseResult = new ParseUtil(getConf()).parse(content);

    NutchDocument doc = new NutchDocument();
    doc.add("id", url);
    Inlinks inlinks = null;

    Parse parse = parseResult.get(turl);
    if (parse == null) {
        LOG.error("Failed to get parse from parse result");
        LOG.error("Available parses in parse result (by URL key):");
        for (Map.Entry<Text, Parse> entry : parseResult) {
            LOG.error("  " + entry.getKey());
        }
        LOG.error("Parse result does not contain a parse for URL to be checked:");
        LOG.error("  " + turl);
        return -1;
    }

    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
    parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
    String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY);
    doc.add("digest", digest);
    datum.setSignature(signature);

    // call the scoring filters; 'parse' is the entry already looked up for turl
    try {
        scfilters.passScoreAfterParsing(turl, content, parse);
    } catch (Exception e) {
        LOG.warn("Couldn't pass score, url {} ({})", turl, e);
    }

    try {
        doc = indexers.filter(doc, parse, turl, datum, inlinks);
    } catch (IndexingException e) {
        // Keep going with the unfiltered document, but report through the
        // logger rather than stderr.
        LOG.error("Indexing filters failed for url " + url, e);
    }

    if (doc == null) {
        output.append("Document discarded by indexing filter\n");
        return 0;
    }

    for (String fname : doc.getFieldNames()) {
        List<Object> values = doc.getField(fname).getValues();
        if (values != null) {
            for (Object value : values) {
                String str = value.toString();
                int minText = dumpText ? str.length() : Math.min(100, str.length());
                output.append(fname + " :\t" + str.substring(0, minText) + "\n");
            }
        }
    }
    // For readability if keepClientCnxOpen
    output.append("\n");

    if (getConf().getBoolean("doIndex", false)) {
        IndexWriters writers = new IndexWriters(getConf());
        writers.open(getConf(), "IndexingFilterChecker");
        try {
            writers.write(doc);
        } finally {
            // Release the writers even when the write fails.
            writers.close();
        }
    }
    return 0;
}
Also used : ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) ParseResult(org.apache.nutch.parse.ParseResult) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Content(org.apache.nutch.protocol.Content) ScoringFilters(org.apache.nutch.scoring.ScoringFilters) HashMap(java.util.HashMap) Map(java.util.Map)

Example 12 with ProtocolOutput

Use of org.apache.nutch.protocol.ProtocolOutput in the Apache Nutch project.

From the class IndexingFiltersChecker, method getProtocolOutput.

/**
 * Fetches {@code url} with the protocol implementation that a
 * {@link ProtocolFactory}, built from the current configuration, selects
 * for it.
 *
 * @param url   the URL to fetch
 * @param datum the crawl datum handed to the protocol together with the URL
 * @return whatever the selected protocol returns for this URL and datum
 * @throws Exception if looking up the protocol or fetching fails
 */
protected ProtocolOutput getProtocolOutput(String url, CrawlDatum datum) throws Exception {
    Protocol fetcher = new ProtocolFactory(getConf()).getProtocol(url);
    return fetcher.getProtocolOutput(new Text(url), datum);
}
Also used : ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Text(org.apache.hadoop.io.Text) Protocol(org.apache.nutch.protocol.Protocol)

Aggregations

ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput): 12; Text (org.apache.hadoop.io.Text): 11; Content (org.apache.nutch.protocol.Content): 7; URL (java.net.URL): 6; CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 6; ProtocolStatus (org.apache.nutch.protocol.ProtocolStatus): 5; Protocol (org.apache.nutch.protocol.Protocol): 4; ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory): 3; BaseRobotRules (crawlercommons.robots.BaseRobotRules): 2; IOException (java.io.IOException): 2; MalformedURLException (java.net.MalformedURLException): 2; HashMap (java.util.HashMap): 2; Map (java.util.Map): 2; Response (org.apache.nutch.net.protocols.Response): 2; ScoringFilters (org.apache.nutch.scoring.ScoringFilters): 2; AtomicInteger (java.util.concurrent.atomic.AtomicInteger): 1; IntWritable (org.apache.hadoop.io.IntWritable): 1; Inlinks (org.apache.nutch.crawl.Inlinks): 1; URLFilterException (org.apache.nutch.net.URLFilterException): 1; Parse (org.apache.nutch.parse.Parse): 1