Example 16 with ParseUtil

Use of org.apache.nutch.parse.ParseUtil in project nutch by apache.

From the class IndexingFiltersChecker, the method process:

protected int process(String url, StringBuilder output) throws Exception {
    if (normalizers != null) {
        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
    }
    LOG.info("fetching: " + url);
    CrawlDatum datum = new CrawlDatum();
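    // copy any user-supplied metadata into the CrawlDatum so that it is
    // visible to the protocol, scoring, and indexing plugins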
    Iterator<String> iter = metadata.keySet().iterator();
    while (iter.hasNext()) {
        String key = iter.next();
        String value = metadata.get(key);
        if (value == null)
            value = "";
        datum.getMetaData().put(new Text(key), new Text(value));
    }
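    // load the configured indexing filter plugins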
    IndexingFilters indexers = new IndexingFilters(getConf());
    int maxRedirects = 3;
    ProtocolOutput protocolOutput = getProtocolOutput(url, datum);
    Text turl = new Text(url);
    // keep following redirects until we succeed or maxRedirects is exhausted
    while (!protocolOutput.getStatus().isSuccess() && followRedirects && protocolOutput.getStatus().isRedirect() && maxRedirects != 0) {
        String[] stuff = protocolOutput.getStatus().getArgs();
        url = stuff[0];
        if (normalizers != null) {
            url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
        }
        turl.set(url);
        // try again
        protocolOutput = getProtocolOutput(url, datum);
        maxRedirects--;
    }
    if (!protocolOutput.getStatus().isSuccess()) {
        output.append("Fetch failed with protocol status: " + protocolOutput.getStatus() + "\n");
        return 0;
    }
    Content content = protocolOutput.getContent();
    if (content == null) {
        output.append("No content for " + url + "\n");
        return 0;
    }
    String contentType = content.getContentType();
    if (contentType == null) {
        return -1;
    }
    // store the guessed content type in the crawldatum
    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));
    if (ParseSegment.isTruncated(content)) {
        LOG.warn("Content is truncated, parse may fail!");
    }
    ScoringFilters scfilters = new ScoringFilters(getConf());
    // call the scoring filters
    try {
        scfilters.passScoreBeforeParsing(turl, datum, content);
    } catch (Exception e) {
        LOG.warn("Couldn't pass score, url {} ({})", url, e);
    }
    LOG.info("parsing: {}", url);
    LOG.info("contentType: {}", contentType);
    ParseResult parseResult = new ParseUtil(getConf()).parse(content);
    NutchDocument doc = new NutchDocument();
    doc.add("id", url);
    Text urlText = new Text(url);
    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    if (parse == null) {
        LOG.error("Failed to get parse from parse result");
        LOG.error("Available parses in parse result (by URL key):");
        for (Map.Entry<Text, Parse> entry : parseResult) {
            LOG.error("  " + entry.getKey());
        }
        LOG.error("Parse result does not contain a parse for URL to be checked:");
        LOG.error("  " + urlText);
        return -1;
    }
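    // compute the page signature (Nutch uses it for deduplication)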
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
    parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
    String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY);
    doc.add("digest", digest);
    datum.setSignature(signature);
    // call the scoring filters
    try {
        scfilters.passScoreAfterParsing(turl, content, parse);
    } catch (Exception e) {
        LOG.warn("Couldn't pass score, url {} ({})", turl, e);
    }
    try {
        doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
        LOG.error("Indexing filter error for {}", url, e);
    }
    if (doc == null) {
        output.append("Document discarded by indexing filter\n");
        return 0;
    }
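    // print every field of the document, truncating values to 100 characters
    // unless dumpText is set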
    for (String fname : doc.getFieldNames()) {
        List<Object> values = doc.getField(fname).getValues();
        if (values != null) {
            for (Object value : values) {
                String str = value.toString();
                int minText = dumpText ? str.length() : Math.min(100, str.length());
                output.append(fname + " :\t" + str.substring(0, minText) + "\n");
            }
        }
    }
    // For readability if keepClientCnxOpen
    output.append("\n");
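    // optionally push the document through the configured index writers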
    if (getConf().getBoolean("doIndex", false)) {
        IndexWriters writers = new IndexWriters(getConf());
        writers.open(getConf(), "IndexingFilterChecker");
        writers.write(doc);
        writers.close();
    }
    return 0;
}
Also used :
ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput)
ParseResult (org.apache.nutch.parse.ParseResult)
ParseUtil (org.apache.nutch.parse.ParseUtil)
Parse (org.apache.nutch.parse.Parse)
CrawlDatum (org.apache.nutch.crawl.CrawlDatum)
Text (org.apache.hadoop.io.Text)
Inlinks (org.apache.nutch.crawl.Inlinks)
Content (org.apache.nutch.protocol.Content)
ScoringFilters (org.apache.nutch.scoring.ScoringFilters)
HashMap (java.util.HashMap)
Map (java.util.Map)
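
The checker above wraps the parse step in fetch, scoring, and indexing logic. For reference, the core ParseUtil call can be shown in isolation. This is a minimal sketch, not part of the Nutch sources; the class name ParseUtilDemo and the URL http://example.org/ are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;

public class ParseUtilDemo {

    public static void main(String[] args) throws Exception {
        // placeholder URL; substitute any page fetchable by a Nutch protocol plugin
        String url = "http://example.org/";
        Configuration conf = NutchConfiguration.create();
        // pick the protocol implementation (http, file, ...) matching the URL
        Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
        Content content = protocol
                .getProtocolOutput(new Text(url), new CrawlDatum()).getContent();
        // ParseUtil selects a parser plugin from the detected content type
        ParseResult parseResult = new ParseUtil(conf).parse(content);
        Parse parse = parseResult.get(new Text(url));
        if (parse != null) {
            System.out.println("title:    " + parse.getData().getTitle());
            System.out.println("outlinks: " + parse.getData().getOutlinks().length);
        }
    }
}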

Example 17 with ParseUtil

Use of org.apache.nutch.parse.ParseUtil in project nutch by apache.

From the class TestFeedParser, the method testParseFetchChannel:

/**
 * Calls the {@link FeedParser} on each sample RSS file and checks that the
 * resulting {@link ParseResult} holds 3 entries: the feed URL itself plus
 * the following two links:
 * <ul>
 * <li>http://www-scf.usc.edu/~mattmann/</li>
 * <li>http://www.nutch.org/</li>
 * </ul>
 *
 * @throws ProtocolNotFound
 *           If the {@link Protocol} layer cannot be loaded (required to
 *           fetch the {@link Content} for the RSS file).
 * @throws ParseException
 *           If the {@link Parser} layer cannot be loaded.
 */
@Test
public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    ParseResult parseResult;
    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
        urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
        urlString = urlString.replace('\\', '/');
        protocol = new ProtocolFactory(conf).getProtocol(urlString);
        content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
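        // force the "feed" parser plugin rather than selecting by content type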
        parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);
        Assert.assertEquals(3, parseResult.size());
        boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
        for (Map.Entry<Text, Parse> entry : parseResult) {
            String key = entry.getKey().toString();
            if (key.equals("http://www-scf.usc.edu/~mattmann/")) {
                hasLink1 = true;
            } else if (key.equals("http://www.nutch.org/")) {
                hasLink2 = true;
            } else if (key.equals(urlString)) {
                hasLink3 = true;
            }
            Assert.assertNotNull(entry.getValue());
            Assert.assertNotNull(entry.getValue().getData());
        }
        if (!hasLink1 || !hasLink2 || !hasLink3) {
            Assert.fail("Outlinks read from the sample RSS file are not correct!");
        }
    }
}
Also used :
ParseResult (org.apache.nutch.parse.ParseResult)
NutchConfiguration (org.apache.nutch.util.NutchConfiguration)
Configuration (org.apache.hadoop.conf.Configuration)
ParseUtil (org.apache.nutch.parse.ParseUtil)
Parse (org.apache.nutch.parse.Parse)
CrawlDatum (org.apache.nutch.crawl.CrawlDatum)
Text (org.apache.hadoop.io.Text)
ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory)
Content (org.apache.nutch.protocol.Content)
Protocol (org.apache.nutch.protocol.Protocol)
Map (java.util.Map)
Test (org.junit.Test)
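
ParseResult is iterable over its (URL, Parse) entries, which is what the loop above relies on. A small helper along these lines could dump what each parser produced; dumpParseResult is a hypothetical method, not part of TestFeedParser, and assumes org.apache.nutch.parse.ParseData and java.util.Map are also imported:

static void dumpParseResult(ParseResult parseResult) {
    for (Map.Entry<Text, Parse> entry : parseResult) {
        ParseData data = entry.getValue().getData();
        // print each entry's URL, title, and extracted outlink count
        System.out.println(entry.getKey() + " -> " + data.getTitle() + " ("
                + data.getOutlinks().length + " outlinks)");
    }
}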

Aggregations

ParseUtil (org.apache.nutch.parse.ParseUtil): 17
Parse (org.apache.nutch.parse.Parse): 16
Content (org.apache.nutch.protocol.Content): 15
Text (org.apache.hadoop.io.Text): 13
CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 13
Protocol (org.apache.nutch.protocol.Protocol): 11
ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory): 11
Configuration (org.apache.hadoop.conf.Configuration): 10
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 10
Test (org.junit.Test): 10
Metadata (org.apache.nutch.metadata.Metadata): 4
Map (java.util.Map): 2
Inlinks (org.apache.nutch.crawl.Inlinks): 2
Outlink (org.apache.nutch.parse.Outlink): 2
ParseData (org.apache.nutch.parse.ParseData): 2
ParseException (org.apache.nutch.parse.ParseException): 2
ParseResult (org.apache.nutch.parse.ParseResult): 2
IOException (java.io.IOException): 1
URL (java.net.URL): 1
HashMap (java.util.HashMap): 1