Example 31 with Content

Use of org.apache.nutch.protocol.Content in project nutch by apache.

From the class TestFeedParser, method testParseFetchChannel.

/**
 * Calls the {@link FeedParser} on a sample RSS file and checks that the
 * resulting {@link ParseResult} contains 3 entries, including the 2 links
 * below:
 * <ul>
 * <li>http://www-scf.usc.edu/~mattmann/</li>
 * <li>http://www.nutch.org/</li>
 * </ul>
 *
 * @throws ProtocolNotFound
 *           If the {@link Protocol} layer cannot be loaded (required to fetch
 *           the {@link Content} for the RSS file).
 * @throws ParseException
 *           If the {@link Parser} layer cannot be loaded.
 */
@Test
public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
    String urlString;
    Protocol protocol;
    Content content;
    ParseResult parseResult;
    Configuration conf = NutchConfiguration.create();
    for (int i = 0; i < sampleFiles.length; i++) {
        urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
        urlString = urlString.replace('\\', '/');
        protocol = new ProtocolFactory(conf).getProtocol(urlString);
        content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        parseResult = new ParseUtil(conf).parseByExtensionId("feed", content);
        Assert.assertEquals(3, parseResult.size());
        boolean hasLink1 = false, hasLink2 = false, hasLink3 = false;
        for (Map.Entry<Text, Parse> entry : parseResult) {
            if (entry.getKey().toString().equals("http://www-scf.usc.edu/~mattmann/")) {
                hasLink1 = true;
            } else if (entry.getKey().toString().equals("http://www.nutch.org/")) {
                hasLink2 = true;
            } else if (entry.getKey().toString().equals(urlString)) {
                hasLink3 = true;
            }
            Assert.assertNotNull(entry.getValue());
            Assert.assertNotNull(entry.getValue().getData());
        }
        if (!hasLink1 || !hasLink2 || !hasLink3) {
            Assert.fail("Outlinks read from sample rss file are not correct!");
        }
    }
}
Also used: ParseResult (org.apache.nutch.parse.ParseResult), NutchConfiguration (org.apache.nutch.util.NutchConfiguration), Configuration (org.apache.hadoop.conf.Configuration), ParseUtil (org.apache.nutch.parse.ParseUtil), Parse (org.apache.nutch.parse.Parse), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory), Content (org.apache.nutch.protocol.Content), Protocol (org.apache.nutch.protocol.Protocol), Map (java.util.Map), Test (org.junit.Test)
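For reference, a minimal sketch of constructing a Content object by hand, using the same six-argument constructor that appears in Example 35 below; the URL and payload here are made-up illustration values, not taken from the Nutch test fixtures:

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NutchConfiguration;

public class ContentSketch {
    public static void main(String[] args) {
        Configuration conf = NutchConfiguration.create();
        // Hypothetical sample payload; any fetched bytes would do.
        byte[] bytes = "<html><body>hello</body></html>".getBytes();
        Content content = new Content("http://example.com/", "http://example.com/",
                bytes, "text/html", new Metadata(), conf);
        // Read back what was stored.
        System.out.println(content.getUrl());            // http://example.com/
        System.out.println(content.getContentType());    // text/html
        System.out.println(content.getContent().length); // payload size in bytes
    }
}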

Example 32 with Content

Use of org.apache.nutch.protocol.Content in project nutch by apache.

From the class TestIndexReplace, method parseAndFilterFile.

/**
 * Run a test file through the Nutch parser and index filters.
 *
 * @param fileName name of the sample file to fetch and parse
 * @param conf the {@link Configuration} to use
 * @return the Nutch document with the replace indexer applied
 */
public NutchDocument parseAndFilterFile(String fileName, Configuration conf) {
    NutchDocument doc = new NutchDocument();
    BasicIndexingFilter basicIndexer = new BasicIndexingFilter();
    basicIndexer.setConf(conf);
    Assert.assertNotNull(basicIndexer);
    MetadataIndexer metaIndexer = new MetadataIndexer();
    metaIndexer.setConf(conf);
    Assert.assertNotNull(metaIndexer);
    ReplaceIndexer replaceIndexer = new ReplaceIndexer();
    replaceIndexer.setConf(conf);
    Assert.assertNotNull(replaceIndexer);
    try {
        String urlString = "file:" + sampleDir + fileSeparator + fileName;
        Text text = new Text(urlString);
        CrawlDatum crawlDatum = new CrawlDatum();
        Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
        Content content = protocol.getProtocolOutput(text, crawlDatum).getContent();
        Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
        crawlDatum.setFetchTime(100L);
        Inlinks inlinks = new Inlinks();
        doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks);
        doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks);
        doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks);
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.toString());
    }
    return doc;
}
Also used: NutchDocument (org.apache.nutch.indexer.NutchDocument), ParseUtil (org.apache.nutch.parse.ParseUtil), Parse (org.apache.nutch.parse.Parse), MetadataIndexer (org.apache.nutch.indexer.metadata.MetadataIndexer), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), Inlinks (org.apache.nutch.crawl.Inlinks), ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory), Content (org.apache.nutch.protocol.Content), BasicIndexingFilter (org.apache.nutch.indexer.basic.BasicIndexingFilter), Protocol (org.apache.nutch.protocol.Protocol)
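A hedged usage sketch for parseAndFilterFile from within the same test class; the sample file name, the replacement rule, and the inspected field are assumptions, not values taken from the real TestIndexReplace fixture:

@Test
public void testReplaceIndexerApplied() {
    Configuration conf = NutchConfiguration.create();
    // The index-replace plugin reads its rules from "index.replace.regexp";
    // this particular rule (rewrite the title field) is hypothetical.
    conf.set("index.replace.regexp", "title=/Sample/Replaced/");
    NutchDocument doc = parseAndFilterFile("testIndexReplace.html", conf); // assumed file name
    Assert.assertNotNull(doc);
    Assert.assertNotNull(doc.getField("title"));
}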

Example 33 with Content

Use of org.apache.nutch.protocol.Content in project nutch by apache.

From the class IndexingFiltersChecker, method process.

protected int process(String url, StringBuilder output) throws Exception {
    if (normalizers != null) {
        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
    }
    LOG.info("fetching: " + url);
    CrawlDatum datum = new CrawlDatum();
    Iterator<String> iter = metadata.keySet().iterator();
    while (iter.hasNext()) {
        String key = iter.next();
        String value = metadata.get(key);
        if (value == null)
            value = "";
        datum.getMetaData().put(new Text(key), new Text(value));
    }
    int maxRedirects = getConf().getInt("http.redirect.max", 3);
    if (followRedirects) {
        if (maxRedirects == 0) {
            LOG.info("Following max. 3 redirects (ignored http.redirect.max == 0)");
            maxRedirects = 3;
        } else {
            LOG.info("Following max. {} redirects", maxRedirects);
        }
    }
    ProtocolOutput protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
    Text turl = new Text(url);
    // Following redirects and not reached maxRedirects?
    int numRedirects = 0;
    while (protocolOutput != null && !protocolOutput.getStatus().isSuccess() && followRedirects && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
        String[] stuff = protocolOutput.getStatus().getArgs();
        url = stuff[0];
        LOG.info("Follow redirect to {}", url);
        if (normalizers != null) {
            url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
        }
        turl.set(url);
        // try again
        protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
        numRedirects++;
    }
    if (checkRobotsTxt && protocolOutput == null) {
        System.err.println("Fetch disallowed by robots.txt");
        return -1;
    }
    if (!protocolOutput.getStatus().isSuccess()) {
        System.err.println("Fetch failed with protocol status: " + protocolOutput.getStatus());
        if (protocolOutput.getStatus().isRedirect()) {
            System.err.println("Redirect(s) not handled due to configuration.");
            System.err.println("Max Redirects to handle per config: " + maxRedirects);
            System.err.println("Number of Redirects handled: " + numRedirects);
        }
        return -1;
    }
    Content content = protocolOutput.getContent();
    if (content == null) {
        output.append("No content for " + url + "\n");
        return 0;
    }
    String contentType = content.getContentType();
    if (contentType == null) {
        LOG.error("Failed to determine content type!");
        return -1;
    }
    // store the guessed content type in the crawldatum
    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));
    if (ParseSegment.isTruncated(content)) {
        LOG.warn("Content is truncated, parse may fail!");
    }
    ScoringFilters scfilters = new ScoringFilters(getConf());
    // call the scoring filters
    try {
        scfilters.passScoreBeforeParsing(turl, datum, content);
    } catch (Exception e) {
        LOG.warn("Couldn't pass score, url {} ({})", url, e);
    }
    LOG.info("parsing: {}", url);
    LOG.info("contentType: {}", contentType);
    ParseResult parseResult = new ParseUtil(getConf()).parse(content);
    NutchDocument doc = new NutchDocument();
    doc.add("id", url);
    Text urlText = new Text(url);
    Inlinks inlinks = null;
    Parse parse = parseResult.get(urlText);
    if (parse == null) {
        LOG.error("Failed to get parse from parse result");
        LOG.error("Available parses in parse result (by URL key):");
        for (Map.Entry<Text, Parse> entry : parseResult) {
            LOG.error("  " + entry.getKey());
        }
        LOG.error("Parse result does not contain a parse for URL to be checked:");
        LOG.error("  " + urlText);
        return -1;
    }
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
    parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
    String digest = parse.getData().getContentMeta().get(Nutch.SIGNATURE_KEY);
    doc.add("digest", digest);
    datum.setSignature(signature);
    // call the scoring filters
    try {
        scfilters.passScoreAfterParsing(turl, content, parseResult.get(turl));
    } catch (Exception e) {
        LOG.warn("Couldn't pass score, url {} ({})", turl, e);
    }
    IndexingFilters indexers = new IndexingFilters(getConf());
    try {
        doc = indexers.filter(doc, parse, urlText, datum, inlinks);
    } catch (IndexingException e) {
        e.printStackTrace();
    }
    if (doc == null) {
        output.append("Document discarded by indexing filter\n");
        return 0;
    }
    for (String fname : doc.getFieldNames()) {
        List<Object> values = doc.getField(fname).getValues();
        if (values != null) {
            for (Object value : values) {
                String str = value.toString();
                int minText = dumpText ? str.length() : Math.min(100, str.length());
                output.append(fname + " :\t" + str.substring(0, minText) + "\n");
            }
        }
    }
    // For readability if keepClientCnxOpen
    output.append("\n");
    if (doIndex) {
        IndexWriters writers = IndexWriters.get(getConf());
        writers.open(getConf(), "IndexingFilterChecker");
        writers.write(doc);
        writers.close();
    }
    return 0;
}
Also used: ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput), ParseResult (org.apache.nutch.parse.ParseResult), ParseUtil (org.apache.nutch.parse.ParseUtil), Parse (org.apache.nutch.parse.Parse), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), Inlinks (org.apache.nutch.crawl.Inlinks), Content (org.apache.nutch.protocol.Content), ScoringFilters (org.apache.nutch.scoring.ScoringFilters), HashMap (java.util.HashMap), Map (java.util.Map)
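Example 33 above and Example 34 below both copy the caller-supplied metadata into the CrawlDatum with the same hand-written loop; a minimal sketch of that shared pattern factored into a helper (the method name is an assumption):

import java.util.Map;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;

// Hypothetical helper mirroring the metadata-copy loop used by both checkers.
static CrawlDatum datumWithMetadata(Map<String, String> metadata) {
    CrawlDatum datum = new CrawlDatum();
    for (Map.Entry<String, String> e : metadata.entrySet()) {
        // Null values are normalized to the empty string, as in the originals.
        String value = e.getValue() == null ? "" : e.getValue();
        datum.getMetaData().put(new Text(e.getKey()), new Text(value));
    }
    return datum;
}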

Example 34 with Content

Use of org.apache.nutch.protocol.Content in project nutch by apache.

From the class ParserChecker, method process.

protected int process(String url, StringBuilder output) throws Exception {
    if (normalizers != null) {
        url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
    }
    LOG.info("fetching: " + url);
    CrawlDatum datum = new CrawlDatum();
    Iterator<String> iter = metadata.keySet().iterator();
    while (iter.hasNext()) {
        String key = iter.next();
        String value = metadata.get(key);
        if (value == null)
            value = "";
        datum.getMetaData().put(new Text(key), new Text(value));
    }
    int maxRedirects = getConf().getInt("http.redirect.max", 3);
    if (followRedirects) {
        if (maxRedirects == 0) {
            LOG.info("Following max. 3 redirects (ignored http.redirect.max == 0)");
            maxRedirects = 3;
        } else {
            LOG.info("Following max. {} redirects", maxRedirects);
        }
    }
    ProtocolOutput protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
    Text turl = new Text(url);
    // Following redirects and not reached maxRedirects?
    int numRedirects = 0;
    while (protocolOutput != null && !protocolOutput.getStatus().isSuccess() && followRedirects && protocolOutput.getStatus().isRedirect() && maxRedirects >= numRedirects) {
        String[] stuff = protocolOutput.getStatus().getArgs();
        url = stuff[0];
        LOG.info("Follow redirect to {}", url);
        if (normalizers != null) {
            url = normalizers.normalize(url, URLNormalizers.SCOPE_DEFAULT);
        }
        turl.set(url);
        // try again
        protocolOutput = getProtocolOutput(url, datum, checkRobotsTxt);
        numRedirects++;
    }
    if (checkRobotsTxt && protocolOutput == null) {
        System.err.println("Fetch disallowed by robots.txt");
        return -1;
    }
    if (!protocolOutput.getStatus().isSuccess()) {
        System.err.println("Fetch failed with protocol status: " + protocolOutput.getStatus());
        if (protocolOutput.getStatus().isRedirect()) {
            System.err.println("Redirect(s) not handled due to configuration.");
            System.err.println("Max Redirects to handle per config: " + maxRedirects);
            System.err.println("Number of Redirects handled: " + numRedirects);
        }
        return -1;
    }
    Content content = protocolOutput.getContent();
    if (content == null) {
        output.append("No content for " + url + "\n");
        return 0;
    }
    String contentType;
    if (forceAsContentType != null) {
        content.setContentType(forceAsContentType);
        contentType = forceAsContentType;
    } else {
        contentType = content.getContentType();
    }
    if (contentType == null) {
        LOG.error("Failed to determine content type!");
        return -1;
    }
    // store the guessed content type in the crawldatum
    datum.getMetaData().put(new Text(Metadata.CONTENT_TYPE), new Text(contentType));
    if (ParseSegment.isTruncated(content)) {
        LOG.warn("Content is truncated, parse may fail!");
    }
    // call the scoring filters
    try {
        scfilters.passScoreBeforeParsing(turl, datum, content);
    } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e + ")");
            LOG.warn(StringUtils.stringifyException(e));
        }
    }
    ParseResult parseResult = new ParseUtil(getConf()).parse(content);
    if (parseResult == null) {
        LOG.error("Parsing content failed!");
        return -1;
    }
    // calculate the signature
    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parseResult.get(new Text(url)));
    if (LOG.isInfoEnabled()) {
        LOG.info("parsing: {}", url);
        LOG.info("contentType: {}", contentType);
        LOG.info("signature: {}", StringUtil.toHexString(signature));
    }
    for (Map.Entry<Text, Parse> entry : parseResult) {
        turl = entry.getKey();
        Parse parse = entry.getValue();
        // call the scoring filters
        try {
            scfilters.passScoreAfterParsing(turl, content, parse);
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e + ")");
                LOG.warn(StringUtils.stringifyException(e));
            }
        }
        output.append(turl).append("\n");
        output.append(parse.getData()).append("\n");
        if (dumpText) {
            output.append(parse.getText());
        }
    }
    return 0;
}
Also used: ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput), CrawlDatum (org.apache.nutch.crawl.CrawlDatum), Text (org.apache.hadoop.io.Text), Content (org.apache.nutch.protocol.Content), HashMap (java.util.HashMap), Map (java.util.Map)
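Both checkers compute the page signature the same way; a self-contained sketch of just that step, under the assumption that content and parse were obtained as in the examples above:

import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.StringUtil;

// Hypothetical helper isolating the signature step from Examples 33 and 34.
static String hexSignature(Configuration conf, Content content, Parse parse) {
    byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
    return StringUtil.toHexString(signature);
}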

Example 35 with Content

Use of org.apache.nutch.protocol.Content in project nutch by apache.

From the class Foo, method getProtocolOutput.

/**
 * This is a dummy implementation only; it returns the following fixed
 * structure:
 *
 * <pre>
 * foo://example.com - will contain one directory and one file
 * foo://example.com/a - directory, will contain two files
 * foo://example.com/a/aa.txt - text file
 * foo://example.com/a/ab.txt - text file
 * foo://example.com/a.txt - text file
 * </pre>
 */
@Override
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
    LOG.debug("getProtocolOutput({}, {})", url, datum);
    try {
        String urlstr = String.valueOf(url);
        URL u = new URL(urlstr);
        URL base = new URL(u, ".");
        byte[] bytes = new byte[0];
        String contentType = "foo/something";
        ProtocolStatus status = ProtocolStatus.STATUS_GONE;
        switch(urlstr) {
            case "foo://example.com":
            case "foo://example.com/":
                {
                    String time = HttpDateFormat.toString(System.currentTimeMillis());
                    contentType = "text/html";
                    StringBuilder sb = new StringBuilder();
                    sb.append("<html><head>");
                    sb.append("<title>Index of /</title></head>\n");
                    sb.append("<body><h1>Index of /</h1><pre>\n");
                    // add directory
                    sb.append("<a href='a/" + "'>a/</a>\t" + time + "\t-\n");
                    // add file
                    sb.append("<a href='a.txt'>a.txt</a>\t" + time + "\t" + 0 + "\n");
                    sb.append("</pre></html></body>");
                    bytes = sb.toString().getBytes();
                    status = ProtocolStatus.STATUS_SUCCESS;
                    break;
                }
            case "foo://example.com/a/":
                {
                    String time = HttpDateFormat.toString(System.currentTimeMillis());
                    contentType = "text/html";
                    StringBuilder sb = new StringBuilder();
                    sb.append("<html><head>");
                    sb.append("<title>Index of /a/</title></head>\n");
                    sb.append("<body><h1>Index of /a/</h1><pre>\n");
                    // add file
                    sb.append("<a href='aa.txt'>aa.txt</a>\t" + time + "\t" + 0 + "\n");
                    // add file
                    sb.append("<a href='ab.txt'>ab.txt</a>\t" + time + "\t" + 0 + "\n");
                    sb.append("</pre></html></body>");
                    bytes = sb.toString().getBytes();
                    status = ProtocolStatus.STATUS_SUCCESS;
                    break;
                }
            case "foo://example.com/a.txt":
            case "foo://example.com/a/aa.txt":
            case "foo://example.com/a/ab.txt":
                {
                    contentType = "text/plain";
                    bytes = "In publishing and graphic design, lorem ipsum is a filler text or greeking commonly used to demonstrate the textual elements of a graphic document or visual presentation. Replacing meaningful content with placeholder text allows designers to design the form of the content before the content itself has been produced.".getBytes();
                    status = ProtocolStatus.STATUS_SUCCESS;
                    break;
                }
            default:
                LOG.warn("Unknown url '{}'. This dummy implementation only supports 'foo://example.com'", url);
                // all our default values are set for URLs that do not exist.
                break;
        }
        Metadata metadata = new Metadata();
        Content content = new Content(String.valueOf(url), String.valueOf(base), bytes, contentType, metadata, getConf());
        return new ProtocolOutput(content, status);
    } catch (MalformedURLException mue) {
        LOG.error("Could not retrieve {}", url);
        LOG.error("", mue);
        // claim STATUS_GONE to tell nutch to never ever re-request this URL
        return new ProtocolOutput(null, ProtocolStatus.STATUS_GONE);
    }
}
Also used: ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput), MalformedURLException (java.net.MalformedURLException), Content (org.apache.nutch.protocol.Content), Metadata (org.apache.nutch.metadata.Metadata), URL (java.net.URL), ProtocolStatus (org.apache.nutch.protocol.ProtocolStatus)
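A hedged driver for the dummy protocol above: fetch one of the synthetic URLs and inspect the resulting ProtocolOutput. The setConf call is an assumption based on Foo's use of getConf() in the example; protocol plugins in Nutch are normally configured this way:

// Hypothetical driver code; not part of the Foo class itself.
Configuration conf = NutchConfiguration.create();
Foo foo = new Foo();
foo.setConf(conf); // assumption: Foo exposes setConf like other protocol plugins
ProtocolOutput out = foo.getProtocolOutput(
        new Text("foo://example.com/a.txt"), new CrawlDatum());
if (out.getStatus().isSuccess()) {
    Content content = out.getContent();
    System.out.println(content.getContentType()); // text/plain
    System.out.println(new String(content.getContent()));
}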

Aggregations

Content (org.apache.nutch.protocol.Content): 51 usages
Text (org.apache.hadoop.io.Text): 30 usages
Parse (org.apache.nutch.parse.Parse): 29 usages
CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 27 usages
Configuration (org.apache.hadoop.conf.Configuration): 23 usages
Metadata (org.apache.nutch.metadata.Metadata): 23 usages
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 22 usages
ParseUtil (org.apache.nutch.parse.ParseUtil): 20 usages
Test (org.junit.Test): 19 usages
Protocol (org.apache.nutch.protocol.Protocol): 17 usages
ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory): 16 usages
ParseData (org.apache.nutch.parse.ParseData): 8 usages
ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput): 8 usages
ParseResult (org.apache.nutch.parse.ParseResult): 7 usages
URL (java.net.URL): 6 usages
File (java.io.File): 5 usages
FileInputStream (java.io.FileInputStream): 5 usages
IOException (java.io.IOException): 5 usages
Outlink (org.apache.nutch.parse.Outlink): 5 usages
HashMap (java.util.HashMap): 4 usages