Search in sources :

Example 16 with Protocol

use of org.apache.nutch.protocol.Protocol in project nutch by apache.

the class TestFeedParser method testParseFetchChannel.

/**
 * Runs the "feed" parser over every sample RSS file and verifies the
 * resulting {@link ParseResult}.
 *
 * <p>
 * For each sample file the content is fetched through the {@link Protocol}
 * resolved for its <code>file:</code> URL and handed to the parser registered
 * under the extension id <code>feed</code>. The result must hold exactly 3
 * entries, keyed by:
 * <ul>
 * <li>http://www-scf.usc.edu/~mattmann/</li>
 * <li>http://www.nutch.org/</li>
 * <li>the URL of the sample feed file itself</li>
 * </ul>
 * Every entry must carry a non-null {@link Parse} with non-null parse data.
 *
 * @throws ProtocolNotFound
 *           If the {@link Protocol} layer cannot be loaded (required to fetch
 *           the {@link Content} for the RSS file).
 * @throws ParseException
 *           If the {@link Parser} layer cannot be loaded.
 */
@Test
public void testParseFetchChannel() throws ProtocolNotFound, ParseException {
    Configuration conf = NutchConfiguration.create();
    // Both helpers depend only on the configuration, so build them once
    // instead of once per sample file.
    ProtocolFactory protocolFactory = new ProtocolFactory(conf);
    ParseUtil parseUtil = new ParseUtil(conf);

    for (String sampleFile : sampleFiles) {
        // Normalise Windows path separators so the string is a usable file: URL.
        String urlString = ("file:" + sampleDir + fileSeparator + sampleFile).replace('\\', '/');

        Protocol protocol = protocolFactory.getProtocol(urlString);
        Content content = protocol.getProtocolOutput(new Text(urlString), new CrawlDatum()).getContent();
        ParseResult parseResult = parseUtil.parseByExtensionId("feed", content);

        Assert.assertEquals(3, parseResult.size());

        boolean foundMattmannLink = false;
        boolean foundNutchLink = false;
        boolean foundFeedUrl = false;
        for (Map.Entry<Text, Parse> entry : parseResult) {
            String key = entry.getKey().toString();
            if ("http://www-scf.usc.edu/~mattmann/".equals(key)) {
                foundMattmannLink = true;
            } else if ("http://www.nutch.org/".equals(key)) {
                foundNutchLink = true;
            } else if (key.equals(urlString)) {
                foundFeedUrl = true;
            }
            Assert.assertNotNull(entry.getValue());
            Assert.assertNotNull(entry.getValue().getData());
        }

        if (!(foundMattmannLink && foundNutchLink && foundFeedUrl)) {
            Assert.fail("Outlinks read from sample rss file are not correct!");
        }
    }
}
Also used : ParseResult(org.apache.nutch.parse.ParseResult) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) Content(org.apache.nutch.protocol.Content) Protocol(org.apache.nutch.protocol.Protocol) Map(java.util.Map) Test(org.junit.Test)

Aggregations

Text (org.apache.hadoop.io.Text)16 Protocol (org.apache.nutch.protocol.Protocol)16 ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory)15 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)13 Content (org.apache.nutch.protocol.Content)13 Parse (org.apache.nutch.parse.Parse)11 ParseUtil (org.apache.nutch.parse.ParseUtil)11 Configuration (org.apache.hadoop.conf.Configuration)7 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)7 Test (org.junit.Test)7 ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput)4 IOException (java.io.IOException)2 Map (java.util.Map)2 Metadata (org.apache.nutch.metadata.Metadata)2 BaseRobotRules (crawlercommons.robots.BaseRobotRules)1 File (java.io.File)1 FileOutputStream (java.io.FileOutputStream)1 MalformedURLException (java.net.MalformedURLException)1 URL (java.net.URL)1 HashMap (java.util.HashMap)1