Search in sources :

Example 1 with ParseStatus

use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

the class TestLinksIndexingFilter method testFilterInlinks.

@Test
public void testFilterInlinks() throws Exception {
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
    filter.setConf(conf);
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://www.test.com", "test"));
    inlinks.add(new Inlink("http://www.example.com", "example"));
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
    Assert.assertEquals("Filter inlinks, allow only those from a different host", "http://www.test.com", doc.getFieldValue("inlinks"));
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Inlink(org.apache.nutch.crawl.Inlink) Test(org.junit.Test)

Example 2 with ParseStatus

use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

the class TestLinksIndexingFilter method testIndexHostsOnlyAndFilterOutlinks.

@Test
public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
    conf = NutchConfiguration.create();
    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
    Outlink[] outlinks = generateOutlinks(true);
    filter.setConf(conf);
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", outlinks, metadata)), new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
    Assert.assertEquals("Index only the host portion of the outlinks after filtering", new URL("http://www.test.com").getHost(), doc.getFieldValue("outlinks"));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) URL(java.net.URL) Test(org.junit.Test)

Example 3 with ParseStatus

use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

the class TestLinksIndexingFilter method testIndexHostsOnlyAndFilterInlinks.

@Test
public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
    conf = NutchConfiguration.create();
    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
    filter.setConf(conf);
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://www.test.com", "test"));
    inlinks.add(new Inlink("http://www.example.com", "example"));
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
    Assert.assertEquals("Index only the host portion of the inlinks after filtering", new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks"));
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Inlink(org.apache.nutch.crawl.Inlink) URL(java.net.URL) Test(org.junit.Test)

Example 4 with ParseStatus

use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

the class FeedParser method getParse.

/**
 * Parses the given feed and extracts out and parsers all linked items within
 * the feed, using the underlying ROME feed parsing library.
 *
 * @param content
 *          A {@link Content} object representing the feed that is being
 *          parsed by this {@link Parser}.
 *
 * @return A {@link ParseResult} containing all {@link Parse}d feeds that were
 *         present in the feed file that this {@link Parser} dealt with.
 */
@Override
public ParseResult getParse(Content content) {
    SyndFeed feed = null;
    ParseResult parseResult = new ParseResult(content.getUrl());
    EncodingDetector detector = new EncodingDetector(conf);
    detector.autoDetectClues(content, true);
    String encoding = detector.guessEncoding(content, defaultEncoding);
    try {
        InputSource input = new InputSource(new ByteArrayInputStream(content.getContent()));
        input.setEncoding(encoding);
        SyndFeedInput feedInput = new SyndFeedInput();
        feed = feedInput.build(input);
    } catch (Exception e) {
        // return empty parse
        LOG.warn("Parse failed: url: " + content.getUrl() + ", exception: " + StringUtils.stringifyException(e));
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }
    String feedLink = feed.getLink();
    try {
        feedLink = normalizers.normalize(feedLink, URLNormalizers.SCOPE_OUTLINK);
        if (feedLink != null)
            feedLink = filters.filter(feedLink);
    } catch (Exception e) {
        feedLink = null;
    }
    List<?> entries = feed.getEntries();
    for (Object entry : entries) {
        addToMap(parseResult, feed, feedLink, (SyndEntry) entry, content);
    }
    String feedDesc = stripTags(feed.getDescriptionEx());
    String feedTitle = stripTags(feed.getTitleEx());
    parseResult.put(content.getUrl(), new ParseText(feedDesc), new ParseData(new ParseStatus(ParseStatus.SUCCESS), feedTitle, new Outlink[0], content.getMetadata()));
    return parseResult;
}
Also used : Outlink(org.apache.nutch.parse.Outlink) InputSource(org.xml.sax.InputSource) ParseResult(org.apache.nutch.parse.ParseResult) ParseText(org.apache.nutch.parse.ParseText) SyndFeed(com.rometools.rome.feed.synd.SyndFeed) ParseStatus(org.apache.nutch.parse.ParseStatus) EncodingDetector(org.apache.nutch.util.EncodingDetector) ByteArrayInputStream(java.io.ByteArrayInputStream) ParseData(org.apache.nutch.parse.ParseData) SyndFeedInput(com.rometools.rome.io.SyndFeedInput)

Example 5 with ParseStatus

use of org.apache.nutch.parse.ParseStatus in project nutch by apache.

the class TestIndexingFilters method testNonExistingIndexingFilter.

/**
 * Test behaviour when defined filter does not exist.
 *
 * @throws IndexingException
 */
@Test
public void testNonExistingIndexingFilter() throws IndexingException {
    Configuration conf = NutchConfiguration.create();
    conf.addResource("nutch-default.xml");
    conf.addResource("crawl-tests.xml");
    String class1 = "NonExistingFilter";
    String class2 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);
    IndexingFilters filters = new IndexingFilters(conf);
    filters.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata())), new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Aggregations

ParseStatus (org.apache.nutch.parse.ParseStatus)25 ParseData (org.apache.nutch.parse.ParseData)23 ParseImpl (org.apache.nutch.parse.ParseImpl)21 Text (org.apache.hadoop.io.Text)16 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)16 Outlink (org.apache.nutch.parse.Outlink)16 Inlinks (org.apache.nutch.crawl.Inlinks)14 Test (org.junit.Test)12 NutchDocument (org.apache.nutch.indexer.NutchDocument)11 Metadata (org.apache.nutch.metadata.Metadata)10 URL (java.net.URL)7 IOException (java.io.IOException)6 Configuration (org.apache.hadoop.conf.Configuration)6 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)6 ByteArrayInputStream (java.io.ByteArrayInputStream)5 MalformedURLException (java.net.MalformedURLException)5 ArrayList (java.util.ArrayList)5 Parse (org.apache.nutch.parse.Parse)5 ParseText (org.apache.nutch.parse.ParseText)5 Inlink (org.apache.nutch.crawl.Inlink)4