Search in sources:

Example 1 with ParseData

Use of org.apache.nutch.parse.ParseData in the Apache Nutch project.

In class TestJexlIndexingFilter, method testBlockNotMatchingDocuments.

/**
 * Feeds a document whose "lang" field is "ru" through a filter configured
 * with the JEXL expression {@code doc.lang=='en'} and expects the filter to
 * return {@code null} for it.
 */
@Test
public void testBlockNotMatchingDocuments() throws Exception {
    // Filter under test, configured with the JEXL expression.
    Configuration jexlConf = NutchConfiguration.create();
    jexlConf.set("index.jexl.filter", "doc.lang=='en'");
    JexlIndexingFilter jexlFilter = new JexlIndexingFilter();
    jexlFilter.setConf(jexlConf);
    Assert.assertNotNull(jexlFilter);

    // Parse result handed to the filter.
    Metadata parseMeta = new Metadata();
    parseMeta.add("Language", "en/us");
    Outlink[] links = { new Outlink("http://foo.com/", "Foo") };
    ParseData data = new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", links, parseMeta);
    ParseImpl parsedPage = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", data);

    CrawlDatum datum = new CrawlDatum();
    datum.setFetchTime(100L);

    // The document's language deliberately differs from the one in the expression.
    NutchDocument input = new NutchDocument();
    input.add("lang", "ru");

    Text pageUrl = new Text("http://nutch.apache.org/index.html");
    NutchDocument filtered = jexlFilter.filter(input, parsedPage, pageUrl, datum, new Inlinks());
    Assert.assertNull(filtered);
}
Also used : Outlink(org.apache.nutch.parse.Outlink) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Example 2 with ParseData

Use of org.apache.nutch.parse.ParseData in the Apache Nutch project.

In class TestLinksIndexingFilter, method testFilterInlinks.

/**
 * With {@code LINKS_INLINKS_HOST} set to "true", passes two inlinks (one from
 * www.test.com, one from www.example.com) for the page
 * http://www.example.com/ and expects only the www.test.com inlink to be
 * present in the "inlinks" field.
 */
@Test
public void testFilterInlinks() throws Exception {
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
    filter.setConf(conf);

    Inlinks incoming = new Inlinks();
    incoming.add(new Inlink("http://www.test.com", "test"));
    incoming.add(new Inlink("http://www.example.com", "example"));

    ParseData data = new ParseData(new ParseStatus(), "title", new Outlink[0], metadata);
    ParseImpl parse = new ParseImpl("text", data);
    Text pageUrl = new Text("http://www.example.com/");

    NutchDocument indexed = filter.filter(new NutchDocument(), parse, pageUrl, new CrawlDatum(), incoming);

    Assert.assertEquals(1, indexed.getField("inlinks").getValues().size());
    Assert.assertEquals(
        "Filter inlinks, allow only those from a different host",
        "http://www.test.com",
        indexed.getFieldValue("inlinks"));
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Inlink(org.apache.nutch.crawl.Inlink) Test(org.junit.Test)

Example 3 with ParseData

Use of org.apache.nutch.parse.ParseData in the Apache Nutch project.

In class TestLinksIndexingFilter, method testIndexHostsOnlyAndFilterOutlinks.

/**
 * With both {@code LINKS_ONLY_HOSTS} and {@code LINKS_OUTLINKS_HOST} set to
 * "true", expects the "outlinks" field to hold exactly one value, equal to
 * the host part of http://www.test.com.
 */
@Test
public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
    conf = NutchConfiguration.create();
    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");

    Outlink[] generated = generateOutlinks(true);

    filter.setConf(conf);

    ParseData data = new ParseData(new ParseStatus(), "title", generated, metadata);
    ParseImpl parse = new ParseImpl("text", data);
    Text pageUrl = new Text("http://www.example.com/");

    NutchDocument indexed = filter.filter(new NutchDocument(), parse, pageUrl, new CrawlDatum(), new Inlinks());

    Assert.assertEquals(1, indexed.getField("outlinks").getValues().size());

    String expectedHost = new URL("http://www.test.com").getHost();
    Assert.assertEquals(
        "Index only the host portion of the outlinks after filtering",
        expectedHost,
        indexed.getFieldValue("outlinks"));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) URL(java.net.URL) Test(org.junit.Test)

Example 4 with ParseData

Use of org.apache.nutch.parse.ParseData in the Apache Nutch project.

In class TestLinksIndexingFilter, method testIndexHostsOnlyAndFilterInlinks.

/**
 * With both {@code LINKS_ONLY_HOSTS} and {@code LINKS_INLINKS_HOST} set to
 * "true", passes two inlinks (www.test.com and www.example.com) for the page
 * http://www.example.com/ and expects the "inlinks" field to hold exactly one
 * value, equal to the host part of http://www.test.com.
 */
@Test
public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
    conf = NutchConfiguration.create();
    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
    filter.setConf(conf);

    Inlinks incoming = new Inlinks();
    incoming.add(new Inlink("http://www.test.com", "test"));
    incoming.add(new Inlink("http://www.example.com", "example"));

    ParseData data = new ParseData(new ParseStatus(), "title", new Outlink[0], metadata);
    ParseImpl parse = new ParseImpl("text", data);
    Text pageUrl = new Text("http://www.example.com/");

    NutchDocument indexed = filter.filter(new NutchDocument(), parse, pageUrl, new CrawlDatum(), incoming);

    Assert.assertEquals(1, indexed.getField("inlinks").getValues().size());

    String expectedHost = new URL("http://www.test.com").getHost();
    Assert.assertEquals(
        "Index only the host portion of the inlinks after filtering",
        expectedHost,
        indexed.getFieldValue("inlinks"));
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Inlink(org.apache.nutch.crawl.Inlink) URL(java.net.URL) Test(org.junit.Test)

Example 5 with ParseData

Use of org.apache.nutch.parse.ParseData in the Apache Nutch project.

In class FeedIndexingFilter, method filter.

/**
 * Copies feed-related entries from the parse metadata onto the document.
 *
 * <p>The parse metadata keys consulted are:
 *
 * <ul>
 * <li>FEED_AUTHOR - every value is added under the same key</li>
 * <li>FEED_TAGS - every value is added under the same key</li>
 * <li>FEED - added under the same key when present</li>
 * <li>FEED_PUBLISHED - parsed as a long and added as a {@link Date} under PUBLISHED_DATE</li>
 * <li>FEED_UPDATED - parsed as a long and added as a {@link Date} under UPDATED_DATE</li>
 * </ul>
 *
 * Entries that are absent ({@code null}) are skipped. A published/updated
 * value that is not a valid long makes {@link Long#parseLong(String)} throw
 * {@link NumberFormatException}, which is not caught here.
 *
 * @param doc the document to enrich; it is modified in place
 * @param parse the parse result whose parse metadata is read
 * @param url not used by this method
 * @param datum not used by this method
 * @param inlinks not used by this method
 * @return the same {@code doc} instance that was passed in
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text url, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
    Metadata meta = parse.getData().getParseMeta();

    // Read everything up front, then populate the document.
    String[] authorValues = meta.getValues(Feed.FEED_AUTHOR);
    String[] tagValues = meta.getValues(Feed.FEED_TAGS);
    String publishedMillis = meta.get(Feed.FEED_PUBLISHED);
    String updatedMillis = meta.get(Feed.FEED_UPDATED);
    String feedValue = meta.get(Feed.FEED);

    if (authorValues != null) {
        for (int i = 0; i < authorValues.length; i++) {
            doc.add(Feed.FEED_AUTHOR, authorValues[i]);
        }
    }

    if (tagValues != null) {
        for (int i = 0; i < tagValues.length; i++) {
            doc.add(Feed.FEED_TAGS, tagValues[i]);
        }
    }

    if (feedValue != null) {
        doc.add(Feed.FEED, feedValue);
    }

    // Dates are stored in the metadata as decimal long strings.
    if (publishedMillis != null) {
        doc.add(PUBLISHED_DATE, new Date(Long.parseLong(publishedMillis)));
    }

    if (updatedMillis != null) {
        doc.add(UPDATED_DATE, new Date(Long.parseLong(updatedMillis)));
    }

    return doc;
}
Also used : ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) Date(java.util.Date)

Aggregations

ParseData (org.apache.nutch.parse.ParseData): 40; ParseImpl (org.apache.nutch.parse.ParseImpl): 31; Text (org.apache.hadoop.io.Text): 25; CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 24; Outlink (org.apache.nutch.parse.Outlink): 24; ParseStatus (org.apache.nutch.parse.ParseStatus): 23; Test (org.junit.Test): 22; Metadata (org.apache.nutch.metadata.Metadata): 21; Inlinks (org.apache.nutch.crawl.Inlinks): 20; Configuration (org.apache.hadoop.conf.Configuration): 17; NutchDocument (org.apache.nutch.indexer.NutchDocument): 17; NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 17; Parse (org.apache.nutch.parse.Parse): 10; Content (org.apache.nutch.protocol.Content): 8; URL (java.net.URL): 7; ArrayList (java.util.ArrayList): 6; ByteArrayInputStream (java.io.ByteArrayInputStream): 5; IOException (java.io.IOException): 5; Inlink (org.apache.nutch.crawl.Inlink): 5; ParseResult (org.apache.nutch.parse.ParseResult): 5