Search in sources :

Example 6 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TestBasicIndexingFilter method testBasicIndexingFilter.

/**
 * Runs {@link BasicIndexingFilter} over a small parsed page and checks the title, domain, host,
 * url, content and tstamp fields it adds to the document. The configured maximum lengths are
 * shorter than the sample title and content, so both are expected back truncated.
 *
 * @throws Exception if configuration or filtering fails; the exception is allowed to propagate so
 *         the test report carries the original cause and stack trace
 */
@Test
public void testBasicIndexingFilter() throws Exception {
    Configuration conf = NutchConfiguration.create();
    conf.setInt("indexer.max.title.length", 10);
    conf.setBoolean("indexer.add.domain", true);
    conf.setInt("indexer.max.content.length", 20);
    BasicIndexingFilter filter = new BasicIndexingFilter();
    filter.setConf(conf);

    NutchDocument doc = new NutchDocument();
    String title = "The Foo Page";
    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
    Metadata metaData = new Metadata();
    metaData.add("Language", "en/us");
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
    ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);
    CrawlDatum crawlDatum = new CrawlDatum();
    crawlDatum.setFetchTime(100L);
    Inlinks inlinks = new Inlinks();

    // No try/catch here: the method declares "throws Exception", so any failure inside the filter
    // surfaces directly with its full stack trace instead of being reduced to getMessage().
    filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);

    Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc.getField("title").getValues().get(0));
    Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc.getField("domain").getValues().get(0));
    Assert.assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", doc.getField("host").getValues().get(0));
    Assert.assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html", doc.getField("url").getValues().get(0));
    Assert.assertEquals("test content", "this is a sample foo", doc.getField("content").getValues().get(0));
    // The expected timestamp is the fetch time set on the CrawlDatum above.
    Assert.assertEquals("test fetch time", new Date(100L), doc.getField("tstamp").getValues().get(0));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Date(java.util.Date) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) BasicIndexingFilter(org.apache.nutch.indexer.basic.BasicIndexingFilter) Test(org.junit.Test)

Example 7 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TestJexlIndexingFilter method testAllowMatchingDocument.

/**
 * Checks that a document whose {@code lang} field is {@code "en"} is handed back by a
 * {@link JexlIndexingFilter} configured with the expression {@code doc.lang[0]=='en'}.
 *
 * @throws Exception if building the fixtures or running the filter fails
 */
@Test
public void testAllowMatchingDocument() throws Exception {
    // Filter under test, configured with the JEXL expression.
    Configuration configuration = NutchConfiguration.create();
    configuration.set("index.jexl.filter", "doc.lang[0]=='en'");
    JexlIndexingFilter jexlFilter = new JexlIndexingFilter();
    jexlFilter.setConf(configuration);
    Assert.assertNotNull(jexlFilter);

    // Parse fixture.
    Metadata parseMeta = new Metadata();
    parseMeta.add("Language", "en/us");
    Outlink[] links = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
    ParseData data = new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", links, parseMeta);
    ParseImpl parsed = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", data);

    // Crawl fixture.
    CrawlDatum datum = new CrawlDatum();
    datum.setFetchTime(100L);
    Inlinks noInlinks = new Inlinks();

    // Document carrying the language value the expression looks for.
    NutchDocument input = new NutchDocument();
    input.add("lang", "en");

    Text url = new Text("http://nutch.apache.org/index.html");
    NutchDocument output = jexlFilter.filter(input, parsed, url, datum, noInlinks);

    Assert.assertNotNull(output);
    Assert.assertEquals(input, output);
}
Also used : Outlink(org.apache.nutch.parse.Outlink) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Example 8 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TestJexlIndexingFilter method testBlockNotMatchingDocuments.

/**
 * Checks that a document whose {@code lang} field is {@code "ru"} yields {@code null} from a
 * {@link JexlIndexingFilter} configured with the expression {@code doc.lang[0]=='en'}.
 *
 * @throws Exception if building the fixtures or running the filter fails
 */
@Test
public void testBlockNotMatchingDocuments() throws Exception {
    // Filter under test, configured with the JEXL expression.
    Configuration configuration = NutchConfiguration.create();
    configuration.set("index.jexl.filter", "doc.lang[0]=='en'");
    JexlIndexingFilter jexlFilter = new JexlIndexingFilter();
    jexlFilter.setConf(configuration);
    Assert.assertNotNull(jexlFilter);

    // Parse fixture.
    Metadata parseMeta = new Metadata();
    parseMeta.add("Language", "en/us");
    Outlink[] links = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
    ParseData data = new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", links, parseMeta);
    ParseImpl parsed = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", data);

    // Crawl fixture.
    CrawlDatum datum = new CrawlDatum();
    datum.setFetchTime(100L);
    Inlinks noInlinks = new Inlinks();

    // Document whose language value does not satisfy the expression.
    NutchDocument input = new NutchDocument();
    input.add("lang", "ru");

    Text url = new Text("http://nutch.apache.org/index.html");
    NutchDocument output = jexlFilter.filter(input, parsed, url, datum, noInlinks);

    Assert.assertNull(output);
}
Also used : Outlink(org.apache.nutch.parse.Outlink) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Example 9 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TestLinksIndexingFilter method testIndexHostsOnlyAndFilterInlinks.

/**
 * With {@code LINKS_ONLY_HOSTS} and {@code LINKS_INLINKS_HOST} both set to {@code "true"}, feeds
 * two inlinks for the page {@code http://www.example.com/} and expects the {@code inlinks} field
 * to hold exactly one value: the host of {@code http://www.test.com}.
 *
 * @throws Exception if configuring or running the filter fails
 */
@Test
public void testIndexHostsOnlyAndFilterInlinks() throws Exception {
    conf = NutchConfiguration.create();
    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
    filter.setConf(conf);

    // One inlink from another host and one from the page's own host.
    Inlinks incoming = new Inlinks();
    incoming.add(new Inlink("http://www.test.com", "test"));
    incoming.add(new Inlink("http://www.example.com", "example"));

    NutchDocument emptyDoc = new NutchDocument();
    ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], metadata);
    ParseImpl parse = new ParseImpl("text", parseData);
    Text pageUrl = new Text("http://www.example.com/");
    CrawlDatum datum = new CrawlDatum();

    NutchDocument doc = filter.filter(emptyDoc, parse, pageUrl, datum, incoming);

    Assert.assertEquals(1, doc.getField("inlinks").getValues().size());
    String expectedHost = new URL("http://www.test.com").getHost();
    Assert.assertEquals("Index only the host portion of the inlinks after filtering", expectedHost, doc.getFieldValue("inlinks"));
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Inlink(org.apache.nutch.crawl.Inlink) URL(java.net.URL) Test(org.junit.Test)

Example 10 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

the class TestLinksIndexingFilter method testIndexHostsOnlyAndFilterOutlinks.

/**
 * With {@code LINKS_ONLY_HOSTS} and {@code LINKS_OUTLINKS_HOST} both set to {@code "true"}, runs
 * the filter over the outlinks produced by {@code generateOutlinks(true)} for the page
 * {@code http://www.example.com/} and expects the {@code outlinks} field to hold exactly one
 * value: the host of {@code http://www.test.com}.
 *
 * @throws Exception if configuring or running the filter fails
 */
@Test
public void testIndexHostsOnlyAndFilterOutlinks() throws Exception {
    conf = NutchConfiguration.create();
    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");

    // Outlinks are generated before the configuration is applied, as in the original ordering.
    Outlink[] outgoing = generateOutlinks(true);
    filter.setConf(conf);

    NutchDocument emptyDoc = new NutchDocument();
    ParseData parseData = new ParseData(new ParseStatus(), "title", outgoing, metadata);
    ParseImpl parse = new ParseImpl("text", parseData);
    Text pageUrl = new Text("http://www.example.com/");
    CrawlDatum datum = new CrawlDatum();
    Inlinks noInlinks = new Inlinks();

    NutchDocument doc = filter.filter(emptyDoc, parse, pageUrl, datum, noInlinks);

    Assert.assertEquals(1, doc.getField("outlinks").getValues().size());
    String expectedHost = new URL("http://www.test.com").getHost();
    Assert.assertEquals("Index only the host portion of the outlinks after filtering", expectedHost, doc.getFieldValue("outlinks"));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) URL(java.net.URL) Test(org.junit.Test)

Aggregations

ParseData (org.apache.nutch.parse.ParseData): 37, ParseImpl (org.apache.nutch.parse.ParseImpl): 29, Text (org.apache.hadoop.io.Text): 23, ParseStatus (org.apache.nutch.parse.ParseStatus): 23, CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 22, Outlink (org.apache.nutch.parse.Outlink): 22, Inlinks (org.apache.nutch.crawl.Inlinks): 19, Metadata (org.apache.nutch.metadata.Metadata): 19, Test (org.junit.Test): 19, NutchDocument (org.apache.nutch.indexer.NutchDocument): 16, Configuration (org.apache.hadoop.conf.Configuration): 14, NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 14, Parse (org.apache.nutch.parse.Parse): 9, URL (java.net.URL): 7, ArrayList (java.util.ArrayList): 6, ParseResult (org.apache.nutch.parse.ParseResult): 6, ByteArrayInputStream (java.io.ByteArrayInputStream): 5, IOException (java.io.IOException): 5, Inlink (org.apache.nutch.crawl.Inlink): 5, Content (org.apache.nutch.protocol.Content): 5