Search in sources :

Example 1 with ParseImpl

Use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

In class TestAny23IndexingFilter, method testAny23TriplesFields.

/**
 * Checks that {@link Any23IndexingFilter} exposes the triples stored in the parse metadata
 * under {@code Any23ParseFilter.ANY23_TRIPLES} as values of the document field
 * {@code Any23IndexingFilter.STRUCTURED_DATA}, one map per triple with the entries
 * {@code node}, {@code key}, {@code short_key} and {@code value}.
 *
 * @throws Exception if the filter fails; propagated unchanged so that JUnit reports the
 *     original exception together with its stack trace
 */
@Test
public void testAny23TriplesFields() throws Exception {
    Configuration conf = NutchConfiguration.create();
    Any23IndexingFilter filter = new Any23IndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = new NutchDocument();
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", new Outlink[] {}, new Metadata());
    ParseImpl parse = new ParseImpl("test page", parseData);
    String[] triples = new String[] { "<http://dbpedia.org/resource/Z\u00FCrich> <http://www.w3.org/2002/07/owl#sameAs> <http://rdf.freebase.com/ns/m.08966> .", "<http://dbpedia.org/resource/Z\u00FCrich> <http://dbpedia.org/property/yearHumidity> \"77\" .", "<http://dbpedia.org/resource/Z\u00FCrich> <http://www.w3.org/2000/01/rdf-schema#label> \"Zurique\"@pt ." };
    for (String triple : triples) {
        parse.getData().getParseMeta().add(Any23ParseFilter.ANY23_TRIPLES, triple);
    }
    // No try/catch here: the method declares "throws Exception", and letting the exception
    // escape keeps its type, message and stack trace in the test report.
    doc = filter.filter(doc, parse, new Text("http://nutch.apache.org/"), new CrawlDatum(), new Inlinks());
    // Fail with a clear assertion instead of a NullPointerException on the next line.
    Assert.assertNotNull("filter returned no document", doc);
    List<Object> docTriples = doc.getField(Any23IndexingFilter.STRUCTURED_DATA).getValues();
    // JUnit convention: expected value first, actual value second.
    Assert.assertEquals(triples.length, docTriples.size());
    assertStructuredTriple(docTriples.get(0), "<http://dbpedia.org/resource/Z\u00FCrich>", "<http://www.w3.org/2002/07/owl#sameAs>", "sameAs", "<http://rdf.freebase.com/ns/m.08966>");
    assertStructuredTriple(docTriples.get(1), "<http://dbpedia.org/resource/Z\u00FCrich>", "<http://dbpedia.org/property/yearHumidity>", "yearHumidity", "\"77\"");
}

/**
 * Asserts that one indexed triple is a map holding the expected node, key, short key and value.
 *
 * @param triple the field value taken from the indexed document
 * @param node expected value of the {@code node} entry
 * @param key expected value of the {@code key} entry
 * @param shortKey expected value of the {@code short_key} entry
 * @param value expected value of the {@code value} entry
 */
private static void assertStructuredTriple(Object triple, String node, String key, String shortKey, String value) {
    Assert.assertTrue(triple instanceof Map<?, ?>);
    // Safe to suppress: the instanceof check above guards the cast, and only String lookups follow.
    @SuppressWarnings("unchecked")
    Map<String, String> structuredData = (Map<String, String>) triple;
    Assert.assertEquals(node, structuredData.get("node"));
    Assert.assertEquals(key, structuredData.get("key"));
    Assert.assertEquals(shortKey, structuredData.get("short_key"));
    Assert.assertEquals(value, structuredData.get("value"));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) Map(java.util.Map) Test(org.junit.Test)

Example 2 with ParseImpl

Use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

In class TestAnchorIndexingFilter, method testDeduplicateAnchor.

/**
 * Checks anchor deduplication in {@link AnchorIndexingFilter}: with
 * {@code anchorIndexingFilter.deduplicate} set to {@code true}, three inlinks that carry only
 * two distinct anchor texts are expected to produce two values in the {@code anchor} field.
 *
 * @throws Exception if the filter fails; propagated unchanged so that JUnit reports the
 *     original exception together with its stack trace
 */
@Test
public void testDeduplicateAnchor() throws Exception {
    Configuration conf = NutchConfiguration.create();
    conf.setBoolean("anchorIndexingFilter.deduplicate", true);
    AnchorIndexingFilter filter = new AnchorIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://test1.com/", "text1"));
    inlinks.add(new Inlink("http://test2.com/", "text2"));
    // Same anchor text as the previous inlink, from a different source URL.
    inlinks.add(new Inlink("http://test3.com/", "text2"));
    // Keep the filter's return value so the null check below inspects what the filter
    // produced rather than the local instance created above. No try/catch: the method
    // declares "throws Exception", and a propagated exception keeps its stack trace.
    doc = filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), inlinks);
    Assert.assertNotNull(doc);
    Assert.assertTrue("test if there is an anchor at all", doc.getFieldNames().contains("anchor"));
    Assert.assertEquals("test dedup, we expect 2", 2, doc.getField("anchor").getValues().size());
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Inlink(org.apache.nutch.crawl.Inlink) Test(org.junit.Test)

Example 3 with ParseImpl

Use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

In class TestBasicIndexingFilter, method testBasicIndexingFilter.

/**
 * Runs {@link BasicIndexingFilter} over a sample page and checks the fields it is expected to
 * add: {@code title} cut to the configured 10 characters, {@code content} cut to the
 * configured 20 characters, {@code domain}, {@code host}, {@code url}, and {@code tstamp}
 * equal to the fetch time set on the {@link CrawlDatum}.
 *
 * @throws Exception if the filter fails; propagated unchanged so that JUnit reports the
 *     original exception together with its stack trace
 */
@Test
public void testBasicIndexingFilter() throws Exception {
    Configuration conf = NutchConfiguration.create();
    conf.setInt("indexer.max.title.length", 10);
    conf.setBoolean("indexer.add.domain", true);
    conf.setInt("indexer.max.content.length", 20);
    BasicIndexingFilter filter = new BasicIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = new NutchDocument();
    String title = "The Foo Page";
    Outlink[] outlinks = new Outlink[] { new Outlink("http://foo.com/", "Foo") };
    Metadata metaData = new Metadata();
    metaData.add("Language", "en/us");
    ParseData parseData = new ParseData(ParseStatus.STATUS_SUCCESS, title, outlinks, metaData);
    ParseImpl parse = new ParseImpl("this is a sample foo bar page. hope you enjoy it.", parseData);
    CrawlDatum crawlDatum = new CrawlDatum();
    crawlDatum.setFetchTime(100L);
    Inlinks inlinks = new Inlinks();
    // Keep the filter's return value so the null check below inspects what the filter
    // produced rather than the local instance created above. No try/catch: the method
    // declares "throws Exception", and a propagated exception keeps its stack trace.
    doc = filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), crawlDatum, inlinks);
    Assert.assertNotNull(doc);
    Assert.assertEquals("test title, expect \"The Foo Pa\"", "The Foo Pa", doc.getField("title").getValues().get(0));
    Assert.assertEquals("test domain, expect \"apache.org\"", "apache.org", doc.getField("domain").getValues().get(0));
    Assert.assertEquals("test host, expect \"nutch.apache.org\"", "nutch.apache.org", doc.getField("host").getValues().get(0));
    Assert.assertEquals("test url, expect \"http://nutch.apache.org/index.html\"", "http://nutch.apache.org/index.html", doc.getField("url").getValues().get(0));
    Assert.assertEquals("test content", "this is a sample foo", doc.getField("content").getValues().get(0));
    // No cast on the actual value: a wrong type then shows up as an assertion failure with
    // the message below instead of a ClassCastException.
    Assert.assertEquals("test fetch time", new Date(100L), doc.getField("tstamp").getValues().get(0));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Date(java.util.Date) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) BasicIndexingFilter(org.apache.nutch.indexer.basic.BasicIndexingFilter) Test(org.junit.Test)

Example 4 with ParseImpl

Use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

In class TestJexlIndexingFilter, method testAllowMatchingDocument.

/**
 * With the JEXL expression {@code doc.lang[0]=='en'} configured, a document whose {@code lang}
 * field is {@code "en"} is expected to be returned by the filter, equal to the input document.
 */
@Test
public void testAllowMatchingDocument() throws Exception {
    // Filter under test, configured with the JEXL expression.
    Configuration configuration = NutchConfiguration.create();
    configuration.set("index.jexl.filter", "doc.lang[0]=='en'");
    JexlIndexingFilter jexlFilter = new JexlIndexingFilter();
    jexlFilter.setConf(configuration);
    Assert.assertNotNull(jexlFilter);

    // Input document carrying the language the expression tests for.
    NutchDocument input = new NutchDocument();
    input.add("lang", "en");

    // Parse result and crawl state handed to the filter alongside the document.
    Outlink[] links = { new Outlink("http://foo.com/", "Foo") };
    Metadata parseMeta = new Metadata();
    parseMeta.add("Language", "en/us");
    ParseImpl parsed = new ParseImpl(
        "this is a sample foo bar page. hope you enjoy it.",
        new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", links, parseMeta));
    CrawlDatum datum = new CrawlDatum();
    datum.setFetchTime(100L);

    NutchDocument output = jexlFilter.filter(
        input, parsed, new Text("http://nutch.apache.org/index.html"), datum, new Inlinks());

    Assert.assertNotNull(output);
    Assert.assertEquals(input, output);
}
Also used : Outlink(org.apache.nutch.parse.Outlink) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Example 5 with ParseImpl

Use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

In class TestJexlIndexingFilter, method testBlockNotMatchingDocuments.

/**
 * With the JEXL expression {@code doc.lang[0]=='en'} configured, a document whose {@code lang}
 * field is {@code "ru"} is expected to be rejected: the filter must return {@code null}.
 */
@Test
public void testBlockNotMatchingDocuments() throws Exception {
    // Filter under test, configured with the JEXL expression.
    Configuration configuration = NutchConfiguration.create();
    configuration.set("index.jexl.filter", "doc.lang[0]=='en'");
    JexlIndexingFilter jexlFilter = new JexlIndexingFilter();
    jexlFilter.setConf(configuration);
    Assert.assertNotNull(jexlFilter);

    // Input document with a language other than the one the expression tests for.
    NutchDocument input = new NutchDocument();
    input.add("lang", "ru");

    // Parse result and crawl state handed to the filter alongside the document.
    Outlink[] links = { new Outlink("http://foo.com/", "Foo") };
    Metadata parseMeta = new Metadata();
    parseMeta.add("Language", "en/us");
    ParseImpl parsed = new ParseImpl(
        "this is a sample foo bar page. hope you enjoy it.",
        new ParseData(ParseStatus.STATUS_SUCCESS, "The Foo Page", links, parseMeta));
    CrawlDatum datum = new CrawlDatum();
    datum.setFetchTime(100L);

    NutchDocument output = jexlFilter.filter(
        input, parsed, new Text("http://nutch.apache.org/index.html"), datum, new Inlinks());

    Assert.assertNull(output);
}
Also used : Outlink(org.apache.nutch.parse.Outlink) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Aggregations

ParseImpl (org.apache.nutch.parse.ParseImpl): 31, ParseData (org.apache.nutch.parse.ParseData): 29, Text (org.apache.hadoop.io.Text): 21, CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 21, ParseStatus (org.apache.nutch.parse.ParseStatus): 21, Inlinks (org.apache.nutch.crawl.Inlinks): 20, Outlink (org.apache.nutch.parse.Outlink): 17, Test (org.junit.Test): 17, NutchDocument (org.apache.nutch.indexer.NutchDocument): 16, Metadata (org.apache.nutch.metadata.Metadata): 15, Configuration (org.apache.hadoop.conf.Configuration): 13, NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 13, URL (java.net.URL): 6, IOException (java.io.IOException): 5, Inlink (org.apache.nutch.crawl.Inlink): 5, Parse (org.apache.nutch.parse.Parse): 5, ByteArrayInputStream (java.io.ByteArrayInputStream): 4, ArrayList (java.util.ArrayList): 4, ParseResult (org.apache.nutch.parse.ParseResult): 4, MalformedURLException (java.net.MalformedURLException): 3