Examples with NutchDocument - org.apache.nutch.indexer.NutchDocument

Example 11 with NutchDocument

use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.

the class TestMoreIndexingFilter method testNoParts.

/**
 * Checks the {@code type} field produced by {@link MoreIndexingFilter} when
 * {@code moreIndexingFilter.indexMimeTypeParts} is set to {@code false}.
 *
 * <p>The filter is run over a parse with default-constructed
 * {@link ParseData} for an {@code .html} URL; the resulting document must
 * contain a {@code type} field with exactly one value, {@code text/html}.
 *
 * @since NUTCH-901
 */
@Test
public void testNoParts() {
    Configuration conf = NutchConfiguration.create();
    conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    try {
        filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
    } catch (Exception e) {
        // Fail the test but keep the original exception as the cause so the
        // full stack trace appears in the test report (e.getMessage() alone
        // may be null and drops the trace).
        throw new AssertionError("filter.filter(...) threw unexpectedly: " + e, e);
    }
    Assert.assertTrue(doc.getFieldNames().contains("type"));
    Assert.assertEquals(1, doc.getField("type").getValues().size());
    Assert.assertEquals("text/html", doc.getFieldValue("type"));
}

Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) IndexingException(org.apache.nutch.indexer.IndexingException) Test(org.junit.Test)

Example 12 with NutchDocument

use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.

the class TestIndexReplace method testReplacementsDifferentTarget.

/**
 * Exercises a replacement pattern that uses the target field feature
 * ({@code metatag.description:new=...}).
 * Verifies that the input field is not modified and that the target field
 * {@code new} is added with the replaced text.
 */
@Test
public void testReplacementsDifferentTarget() {
    final String untouchedInput = "With this plugin, I control the description! Bwuhuhuhaha!";
    final String replacedOutput = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
    final String replaceRule = "  metatag.description:new=/this plugin/this awesome plugin/";

    Configuration configuration = NutchConfiguration.create();
    configuration.set("plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
    configuration.set("metatags.names", "author,description,keywords");
    configuration.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords");
    configuration.set(INDEX_REPLACE_PROPERTY, replaceRule);
    // Optional: a very large timeout is convenient while stepping through
    // the filter in a debugger.
    // NOTE(review): the value exceeds Integer.MAX_VALUE - confirm nothing in
    // this test path parses http.timeout as an int.
    configuration.set("http.timeout", "99999999999");

    // Parse the sample file and apply the index filters.
    NutchDocument filtered = parseAndFilterFile(sampleFile, configuration);

    // The source field must still hold the original text ...
    Assert.assertEquals(untouchedInput, filtered.getFieldValue("metatag.description"));
    // ... and the target field must have been created with the replacement.
    Assert.assertEquals(replacedOutput, filtered.getFieldValue("new"));
}

Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) Test(org.junit.Test)

Example 13 with NutchDocument

use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.

the class TestIndexReplace method testGlobalAndUrlNotMatchesPattern.

/**
 * Combines a global replacement for the description with replacements for
 * keywords and author that are listed after a {@code urlmatch=.*.xml} line.
 *
 * Only the global replacement is expected to take effect: the description
 * changes while keywords and author keep their original values.
 */
@Test
public void testGlobalAndUrlNotMatchesPattern() {
    final String replaceRules =
            "  metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n"
            + "  urlmatch=.*.xml\n"
            + "  metatag.keywords=/\\,/\\!/\n"
            + "  metatag.author=/\\s+/ D. /\n";

    Configuration configuration = NutchConfiguration.create();
    configuration.set("plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
    configuration.set("metatags.names", "author,description,keywords");
    configuration.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords");
    configuration.set(INDEX_REPLACE_PROPERTY, replaceRules);
    // Optional: a very large timeout is convenient while stepping through
    // the filter in a debugger.
    configuration.set("http.timeout", "99999999999");

    // Parse the sample file and apply the index filters.
    NutchDocument filtered = parseAndFilterFile(sampleFile, configuration);

    // Description is rewritten by the global rule.
    final String changedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
    Assert.assertEquals(changedDescription, filtered.getFieldValue("metatag.description"));

    // Keywords and author must be left as they were.
    final String unchangedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
    final String unchangedAuthor = "Peter Ciuffetti";
    Assert.assertEquals(unchangedKeywords, filtered.getFieldValue("metatag.keywords"));
    Assert.assertEquals(unchangedAuthor, filtered.getFieldValue("metatag.author"));
}

Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) Test(org.junit.Test)

Example 14 with NutchDocument

use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.

the class TestIndexReplace method testGlobalAndUrlMatchesPattern.

/**
 * Combines a global replacement for the description with replacements for
 * keywords and author that are listed after a {@code urlmatch=.*.html} line.
 *
 * All three replacements are expected to take effect. The description rule
 * also exercises replacement groups ({@code $1}).
 */
@Test
public void testGlobalAndUrlMatchesPattern() {
    final String replaceRules =
            "  metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n"
            + "  urlmatch=.*.html\n"
            + "  metatag.keywords=/\\,/\\!/\n"
            + "  metatag.author=/\\s+/ D. /\n";

    Configuration configuration = NutchConfiguration.create();
    configuration.set("plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
    configuration.set("metatags.names", "author,description,keywords");
    configuration.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords");
    configuration.set(INDEX_REPLACE_PROPERTY, replaceRules);
    // Optional: a very large timeout is convenient while stepping through
    // the filter in a debugger.
    configuration.set("http.timeout", "99999999999");

    // Parse the sample file and apply the index filters.
    NutchDocument filtered = parseAndFilterFile(sampleFile, configuration);

    // Every metatag field must carry its replaced value.
    final String changedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
    final String changedKeywords = "Breathtaking! Riveting! Two Thumbs Up!";
    final String changedAuthor = "Peter D. Ciuffetti";
    Assert.assertEquals(changedDescription, filtered.getFieldValue("metatag.description"));
    Assert.assertEquals(changedKeywords, filtered.getFieldValue("metatag.keywords"));
    Assert.assertEquals(changedAuthor, filtered.getFieldValue("metatag.author"));
}

Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) Test(org.junit.Test)

Example 15 with NutchDocument

use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.

the class TestStaticFieldIndexerTest method testNormalScenario.

/**
 * Test that valid field:value pairs from {@code index.static} are added to
 * the document.
 *
 * <p>The configured value mixes well-formed pairs with irregular spacing, an
 * entry without a value ({@code field3}) and a trailing empty entry; the
 * test expects exactly three fields in the resulting document:
 * {@code field1}, {@code field2} and {@code field4}.
 *
 * @throws Exception if the filter throws; the exception is propagated
 *         unchanged so the test report shows its full stack trace
 */
@Test
public void testNormalScenario() throws Exception {
    conf.set("index.static", "field1:val1, field2    :      val2 val3     , field3, field4 :val4 , ");
    Assert.assertNotNull(filter);
    filter.setConf(conf);
    NutchDocument doc = new NutchDocument();
    // No try/catch: the method already declares "throws Exception", and
    // converting to Assert.fail(e.getMessage()) would discard the cause.
    filter.filter(doc, parse, url, crawlDatum, inlinks);
    Assert.assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
    Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size());
    Assert.assertTrue("test if doc has field1", doc.getField("field1").getValues().contains("val1"));
    Assert.assertTrue("test if doc has field2", doc.getField("field2").getValues().contains("val2"));
    Assert.assertTrue("test if doc has field4", doc.getField("field4").getValues().contains("val4"));
}

Also used : NutchDocument(org.apache.nutch.indexer.NutchDocument) Test(org.junit.Test)

Aggregations

NutchDocument (org.apache.nutch.indexer.NutchDocument): 37; Test (org.junit.Test): 33; Text (org.apache.hadoop.io.Text): 20; CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 20; Inlinks (org.apache.nutch.crawl.Inlinks): 20; Configuration (org.apache.hadoop.conf.Configuration): 17; NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 17; ParseData (org.apache.nutch.parse.ParseData): 16; ParseImpl (org.apache.nutch.parse.ParseImpl): 16; ParseStatus (org.apache.nutch.parse.ParseStatus): 10; Outlink (org.apache.nutch.parse.Outlink): 9; Metadata (org.apache.nutch.metadata.Metadata): 7; Inlink (org.apache.nutch.crawl.Inlink): 5; URL (java.net.URL): 3; Job (org.apache.hadoop.mapreduce.Job): 3; IndexingException (org.apache.nutch.indexer.IndexingException): 2; BasicIndexingFilter (org.apache.nutch.indexer.basic.BasicIndexingFilter): 2; BufferedReader (java.io.BufferedReader): 1; IOException (java.io.IOException): 1; InputStreamReader (java.io.InputStreamReader): 1