Examples with NutchDocument - org.apache.nutch.indexer.NutchDocument

Example 31 with NutchDocument

Use of org.apache.nutch.indexer.NutchDocument in the Apache Nutch project.

From the class TestIndexReplace, method testUrlNotMatchesPattern.

/**
 * Verifies the case where the configured {@code urlmatch} pattern does not
 * match the document.
 *
 * The replacement rules must then be skipped, leaving the description,
 * keywords and author metatag fields with their original values.
 */
@Test
public void testUrlNotMatchesPattern() {
    // Replacement rules for all three metatags, scoped by a urlmatch pattern.
    String replaceRules = " urlmatch=.*.xml\n"
        + "  metatag.description=/this(.*)plugin/this awesome plugin/\n"
        + "  metatag.keywords=/\\,/\\!/\n"
        + "  metatag.author=/\\s+/ D. /\n";

    Configuration configuration = NutchConfiguration.create();
    configuration.set("plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
    configuration.set("metatags.names", "author,description,keywords");
    configuration.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords");
    configuration.set(INDEX_REPLACE_PROPERTY, replaceRules);
    // Not necessary but helpful when debugging the filter.
    configuration.set("http.timeout", "99999999999");

    // Parse the sample file and pass the result through the index filters.
    NutchDocument filtered = parseAndFilterFile(sampleFile, configuration);

    // Every metatag must still carry its unmodified value.
    Assert.assertEquals("With this plugin, I control the description! Bwuhuhuhaha!",
        filtered.getFieldValue("metatag.description"));
    Assert.assertEquals("Breathtaking, Riveting, Two Thumbs Up!",
        filtered.getFieldValue("metatag.keywords"));
    Assert.assertEquals("Peter Ciuffetti",
        filtered.getFieldValue("metatag.author"));
}

Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) Test(org.junit.Test)

Example 32 with NutchDocument

Use of org.apache.nutch.indexer.NutchDocument in the Apache Nutch project.

From the class TestIndexReplace, method testReplacementsWithFlags.

/**
 * Exercises the optional flags suffix of a replacement rule.
 *
 * The flag value 2 is Pattern.CASE_INSENSITIVE: the rule is written in upper
 * case and is expected to match the text regardless of case.
 */
@Test
public void testReplacementsWithFlags() {
    Configuration configuration = NutchConfiguration.create();
    configuration.set("plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
    configuration.set("metatags.names", "author,description,keywords");
    configuration.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords");
    // Upper-case pattern followed by the trailing flags value 2.
    configuration.set(INDEX_REPLACE_PROPERTY, "  metatag.description=/THIS PLUGIN/this awesome plugin/2");
    // Not necessary but helpful when debugging the filter.
    configuration.set("http.timeout", "99999999999");

    // Parse the sample file and pass the result through the index filters.
    NutchDocument filtered = parseAndFilterFile(sampleFile, configuration);

    // The description must contain the text substituted by the
    // case-insensitive replacement.
    Assert.assertEquals("With this awesome plugin, I control the description! Bwuhuhuhaha!",
        filtered.getFieldValue("metatag.description"));
}

Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) Test(org.junit.Test)

Example 33 with NutchDocument

Use of org.apache.nutch.indexer.NutchDocument in the Apache Nutch project.

From the class TestStaticFieldIndexerTest, method testEmptyIndexStatic.

/**
 * Test that empty {@code index.static} does not add anything to the document.
 *
 * @throws Exception if the filter throws; the exception is propagated so the
 *         test runner reports it together with its type and stack trace
 */
@Test
public void testEmptyIndexStatic() throws Exception {
    Assert.assertNotNull(filter);
    filter.setConf(conf);
    NutchDocument doc = new NutchDocument();
    // No try/catch here: converting the exception to Assert.fail(e.getMessage())
    // would drop its type and stack trace and yield a null message when the
    // exception carries none. The method already declares "throws Exception".
    filter.filter(doc, parse, url, crawlDatum, inlinks);
    Assert.assertTrue("tests if no field is set for empty index.static", doc.getFieldNames().isEmpty());
}

Also used : NutchDocument(org.apache.nutch.indexer.NutchDocument) Test(org.junit.Test)

Example 34 with NutchDocument

Use of org.apache.nutch.indexer.NutchDocument in the Apache Nutch project.

From the class TestStaticFieldIndexerTest, method testCustomMulticharacterDelimiters.

/**
 * Test for NUTCH-2052 custom delimiters in index.static, using separators
 * that are each more than one character long.
 *
 * @throws Exception if the filter throws; the exception is propagated so the
 *         test runner reports it together with its type and stack trace
 */
@Test
public void testCustomMulticharacterDelimiters() throws Exception {
    conf.set("index.static.fieldsep", "\n\n");
    conf.set("index.static.keysep", "\t\t");
    conf.set("index.static.valuesep", "***");
    // field3 is given without a key separator or value, and the string ends
    // with extra field separators; only three fields are expected below.
    conf.set("index.static", "field1\t\tval1\n\n" + "field2\t\tval2***val3\n\n" + "field3\n\n" + "field4\t\tval4\n\n\n\n");
    Assert.assertNotNull(filter);
    filter.setConf(conf);
    NutchDocument doc = new NutchDocument();
    // No try/catch here: converting the exception to Assert.fail(e.getMessage())
    // would drop its type and stack trace and yield a null message when the
    // exception carries none. The method already declares "throws Exception".
    filter.filter(doc, parse, url, crawlDatum, inlinks);
    Assert.assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
    Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size());
    Assert.assertTrue("test if doc has field1", doc.getField("field1").getValues().contains("val1"));
    Assert.assertTrue("test if doc has field2", doc.getField("field2").getValues().contains("val2"));
    Assert.assertTrue("test if doc has field4", doc.getField("field4").getValues().contains("val4"));
}

Also used : NutchDocument(org.apache.nutch.indexer.NutchDocument) Test(org.junit.Test)

Example 35 with NutchDocument

Use of org.apache.nutch.indexer.NutchDocument in the Apache Nutch project.

From the class TestStaticFieldIndexerTest, method testCustomDelimiters.

/**
 * Test for NUTCH-2052 custom delimiters in index.static, using
 * single-character separators with whitespace around keys and values.
 *
 * @throws Exception if the filter throws; the exception is propagated so the
 *         test runner reports it together with its type and stack trace
 */
@Test
public void testCustomDelimiters() throws Exception {
    conf.set("index.static.fieldsep", ">");
    conf.set("index.static.keysep", "=");
    conf.set("index.static.valuesep", "|");
    // field3 is given without a key separator or value, and the string ends
    // with a blank entry; only three fields are expected below.
    conf.set("index.static", "field1=val1>field2    =      val2|val3     >field3>field4 =val4 > ");
    Assert.assertNotNull(filter);
    filter.setConf(conf);
    NutchDocument doc = new NutchDocument();
    // No try/catch here: converting the exception to Assert.fail(e.getMessage())
    // would drop its type and stack trace and yield a null message when the
    // exception carries none. The method already declares "throws Exception".
    filter.filter(doc, parse, url, crawlDatum, inlinks);
    Assert.assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
    Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size());
    Assert.assertTrue("test if doc has field1", doc.getField("field1").getValues().contains("val1"));
    Assert.assertTrue("test if doc has field2", doc.getField("field2").getValues().contains("val2"));
    Assert.assertTrue("test if doc has field4", doc.getField("field4").getValues().contains("val4"));
}

Also used : NutchDocument(org.apache.nutch.indexer.NutchDocument) Test(org.junit.Test)

Aggregations

NutchDocument (org.apache.nutch.indexer.NutchDocument)37 Test (org.junit.Test)33 Text (org.apache.hadoop.io.Text)20 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)20 Inlinks (org.apache.nutch.crawl.Inlinks)20 Configuration (org.apache.hadoop.conf.Configuration)17 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)17 ParseData (org.apache.nutch.parse.ParseData)16 ParseImpl (org.apache.nutch.parse.ParseImpl)16 ParseStatus (org.apache.nutch.parse.ParseStatus)10 Outlink (org.apache.nutch.parse.Outlink)9 Metadata (org.apache.nutch.metadata.Metadata)7 Inlink (org.apache.nutch.crawl.Inlink)5 URL (java.net.URL)3 Job (org.apache.hadoop.mapreduce.Job)3 IndexingException (org.apache.nutch.indexer.IndexingException)2 BasicIndexingFilter (org.apache.nutch.indexer.basic.BasicIndexingFilter)2 BufferedReader (java.io.BufferedReader)1 IOException (java.io.IOException)1 InputStreamReader (java.io.InputStreamReader)1