Use of org.apache.nutch.indexer.NutchDocument in the Apache Nutch project.
Class TestIndexReplace, method testInvalidPatterns.
/**
 * Verifies that malformed replacement settings leave the indexed fields
 * untouched.
 *
 * The replacement property configured here holds three broken entries: a
 * pattern that is expected to fail Pattern.compile, an entry with invalid
 * flags, and an entry that is incomplete. The test passes when the document
 * is still produced and the description, keywords and author metatags carry
 * their original, unmodified values.
 */
@Test
public void testInvalidPatterns() {
  // Contains: invalid pattern, invalid flags, incomplete property
  final String brokenRules =
      " metatag.description=/this\\s+**plugin/this awesome plugin/\n"
          + " metatag.keywords=/\\,/\\!/what\n"
          + " metatag.author=#notcomplete";

  final Configuration config = NutchConfiguration.create();
  config.set("plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
  // Metatag extraction and indexing settings.
  config.set("metatags.names", "author,description,keywords");
  config.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords");
  // The (invalid) replacement rules under test.
  config.set(INDEX_REPLACE_PROPERTY, brokenRules);
  // Not required by the test itself; convenient while stepping through the
  // filter in a debugger.
  config.set("http.timeout", "99999999999");

  // Parse the sample file and push it through the index filters.
  final NutchDocument filtered = parseAndFilterFile(sampleFile, config);

  // Every metatag must still hold its original value.
  final String unchangedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
  Assert.assertEquals(unchangedDescription, filtered.getFieldValue("metatag.description"));

  final String unchangedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
  Assert.assertEquals(unchangedKeywords, filtered.getFieldValue("metatag.keywords"));

  final String unchangedAuthor = "Peter Ciuffetti";
  Assert.assertEquals(unchangedAuthor, filtered.getFieldValue("metatag.author"));
}
Use of org.apache.nutch.indexer.NutchDocument in the Apache Nutch project.
Class TestIndexReplace, method testGlobalReplacement.
/**
 * Verifies metatag value replacement when only global rules are configured.
 *
 * The index.replace.regexp property set here contains no hostmatch or
 * urlmatch sections, so every pattern applies globally. The test checks that
 * the description, keywords and author metatags come out with the expected
 * replaced values.
 */
@Test
public void testGlobalReplacement() {
  // One replacement rule per metatag field, all global.
  final String globalRules =
      " metatag.description=/this(.*)plugin/this awesome plugin/\n"
          + " metatag.keywords=/\\,/\\!/\n"
          + " metatag.author=/\\s+/ D. /\n";

  final Configuration config = NutchConfiguration.create();
  config.set("plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
  // Metatag extraction and indexing settings.
  config.set("metatags.names", "author,description,keywords");
  config.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords");
  // The replacement rules under test.
  config.set(INDEX_REPLACE_PROPERTY, globalRules);
  // Not required by the test itself; convenient while stepping through the
  // filter in a debugger.
  config.set("http.timeout", "99999999999");

  // Parse the sample file and push it through the index filters.
  final NutchDocument filtered = parseAndFilterFile(sampleFile, config);

  // Each metatag must reflect its replacement rule.
  final String replacedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
  Assert.assertEquals(replacedDescription, filtered.getFieldValue("metatag.description"));

  final String replacedKeywords = "Breathtaking! Riveting! Two Thumbs Up!";
  Assert.assertEquals(replacedKeywords, filtered.getFieldValue("metatag.keywords"));

  final String replacedAuthor = "Peter D. Ciuffetti";
  Assert.assertEquals(replacedAuthor, filtered.getFieldValue("metatag.author"));
}
Aggregations