use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class TestIndexReplace method testGlobalReplacement.
/**
 * Verifies metatag value replacement when only global rules are configured.
 *
 * The replacement property built here contains no hostmatch or urlmatch
 * entries, so every pattern is treated as global.
 */
@Test
public void testGlobalReplacement() {
  // One replacement rule per metatag field, newline separated.
  final String replaceRules =
      " metatag.description=/this(.*)plugin/this awesome plugin/\n"
          + " metatag.keywords=/\\,/\\!/\n"
          + " metatag.author=/\\s+/ D. /\n";

  final Configuration config = NutchConfiguration.create();
  config.set("plugin.includes",
      "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
  config.set(INDEX_REPLACE_PROPERTY, replaceRules);
  config.set("metatags.names", "author,description,keywords");
  config.set("index.parse.md",
      "metatag.author,metatag.description,metatag.keywords");
  // Not needed by the test itself; handy while debugging the filter.
  config.set("http.timeout", "99999999999");

  // Parse the sample file and push it through the index filters.
  final NutchDocument filtered = parseAndFilterFile(sampleFile, config);

  // Each metatag field must carry its replaced value.
  Assert.assertEquals(
      "With this awesome plugin, I control the description! Bwuhuhuhaha!",
      filtered.getFieldValue("metatag.description"));
  Assert.assertEquals("Breathtaking! Riveting! Two Thumbs Up!",
      filtered.getFieldValue("metatag.keywords"));
  Assert.assertEquals("Peter D. Ciuffetti",
      filtered.getFieldValue("metatag.author"));
}
use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class TestIndexReplace method testUrlMatchesPattern.
/**
 * Verifies that replacements listed after a urlmatch entry are applied to the
 * sample document.
 */
@Test
public void testUrlMatchesPattern() {
  // A urlmatch line followed by one replacement rule per metatag field.
  final String replaceRules =
      " urlmatch=.*.html\n"
          + " metatag.description=/this(.*)plugin/this awesome plugin/\n"
          + " metatag.keywords=/\\,/\\!/\n"
          + " metatag.author=/\\s+/ D. /\n";

  final Configuration config = NutchConfiguration.create();
  config.set("plugin.includes",
      "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
  config.set(INDEX_REPLACE_PROPERTY, replaceRules);
  config.set("metatags.names", "author,description,keywords");
  config.set("index.parse.md",
      "metatag.author,metatag.description,metatag.keywords");
  // Not needed by the test itself; handy while debugging the filter.
  config.set("http.timeout", "99999999999");

  // Parse the sample file and push it through the index filters.
  final NutchDocument filtered = parseAndFilterFile(sampleFile, config);

  // All three metatag fields are expected to have been rewritten.
  Assert.assertEquals(
      "With this awesome plugin, I control the description! Bwuhuhuhaha!",
      filtered.getFieldValue("metatag.description"));
  Assert.assertEquals("Breathtaking! Riveting! Two Thumbs Up!",
      filtered.getFieldValue("metatag.keywords"));
  Assert.assertEquals("Peter D. Ciuffetti",
      filtered.getFieldValue("metatag.author"));
}
use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class TestIndexReplace method testReplacementsRunInSpecifedOrder.
/**
 * Verifies that replacement rules are applied in the order they are listed.
 *
 * Five rules target the same field, each one matching the output of the rule
 * before it, so the final expected value appears only when the rules run in
 * the configured sequence.
 */
@Test
public void testReplacementsRunInSpecifedOrder() {
  // Chain of rules: plugin -> amazing -> valuable -> cool -> wicked -> awesome.
  final String replaceRules =
      " metatag.description=/this plugin/this amazing plugin/\n"
          + " metatag.description=/this amazing plugin/this valuable plugin/\n"
          + " metatag.description=/this valuable plugin/this cool plugin/\n"
          + " metatag.description=/this cool plugin/this wicked plugin/\n"
          + " metatag.description=/this wicked plugin/this awesome plugin/\n";

  final Configuration config = NutchConfiguration.create();
  config.set("plugin.includes",
      "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
  config.set(INDEX_REPLACE_PROPERTY, replaceRules);
  config.set("metatags.names", "author,description,keywords");
  config.set("index.parse.md",
      "metatag.author,metatag.description,metatag.keywords");
  // Not needed by the test itself; handy while debugging the filter.
  config.set("http.timeout", "99999999999");

  // Parse the sample file and push it through the index filters.
  final NutchDocument filtered = parseAndFilterFile(sampleFile, config);

  // The description must hold the value produced by the last rule in the chain.
  Assert.assertEquals(
      "With this awesome plugin, I control the description! Bwuhuhuhaha!",
      filtered.getFieldValue("metatag.description"));
}
use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class TestIndexReplace method parseAndFilterFile.
/**
 * Run a test file through the Nutch parser and index filters.
 *
 * The file is fetched through the protocol resolved for its {@code file:} URL,
 * parsed, and then passed in turn through the basic, metadata and replace
 * indexing filters. Any exception raised along the way is printed and turned
 * into a test failure via {@link Assert#fail(String)}.
 *
 * @param fileName
 *          name of the file, resolved relative to the sample directory
 * @param conf
 *          configuration handed to the indexing filters, the protocol factory
 *          and the parser
 * @return the Nutch document with the replace indexer applied
 */
public NutchDocument parseAndFilterFile(String fileName, Configuration conf) {
  NutchDocument doc = new NutchDocument();

  BasicIndexingFilter basicIndexer = new BasicIndexingFilter();
  basicIndexer.setConf(conf);
  Assert.assertNotNull(basicIndexer);

  MetadataIndexer metaIndexer = new MetadataIndexer();
  metaIndexer.setConf(conf);
  // Check the metadata indexer itself; this previously re-checked basicIndexer.
  Assert.assertNotNull(metaIndexer);

  ReplaceIndexer replaceIndexer = new ReplaceIndexer();
  replaceIndexer.setConf(conf);
  Assert.assertNotNull(replaceIndexer);

  try {
    String urlString = "file:" + sampleDir + fileSeparator + fileName;
    Text text = new Text(urlString);
    CrawlDatum crawlDatum = new CrawlDatum();

    // Fetch the raw content and parse it.
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(text, crawlDatum).getContent();
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

    // Fixed fetch time, set before the datum is handed to the filters.
    crawlDatum.setFetchTime(100L);
    Inlinks inlinks = new Inlinks();

    // Filter order matters: the replace indexer runs last, on the fields
    // added by the two filters before it.
    doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks);
    doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks);
    doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks);
  } catch (Exception e) {
    // Keep the stack trace visible in the test output, then fail the test.
    e.printStackTrace();
    Assert.fail(e.toString());
  }
  return doc;
}
use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class TestIndexReplace method testInvalidPatterns.
/**
 * Verifies that invalid property settings are handled and ignored.
 *
 * The configured rules are deliberately broken so that property parsing and
 * Pattern.compile fail. The expected outcome is that no failure is raised and
 * the targeted fields are left unmodified by the filter.
 */
@Test
public void testInvalidPatterns() {
  // In order: an invalid pattern, invalid flags, and an incomplete property.
  final String brokenRules =
      " metatag.description=/this\\s+**plugin/this awesome plugin/\n"
          + " metatag.keywords=/\\,/\\!/what\n"
          + " metatag.author=#notcomplete";

  final Configuration config = NutchConfiguration.create();
  config.set("plugin.includes",
      "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
  config.set(INDEX_REPLACE_PROPERTY, brokenRules);
  config.set("metatags.names", "author,description,keywords");
  config.set("index.parse.md",
      "metatag.author,metatag.description,metatag.keywords");
  // Not needed by the test itself; handy while debugging the filter.
  config.set("http.timeout", "99999999999");

  // Parse the sample file and push it through the index filters.
  final NutchDocument filtered = parseAndFilterFile(sampleFile, config);

  // None of the metatag fields may have been altered.
  Assert.assertEquals(
      "With this plugin, I control the description! Bwuhuhuhaha!",
      filtered.getFieldValue("metatag.description"));
  Assert.assertEquals("Breathtaking, Riveting, Two Thumbs Up!",
      filtered.getFieldValue("metatag.keywords"));
  Assert.assertEquals("Peter Ciuffetti",
      filtered.getFieldValue("metatag.author"));
}
Aggregations