use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class TestMoreIndexingFilter method testNoParts.
/**
 * Runs {@link MoreIndexingFilter} with
 * {@code moreIndexingFilter.indexMimeTypeParts} set to {@code false} and
 * checks that the resulting document carries exactly one {@code type} value,
 * {@code text/html}.
 *
 * @since NUTCH-901
 */
@Test
public void testNoParts() {
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);

  NutchDocument doc = new NutchDocument();
  ParseImpl parse = new ParseImpl("foo bar", new ParseData());
  Text url = new Text("http://nutch.apache.org/index.html");
  try {
    filter.filter(doc, parse, url, new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    // Fail with the original exception attached as the cause so the test
    // report contains the real stack trace; e.getMessage() alone may be null.
    throw new AssertionError("MoreIndexingFilter.filter threw " + e, e);
  }

  // With MIME type parts disabled only the full type is expected in the field.
  Assert.assertTrue(doc.getFieldNames().contains("type"));
  Assert.assertEquals(1, doc.getField("type").getValues().size());
  Assert.assertEquals("text/html", doc.getFieldValue("type"));
}
use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class TestIndexReplace method testReplacementsDifferentTarget.
/**
 * Exercises a replacement rule that names a separate target field.
 * The source field must be left unmodified and the target field must be
 * added, holding the replaced text.
 */
@Test
public void testReplacementsDifferentTarget() {
  Configuration configuration = NutchConfiguration.create();
  configuration.set("plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
  // Rule under test: read metatag.description, write the result to "new".
  configuration.set(INDEX_REPLACE_PROPERTY, " metatag.description:new=/this plugin/this awesome plugin/");
  configuration.set("metatags.names", "author,description,keywords");
  configuration.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords");
  // Optional: a very large timeout is handy while stepping through the filter.
  configuration.set("http.timeout", "99999999999");

  // Parse the sample file and push it through the index filters.
  NutchDocument filtered = parseAndFilterFile(sampleFile, configuration);

  // The source field must still hold its original text.
  final String untouchedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
  Assert.assertEquals(untouchedDescription, filtered.getFieldValue("metatag.description"));

  // The target field must have been created with the replaced text.
  final String replacedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
  Assert.assertEquals(replacedDescription, filtered.getFieldValue("new"));
}
use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class TestIndexReplace method testGlobalAndUrlNotMatchesPattern.
/**
 * Combines a global replacement for the description with replacements for
 * keywords and author that sit behind a {@code urlmatch} pattern.
 *
 * Only the global replacement is expected to take effect here; keywords and
 * author must come out unchanged.
 */
@Test
public void testGlobalAndUrlNotMatchesPattern() {
  // One global rule, followed by two rules listed after urlmatch=.*.xml.
  final String replaceRules =
      " metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n"
      + " urlmatch=.*.xml\n"
      + " metatag.keywords=/\\,/\\!/\n"
      + " metatag.author=/\\s+/ D. /\n";

  Configuration configuration = NutchConfiguration.create();
  configuration.set("plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
  configuration.set(INDEX_REPLACE_PROPERTY, replaceRules);
  configuration.set("metatags.names", "author,description,keywords");
  configuration.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords");
  // Optional: a very large timeout is handy while stepping through the filter.
  configuration.set("http.timeout", "99999999999");

  // Parse the sample file and push it through the index filters.
  NutchDocument filtered = parseAndFilterFile(sampleFile, configuration);

  // Description is rewritten by the global rule ...
  Assert.assertEquals(
      "With this awesome plugin, I control the description! Bwuhuhuhaha!",
      filtered.getFieldValue("metatag.description"));
  // ... while keywords and author keep their original values.
  Assert.assertEquals(
      "Breathtaking, Riveting, Two Thumbs Up!",
      filtered.getFieldValue("metatag.keywords"));
  Assert.assertEquals(
      "Peter Ciuffetti",
      filtered.getFieldValue("metatag.author"));
}
use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class TestIndexReplace method testGlobalAndUrlMatchesPattern.
/**
 * Combines a global replacement for the description with replacements for
 * keywords and author that sit behind a {@code urlmatch} pattern.
 *
 * All three replacements are expected to take effect. The description rule
 * also exercises replacement groups ({@code $1}).
 */
@Test
public void testGlobalAndUrlMatchesPattern() {
  // One global rule, followed by two rules listed after urlmatch=.*.html.
  final String replaceRules =
      " metatag.description=/this(.*)plugin/this$1awesome$1plugin/\n"
      + " urlmatch=.*.html\n"
      + " metatag.keywords=/\\,/\\!/\n"
      + " metatag.author=/\\s+/ D. /\n";

  Configuration configuration = NutchConfiguration.create();
  configuration.set("plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
  configuration.set(INDEX_REPLACE_PROPERTY, replaceRules);
  configuration.set("metatags.names", "author,description,keywords");
  configuration.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords");
  // Optional: a very large timeout is handy while stepping through the filter.
  configuration.set("http.timeout", "99999999999");

  // Parse the sample file and push it through the index filters.
  NutchDocument filtered = parseAndFilterFile(sampleFile, configuration);

  // Every configured metatag must show its replaced value.
  Assert.assertEquals(
      "With this awesome plugin, I control the description! Bwuhuhuhaha!",
      filtered.getFieldValue("metatag.description"));
  Assert.assertEquals(
      "Breathtaking! Riveting! Two Thumbs Up!",
      filtered.getFieldValue("metatag.keywords"));
  Assert.assertEquals(
      "Peter D. Ciuffetti",
      filtered.getFieldValue("metatag.author"));
}
use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
the class TestStaticFieldIndexerTest method testNormalScenario.
/**
 * Test that valid field:value pairs are added to the document.
 *
 * The {@code index.static} value used here mixes well-formed pairs with
 * irregular spacing, an entry that has no value ({@code field3}) and a
 * trailing empty entry; the assertions expect only {@code field1},
 * {@code field2} and {@code field4} to end up in the document.
 *
 * @throws Exception
 *           if the filter fails while processing the document
 */
@Test
public void testNormalScenario() throws Exception {
  conf.set("index.static", "field1:val1, field2 : val2 val3 , field3, field4 :val4 , ");
  Assert.assertNotNull(filter);
  filter.setConf(conf);

  NutchDocument doc = new NutchDocument();
  // Any exception propagates to the test runner, which reports it with its
  // full stack trace instead of a message-only failure.
  filter.filter(doc, parse, url, crawlDatum, inlinks);

  Assert.assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
  Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size());
  Assert.assertTrue("test if doc has field1", doc.getField("field1").getValues().contains("val1"));
  Assert.assertTrue("test if doc has field2", doc.getField("field2").getValues().contains("val2"));
  Assert.assertTrue("test if doc has field4", doc.getField("field4").getValues().contains("val4"));
}
Aggregations