Use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
Class TestIndexReplace, method testUrlNotMatchesPattern.
/**
 * Checks the case where the {@code urlmatch} pattern does not match.
 *
 * The replacement rules are preceded by {@code urlmatch=.*.xml}; the
 * expected outcome is that the filter leaves the description, keywords and
 * author metatag fields unchanged.
 */
@Test
public void testUrlNotMatchesPattern() {
  // Rules that would rewrite all three metatags if the urlmatch line applied.
  String replaceRules = " urlmatch=.*.xml\n"
      + " metatag.description=/this(.*)plugin/this awesome plugin/\n"
      + " metatag.keywords=/\\,/\\!/\n"
      + " metatag.author=/\\s+/ D. /\n";

  Configuration configuration = NutchConfiguration.create();
  // Metatag extraction and indexing setup.
  configuration.set("metatags.names", "author,description,keywords");
  configuration.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords");
  // Plugin selection and the replacement rules under test.
  configuration.set("plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
  configuration.set(INDEX_REPLACE_PROPERTY, replaceRules);
  // Not required for the test; a long timeout helps when stepping through
  // the filter in a debugger.
  configuration.set("http.timeout", "99999999999");

  // Parse the sample file and apply the index filters.
  NutchDocument filtered = parseAndFilterFile(sampleFile, configuration);

  // Every metatag must still carry its original value.
  String untouchedDescription = "With this plugin, I control the description! Bwuhuhuhaha!";
  String untouchedKeywords = "Breathtaking, Riveting, Two Thumbs Up!";
  String untouchedAuthor = "Peter Ciuffetti";
  Assert.assertEquals(untouchedDescription, filtered.getFieldValue("metatag.description"));
  Assert.assertEquals(untouchedKeywords, filtered.getFieldValue("metatag.keywords"));
  Assert.assertEquals(untouchedAuthor, filtered.getFieldValue("metatag.author"));
}
Use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
Class TestIndexReplace, method testReplacementsWithFlags.
/**
 * Exercises the optional flags suffix of a replacement rule.
 *
 * The rule ends in {@code 2}, which is the value of
 * {@code Pattern.CASE_INSENSITIVE}. The pattern is written in upper case and
 * is expected to match the text regardless of its case.
 */
@Test
public void testReplacementsWithFlags() {
  Configuration configuration = NutchConfiguration.create();
  // Metatag extraction and indexing setup.
  configuration.set("metatags.names", "author,description,keywords");
  configuration.set("index.parse.md", "metatag.author,metatag.description,metatag.keywords");
  // Plugin selection.
  configuration.set("plugin.includes", "protocol-file|urlfilter-regex|parse-(html|metatags)|index-(basic|anchor|metadata|static|replace)|urlnormalizer-(pass|regex|basic)");
  // Single rule under test: upper-case pattern plus the flags value 2.
  configuration.set(INDEX_REPLACE_PROPERTY, " metatag.description=/THIS PLUGIN/this awesome plugin/2");
  // Not required for the test; a long timeout helps when stepping through
  // the filter in a debugger.
  configuration.set("http.timeout", "99999999999");

  // Parse the sample file and apply the index filters.
  NutchDocument filtered = parseAndFilterFile(sampleFile, configuration);

  // The description must show the result of the case-insensitive
  // replacement.
  String replacedDescription = "With this awesome plugin, I control the description! Bwuhuhuhaha!";
  Assert.assertEquals(replacedDescription, filtered.getFieldValue("metatag.description"));
}
Use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
Class TestStaticFieldIndexerTest, method testEmptyIndexStatic.
/**
 * Test that empty {@code index.static} does not add anything to the document.
 *
 * NOTE(review): this method does not set {@code index.static} itself; it
 * presumably relies on the shared {@code conf} fixture leaving the property
 * empty - confirm against the set-up code.
 *
 * @throws Exception
 *           if the filter throws; the exception is propagated so the test
 *           runner reports it with its original stack trace
 */
@Test
public void testEmptyIndexStatic() throws Exception {
  Assert.assertNotNull(filter);
  filter.setConf(conf);

  NutchDocument doc = new NutchDocument();
  // No try/catch here: the method declares "throws Exception", so an
  // unexpected exception already makes the test unsuccessful, and letting it
  // propagate keeps the cause and stack trace in the test report (catching it
  // and calling Assert.fail(e.getMessage()) drops both and may yield a null
  // message).
  filter.filter(doc, parse, url, crawlDatum, inlinks);

  Assert.assertNotNull(doc);
  Assert.assertTrue("tests if no field is set for empty index.static", doc.getFieldNames().isEmpty());
}
Use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
Class TestStaticFieldIndexerTest, method testCustomMulticharacterDelimiters.
/**
 * Test for NUTCH-2052 custom delimiters in index.static, using separators
 * that are each more than one character long.
 *
 * The configured value contains four field entries; the assertions expect
 * exactly three fields in the document (field1, field2 and field4), i.e.
 * the entry "field3", which has no key separator and no value, is expected
 * not to produce a field.
 *
 * @throws Exception
 *           if the filter throws; the exception is propagated so the test
 *           runner reports it with its original stack trace
 */
@Test
public void testCustomMulticharacterDelimiters() throws Exception {
  // Multi-character separators: fields by "\n\n", key/value by "\t\t",
  // multiple values by "***".
  conf.set("index.static.fieldsep", "\n\n");
  conf.set("index.static.keysep", "\t\t");
  conf.set("index.static.valuesep", "***");
  conf.set("index.static", "field1\t\tval1\n\n" + "field2\t\tval2***val3\n\n" + "field3\n\n" + "field4\t\tval4\n\n\n\n");
  Assert.assertNotNull(filter);
  filter.setConf(conf);

  NutchDocument doc = new NutchDocument();
  // No try/catch here: the method declares "throws Exception", so an
  // unexpected exception already makes the test unsuccessful, and letting it
  // propagate keeps the cause and stack trace in the test report (catching it
  // and calling Assert.fail(e.getMessage()) drops both and may yield a null
  // message).
  filter.filter(doc, parse, url, crawlDatum, inlinks);

  Assert.assertNotNull(doc);
  Assert.assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
  Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size());
  Assert.assertTrue("test if doc has field1", doc.getField("field1").getValues().contains("val1"));
  Assert.assertTrue("test if doc has field2", doc.getField("field2").getValues().contains("val2"));
  Assert.assertTrue("test if doc has field4", doc.getField("field4").getValues().contains("val4"));
}
Use of org.apache.nutch.indexer.NutchDocument in project nutch by apache.
Class TestStaticFieldIndexerTest, method testCustomDelimiters.
/**
 * Test for NUTCH-2052 custom delimiters in index.static, using
 * single-character separators.
 *
 * The configured value contains four field entries, some padded with
 * spaces around keys and values; the assertions expect exactly three fields
 * in the document (field1, field2 and field4) with the values "val1",
 * "val2" and "val4" present without the surrounding spaces.
 *
 * @throws Exception
 *           if the filter throws; the exception is propagated so the test
 *           runner reports it with its original stack trace
 */
@Test
public void testCustomDelimiters() throws Exception {
  // Single-character separators: fields by ">", key/value by "=",
  // multiple values by "|".
  conf.set("index.static.fieldsep", ">");
  conf.set("index.static.keysep", "=");
  conf.set("index.static.valuesep", "|");
  conf.set("index.static", "field1=val1>field2 = val2|val3 >field3>field4 =val4 > ");
  Assert.assertNotNull(filter);
  filter.setConf(conf);

  NutchDocument doc = new NutchDocument();
  // No try/catch here: the method declares "throws Exception", so an
  // unexpected exception already makes the test unsuccessful, and letting it
  // propagate keeps the cause and stack trace in the test report (catching it
  // and calling Assert.fail(e.getMessage()) drops both and may yield a null
  // message).
  filter.filter(doc, parse, url, crawlDatum, inlinks);

  Assert.assertNotNull(doc);
  Assert.assertFalse("test if doc is not empty", doc.getFieldNames().isEmpty());
  Assert.assertEquals("test if doc has 3 fields", 3, doc.getFieldNames().size());
  Assert.assertTrue("test if doc has field1", doc.getField("field1").getValues().contains("val1"));
  Assert.assertTrue("test if doc has field2", doc.getField("field2").getValues().contains("val2"));
  Assert.assertTrue("test if doc has field4", doc.getField("field4").getValues().contains("val4"));
}
Aggregations