use of org.apache.nutch.indexer.metadata.MetadataIndexer in project nutch by apache.
the class TestIndexReplace method parseAndFilterFile.
/**
* Run a test file through the Nutch parser and index filters.
*
* @param fileName
* @param conf
* @return the Nutch document with the replace indexer applied
*/
public NutchDocument parseAndFilterFile(String fileName, Configuration conf) {
NutchDocument doc = new NutchDocument();
BasicIndexingFilter basicIndexer = new BasicIndexingFilter();
basicIndexer.setConf(conf);
Assert.assertNotNull(basicIndexer);
MetadataIndexer metaIndexer = new MetadataIndexer();
metaIndexer.setConf(conf);
Assert.assertNotNull(basicIndexer);
ReplaceIndexer replaceIndexer = new ReplaceIndexer();
replaceIndexer.setConf(conf);
Assert.assertNotNull(replaceIndexer);
try {
String urlString = "file:" + sampleDir + fileSeparator + fileName;
Text text = new Text(urlString);
CrawlDatum crawlDatum = new CrawlDatum();
Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
Content content = protocol.getProtocolOutput(text, crawlDatum).getContent();
Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
crawlDatum.setFetchTime(100L);
Inlinks inlinks = new Inlinks();
doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks);
doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks);
doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks);
} catch (Exception e) {
e.printStackTrace();
Assert.fail(e.toString());
}
return doc;
}
Aggregations