use of org.apache.nutch.crawl.Inlinks in project nutch by apache.
the class TestIndexReplace method parseAndFilterFile.
/**
 * Runs a sample file through the Nutch parser and then through the basic,
 * metadata and replace indexing filters, in that order.
 *
 * <p>Any exception raised while fetching, parsing or filtering the file fails
 * the calling test with an {@link AssertionError} that carries the original
 * exception as its cause.
 *
 * @param fileName
 *          name of the sample file, resolved against {@code sampleDir}
 * @param conf
 *          configuration handed to each filter, the protocol factory and the
 *          parser
 * @return the Nutch document with the replace indexer applied
 */
public NutchDocument parseAndFilterFile(String fileName, Configuration conf) {
  NutchDocument doc = new NutchDocument();

  BasicIndexingFilter basicIndexer = new BasicIndexingFilter();
  basicIndexer.setConf(conf);
  Assert.assertNotNull(basicIndexer);

  MetadataIndexer metaIndexer = new MetadataIndexer();
  metaIndexer.setConf(conf);
  // Check the indexer that was just built (previously re-checked basicIndexer).
  Assert.assertNotNull(metaIndexer);

  ReplaceIndexer replaceIndexer = new ReplaceIndexer();
  replaceIndexer.setConf(conf);
  Assert.assertNotNull(replaceIndexer);

  try {
    String urlString = "file:" + sampleDir + fileSeparator + fileName;
    Text text = new Text(urlString);
    CrawlDatum crawlDatum = new CrawlDatum();

    // Fetch the file content through the protocol registered for the URL.
    Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
    Content content = protocol.getProtocolOutput(text, crawlDatum).getContent();
    Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());

    crawlDatum.setFetchTime(100L);
    Inlinks inlinks = new Inlinks();

    // Each filter receives the document produced by the previous one.
    doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks);
    doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks);
    doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks);
  } catch (Exception e) {
    // Same error type and message as Assert.fail(e.toString()), but the cause
    // (and therefore its stack trace) is kept in the test report.
    throw new AssertionError(e.toString(), e);
  }
  return doc;
}
use of org.apache.nutch.crawl.Inlinks in project nutch by apache.
the class TestStaticFieldIndexerTest method setUp.
/**
 * Creates a fresh set of fixtures before every test: a default Nutch
 * configuration, an empty parse, a sample URL, an empty crawl datum, empty
 * inlinks and a new {@code StaticFieldIndexer}.
 *
 * @throws Exception
 *           if any fixture cannot be created
 */
@Before
public void setUp() throws Exception {
  // Configuration and the filter under test.
  conf = NutchConfiguration.create();
  filter = new StaticFieldIndexer();

  // Arguments later passed to the filter.
  url = new Text("http://nutch.apache.org/index.html");
  parse = new ParseImpl();
  crawlDatum = new CrawlDatum();
  inlinks = new Inlinks();
}
use of org.apache.nutch.crawl.Inlinks in project nutch by apache.
the class MimeTypeIndexingFilter method main.
/**
 * Main method for invoking this tool.
 *
 * <p>Loads the rules file named by the {@code rules} option, then reads lines
 * from standard input until end of input or the first empty line. Each line is
 * used as the content type of a synthetic document that is passed through the
 * filter; the line is echoed prefixed with {@code "+ "} when the filter
 * returns a document and {@code "- "} when it returns {@code null}.
 *
 * <p>NOTE(review): the {@code rules} option is declared required, so the
 * parser presumably rejects a lone {@code -h} before the help check below is
 * reached -- confirm against the Commons CLI behaviour.
 *
 * @param args
 *          command-line arguments; {@code -rules <file>} names the rules file
 *          relative to the conf directory, {@code -h}/{@code -help} prints
 *          usage
 * @throws IOException
 *           if reading from standard input fails
 * @throws IndexingException
 *           if the filter fails on a document
 */
public static void main(String[] args) throws IOException, IndexingException {
  final String usage = "org.apache.nutch.indexer.filter.MimeTypeIndexingFilter";

  Option helpOpt = new Option("h", "help", false, "show this help message");
  Option rulesOpt = OptionBuilder.withArgName("file").hasArg()
      .withDescription("Rules file to be used in the tests relative to the conf directory")
      .isRequired().create("rules");

  Options options = new Options();
  options.addOption(helpOpt).addOption(rulesOpt);

  CommandLineParser parser = new GnuParser();
  HelpFormatter formatter = new HelpFormatter();
  String rulesFile;

  try {
    CommandLine cmd = parser.parse(options, args);
    if (cmd.hasOption("help") || !cmd.hasOption("rules")) {
      formatter.printHelp(usage, options, true);
      return;
    }
    rulesFile = cmd.getOptionValue("rules");
  } catch (UnrecognizedOptionException e) {
    formatter.printHelp(usage, options, true);
    return;
  } catch (Exception e) {
    LOG.error(StringUtils.stringifyException(e));
    // Kept so the failure is also visible on stderr, as before.
    e.printStackTrace();
    return;
  }

  MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
  Configuration conf = NutchConfiguration.create();
  conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
  filter.setConf(conf);

  // Decode stdin with an explicit charset instead of the platform default.
  // System.in is deliberately not closed here.
  BufferedReader in = new BufferedReader(
      new InputStreamReader(System.in, java.nio.charset.StandardCharsets.UTF_8));

  String line;
  while ((line = in.readLine()) != null && !line.isEmpty()) {
    // Build a minimal parse whose only meaningful metadata is the content type.
    Metadata metadata = new Metadata();
    metadata.set(Response.CONTENT_TYPE, line);
    ParseImpl parse = new ParseImpl("text",
        new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));

    NutchDocument doc = filter.filter(new NutchDocument(), parse,
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

    // "+" means the document was kept, "-" means the filter returned null.
    System.out.println((doc != null ? "+ " : "- ") + line);
  }
}
use of org.apache.nutch.crawl.Inlinks in project nutch by apache.
the class TestIndexingFilters method testNutchDocumentNullIndexingFilter.
/**
 * Checks that filtering a {@code null} NutchDocument yields {@code null}.
 *
 * @throws IndexingException
 *           if the indexing filters fail
 */
@Test
public void testNutchDocumentNullIndexingFilter() throws IndexingException {
  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");
  IndexingFilters filters = new IndexingFilters(conf);

  // Minimal, otherwise valid inputs; only the document itself is null.
  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], new Metadata());
  ParseImpl parse = new ParseImpl("text", parseData);
  Text url = new Text("http://www.example.com/");
  CrawlDatum datum = new CrawlDatum();
  Inlinks inlinks = new Inlinks();

  NutchDocument filtered = filters.filter(null, parse, url, datum, inlinks);
  Assert.assertNull(filtered);
}
use of org.apache.nutch.crawl.Inlinks in project nutch by apache.
the class TestIndexingFilters method testFilterCacheIndexingFilter.
/**
 * Checks that resetting the indexing filter order does not take effect: a
 * document filtered after {@code MetadataIndexer} was appended to the order is
 * expected to have as many fields as one filtered with
 * {@code BasicIndexingFilter} alone.
 *
 * @throws IndexingException
 *           if the indexing filters fail
 */
@Test
public void testFilterCacheIndexingFilter() throws IndexingException {
  final String basicFilterClass = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  final String metadataFilterClass = "org.apache.nutch.indexer.metadata.MetadataIndexer";

  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");

  // First pass: only the basic indexing filter is configured.
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilterClass);
  IndexingFilters basicOnlyFilters = new IndexingFilters(conf);
  NutchDocument firstInput = new NutchDocument();
  ParseData plainParseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], new Metadata());
  ParseImpl plainParse = new ParseImpl("text", plainParseData);
  NutchDocument basicOnlyDoc = basicOnlyFilters.filter(firstInput, plainParse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  // Content metadata that MetadataIndexer is configured to index.
  Metadata contentMetadata = new Metadata();
  contentMetadata.add("example", "data");
  conf.set("index.content.md", "example");

  // Second pass: append MetadataIndexer to the filter order.
  conf.set(IndexingFilters.INDEXINGFILTER_ORDER,
      basicFilterClass + " " + metadataFilterClass);
  IndexingFilters reorderedFilters = new IndexingFilters(conf);
  NutchDocument secondInput = new NutchDocument();
  ParseData metadataParseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], contentMetadata);
  ParseImpl metadataParse = new ParseImpl("text", metadataParseData);
  NutchDocument reorderedDoc = reorderedFilters.filter(secondInput, metadataParse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  // Both documents must expose the same number of fields.
  Assert.assertEquals(basicOnlyDoc.getFieldNames().size(),
      reorderedDoc.getFieldNames().size());
}
Aggregations