use of org.apache.nutch.metadata.Metadata in project nutch by apache.
the class TestMetatagParser method testMultiValueMetatags.
/**
 * Checks that metatags occurring several times in the multi-value sample
 * file end up in the parse metadata with every one of their values.
 */
@Test
public void testMultiValueMetatags() {
  Configuration configuration = NutchConfiguration.create();
  configuration.set("metatags.names", "keywords,DC.creator");
  configuration.set("index.parse.md", "metatag.keywords,metatag.dc.creator");
  Metadata parsed = parseMeta(sampleFileMultival, configuration);
  final String missingPrefix = "One value of metatag with multiple values is missing: ";

  // DC.creator: each expected author must be among the parsed values.
  Set<String> creators = new TreeSet<String>();
  for (String creator : parsed.getValues("metatag.dc.creator")) {
    creators.add(creator);
  }
  for (String expected : new String[] { "Doug Cutting", "Michael Cafarella" }) {
    Assert.assertTrue(missingPrefix + expected, creators.contains(expected));
  }

  // keywords: each expected keyword must be among the parsed values.
  Set<String> keywords = new TreeSet<String>();
  for (String keyword : parsed.getValues("metatag.keywords")) {
    keywords.add(keyword);
  }
  for (String expected : new String[] { "robot d'indexation", "web crawler", "Webcrawler" }) {
    Assert.assertTrue(missingPrefix + expected, keywords.contains(expected));
  }
}
use of org.apache.nutch.metadata.Metadata in project nutch by apache.
the class ZipParser method main.
/**
 * Command-line entry point: loads the zip file named by the first argument,
 * runs it through a {@code ZipParser} and prints the number of parse results
 * followed by the parse text and parse data of the file's URL.
 *
 * @param args command-line arguments; {@code args[0]} is the path of the zip file
 * @throws IOException if the file cannot be opened or read completely
 */
public static void main(String[] args) throws IOException {
  if (args.length < 1) {
    System.out.println("ZipParser <zip_file>");
    System.exit(1);
  }
  File file = new File(args[0]);
  String url = "file:" + file.getCanonicalPath();

  // Size the buffer from the real file length: InputStream.available() is
  // only an estimate of what can be read without blocking.
  long length = file.length();
  if (length > Integer.MAX_VALUE) {
    throw new IOException("File too large to load into memory: " + file);
  }
  byte[] bytes = new byte[(int) length];

  // try-with-resources closes the stream even when reading fails.
  try (FileInputStream in = new FileInputStream(file)) {
    // A single read() may return fewer bytes than requested, so keep reading
    // until the buffer is full.
    int offset = 0;
    while (offset < bytes.length) {
      int count = in.read(bytes, offset, bytes.length - offset);
      if (count < 0) {
        throw new IOException("Unexpected end of file after " + offset + " of "
            + bytes.length + " bytes: " + file);
      }
      offset += count;
    }
  }

  Configuration conf = NutchConfiguration.create();
  ZipParser parser = new ZipParser();
  parser.setConf(conf);
  Metadata meta = new Metadata();
  meta.add(Response.CONTENT_LENGTH, String.valueOf(length));
  ParseResult parseResult = parser.getParse(new Content(url, url, bytes, "application/zip", meta, conf));
  Parse p = parseResult.get(url);
  System.out.println(parseResult.size());
  System.out.println("Parse Text:");
  System.out.println(p.getText());
  System.out.println("Parse Data:");
  System.out.println(p.getData());
}
use of org.apache.nutch.metadata.Metadata in project nutch by apache.
the class TestIndexingFilters method testNutchDocumentNullIndexingFilter.
/**
 * Checks that running the indexing filters on a {@code null} NutchDocument
 * yields {@code null}.
 *
 * @throws IndexingException if the indexing filters fail
 */
@Test
public void testNutchDocumentNullIndexingFilter() throws IndexingException {
  Configuration configuration = NutchConfiguration.create();
  configuration.addResource("nutch-default.xml");
  configuration.addResource("crawl-tests.xml");
  IndexingFilters indexingFilters = new IndexingFilters(configuration);

  // Minimal inputs for the filter chain; only the document itself is null.
  ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
  ParseImpl parse = new ParseImpl("text", parseData);
  Text url = new Text("http://www.example.com/");
  CrawlDatum datum = new CrawlDatum();
  Inlinks inlinks = new Inlinks();

  NutchDocument filtered = indexingFilters.filter(null, parse, url, datum, inlinks);
  Assert.assertNull(filtered);
}
use of org.apache.nutch.metadata.Metadata in project nutch by apache.
the class TestIndexingFilters method testFilterCacheIndexingFilter.
/**
 * Checks that changing the indexing filter order on a configuration that was
 * already used to build an {@code IndexingFilters} does not take effect: the
 * documents produced before and after the change must expose the same number
 * of field names.
 *
 * @throws IndexingException if the indexing filters fail
 */
@Test
public void testFilterCacheIndexingFilter() throws IndexingException {
  Configuration configuration = NutchConfiguration.create();
  configuration.addResource("nutch-default.xml");
  configuration.addResource("crawl-tests.xml");

  // First pass: only the basic indexing filter is configured.
  String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
  configuration.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter);
  IndexingFilters firstFilters = new IndexingFilters(configuration);
  ParseData plainData = new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
  ParseImpl plainParse = new ParseImpl("text", plainData);
  NutchDocument firstDoc = firstFilters.filter(new NutchDocument(), plainParse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  // Second pass: additionally request the metadata indexer.
  String metadataFilter = "org.apache.nutch.indexer.metadata.MetadataIndexer";
  // content metadata that the metadata indexer is configured to pick up
  Metadata contentMeta = new Metadata();
  contentMeta.add("example", "data");
  // content metadata property defined in MetadataIndexer
  configuration.set("index.content.md", "example");
  // append the MetadataIndexer filter to the configured order
  configuration.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter + " " + metadataFilter);
  IndexingFilters secondFilters = new IndexingFilters(configuration);
  ParseData metaData = new ParseData(new ParseStatus(), "title", new Outlink[0], contentMeta);
  ParseImpl metaParse = new ParseImpl("text", metaData);
  NutchDocument secondDoc = secondFilters.filter(new NutchDocument(), metaParse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

  int expectedFieldCount = firstDoc.getFieldNames().size();
  int actualFieldCount = secondDoc.getFieldNames().size();
  Assert.assertEquals(expectedFieldCount, actualFieldCount);
}
use of org.apache.nutch.metadata.Metadata in project nutch by apache.
the class TestParseData method testMaxOutlinks.
/**
 * Passes a ParseData holding 128 outlinks through
 * {@code WritableTestUtils.writeRead} and checks that the returned copy
 * reports the same number of outlinks.
 *
 * @throws Exception if the write/read round trip fails
 */
@Test
public void testMaxOutlinks() throws Exception {
  final int outlinkCount = 128;
  Outlink[] links = new Outlink[outlinkCount];
  for (int n = 0; n < outlinkCount; n++) {
    String target = "http://outlink.com/" + n;
    String anchor = "Outlink" + n;
    links[n] = new Outlink(target, anchor);
  }

  ParseData before = new ParseData(ParseStatus.STATUS_SUCCESS, "Max Outlinks Title", links, new Metadata());
  ParseData after = (ParseData) WritableTestUtils.writeRead(before, null);

  Assert.assertEquals(links.length, after.getOutlinks().length);
}
Aggregations