Search in sources :

Example 36 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class TestMetatagParser method testMultiValueMetatags.

/**
 * Checks that several metatags sharing one name end up as a single metadata
 * entry carrying every one of their values.
 */
@Test
public void testMultiValueMetatags() {
    Configuration conf = NutchConfiguration.create();
    conf.set("metatags.names", "keywords,DC.creator");
    conf.set("index.parse.md", "metatag.keywords,metatag.dc.creator");
    Metadata parsed = parseMeta(sampleFileMultival, conf);
    final String missingPrefix = "One value of metatag with multiple values is missing: ";

    // Every expected creator must be among the parsed dc.creator values.
    Set<String> creators = new TreeSet<String>();
    for (String creator : parsed.getValues("metatag.dc.creator")) {
        creators.add(creator);
    }
    String[] expectedCreators = { "Doug Cutting", "Michael Cafarella" };
    for (int i = 0; i < expectedCreators.length; i++) {
        String expected = expectedCreators[i];
        Assert.assertTrue(missingPrefix + expected, creators.contains(expected));
    }

    // Same check for the keywords metatag, using a fresh set.
    Set<String> keywords = new TreeSet<String>();
    for (String keyword : parsed.getValues("metatag.keywords")) {
        keywords.add(keyword);
    }
    String[] expectedKeywords = { "robot d'indexation", "web crawler", "Webcrawler" };
    for (int i = 0; i < expectedKeywords.length; i++) {
        String expected = expectedKeywords[i];
        Assert.assertTrue(missingPrefix + expected, keywords.contains(expected));
    }
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) TreeSet(java.util.TreeSet) Metadata(org.apache.nutch.metadata.Metadata) Test(org.junit.Test)

Example 37 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class ZipParser method main.

/**
 * Command-line driver: reads the zip file named by the first argument into
 * memory, runs it through a {@code ZipParser} and prints the number of parse
 * results followed by the text and data of the parse stored under the file's
 * URL.
 *
 * @param args
 *          {@code args[0]} is the path of the zip file to parse
 * @throws IOException
 *           if the file cannot be opened or read completely, or is too large
 *           to be held in a single byte array
 */
public static void main(String[] args) throws IOException {
    if (args.length < 1) {
        System.out.println("ZipParser <zip_file>");
        System.exit(1);
    }
    File file = new File(args[0]);
    String url = "file:" + file.getCanonicalPath();

    // Size the buffer from the file length: InputStream.available() is only an
    // estimate of what can be read without blocking, not the file size.
    long length = file.length();
    if (length > Integer.MAX_VALUE) {
        throw new IOException("File too large to load into memory: " + file);
    }
    byte[] bytes = new byte[(int) length];

    // try-with-resources closes the stream even when a read fails.
    try (FileInputStream in = new FileInputStream(file)) {
        int offset = 0;
        // A single read() may return fewer bytes than requested, so keep
        // reading until the buffer is full or the stream ends.
        while (offset < bytes.length) {
            int count = in.read(bytes, offset, bytes.length - offset);
            if (count < 0) {
                throw new IOException("Unexpected end of file after " + offset
                    + " of " + bytes.length + " bytes: " + file);
            }
            offset += count;
        }
    }

    Configuration conf = NutchConfiguration.create();
    ZipParser parser = new ZipParser();
    parser.setConf(conf);
    Metadata meta = new Metadata();
    meta.add(Response.CONTENT_LENGTH, "" + file.length());
    ParseResult parseResult = parser.getParse(new Content(url, url, bytes, "application/zip", meta, conf));
    Parse p = parseResult.get(url);
    System.out.println(parseResult.size());
    // NOTE(review): presumably get(url) yields null when no parse is stored
    // under this URL - confirm against ParseResult; guard avoids an NPE below.
    if (p == null) {
        System.err.println("No parse available for " + url);
        System.exit(1);
    }
    System.out.println("Parse Text:");
    System.out.println(p.getText());
    System.out.println("Parse Data:");
    System.out.println(p.getData());
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseResult(org.apache.nutch.parse.ParseResult) Content(org.apache.nutch.protocol.Content) Parse(org.apache.nutch.parse.Parse) Metadata(org.apache.nutch.metadata.Metadata) File(java.io.File) FileInputStream(java.io.FileInputStream)

Example 38 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class TestIndexingFilters method testNutchDocumentNullIndexingFilter.

/**
 * Verifies that filtering a {@code null} NutchDocument produces a
 * {@code null} result.
 */
@Test
public void testNutchDocumentNullIndexingFilter() throws IndexingException {
    Configuration conf = NutchConfiguration.create();
    conf.addResource("nutch-default.xml");
    conf.addResource("crawl-tests.xml");
    IndexingFilters filters = new IndexingFilters(conf);

    // Build the filter arguments one by one instead of nesting constructors.
    ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
    ParseImpl parse = new ParseImpl("text", parseData);
    Text url = new Text("http://www.example.com/");
    CrawlDatum datum = new CrawlDatum();
    Inlinks inlinks = new Inlinks();

    NutchDocument filtered = filters.filter(null, parse, url, datum, inlinks);
    Assert.assertNull(filtered);
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Example 39 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class TestIndexingFilters method testFilterCacheIndexingFilter.

/**
 * Verifies that changing the indexing filter order on a configuration from
 * which an {@code IndexingFilters} was already built does not take effect:
 * documents filtered before and after the change are expected to expose the
 * same number of fields.
 *
 * @throws IndexingException
 */
@Test
public void testFilterCacheIndexingFilter() throws IndexingException {
    Configuration conf = NutchConfiguration.create();
    conf.addResource("nutch-default.xml");
    conf.addResource("crawl-tests.xml");

    // First pass: only the basic indexing filter is configured.
    String basicFilter = "org.apache.nutch.indexer.basic.BasicIndexingFilter";
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter);
    IndexingFilters firstFilters = new IndexingFilters(conf);
    NutchDocument firstInput = new NutchDocument();
    ParseData firstData = new ParseData(new ParseStatus(), "title", new Outlink[0], new Metadata());
    ParseImpl firstParse = new ParseImpl("text", firstData);
    NutchDocument firstDoc = firstFilters.filter(firstInput, firstParse,
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

    // Second pass: same configuration object, now also naming the metadata indexer.
    String metadataFilter = "org.apache.nutch.indexer.metadata.MetadataIndexer";
    // content metadata carrying the property the metadata indexer is told to index
    Metadata contentMeta = new Metadata();
    contentMeta.add("example", "data");
    conf.set("index.content.md", "example");
    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, basicFilter + " " + metadataFilter);
    IndexingFilters secondFilters = new IndexingFilters(conf);
    NutchDocument secondInput = new NutchDocument();
    ParseData secondData = new ParseData(new ParseStatus(), "title", new Outlink[0], contentMeta);
    ParseImpl secondParse = new ParseImpl("text", secondData);
    NutchDocument secondDoc = secondFilters.filter(secondInput, secondParse,
        new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());

    int firstFieldCount = firstDoc.getFieldNames().size();
    int secondFieldCount = secondDoc.getFieldNames().size();
    Assert.assertEquals(firstFieldCount, secondFieldCount);
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Example 40 with Metadata

use of org.apache.nutch.metadata.Metadata in project nutch by apache.

the class TestParseData method testMaxOutlinks.

/**
 * Passes a {@code ParseData} holding 128 outlinks through
 * {@code WritableTestUtils.writeRead} and checks that the returned copy
 * reports the same number of outlinks.
 */
@Test
public void testMaxOutlinks() throws Exception {
    final int total = 128;
    Outlink[] links = new Outlink[total];
    int idx = 0;
    while (idx < total) {
        links[idx] = new Outlink("http://outlink.com/" + idx, "Outlink" + idx);
        idx++;
    }
    ParseData before = new ParseData(ParseStatus.STATUS_SUCCESS, "Max Outlinks Title", links, new Metadata());
    Object roundTripped = WritableTestUtils.writeRead(before, null);
    ParseData after = (ParseData) roundTripped;
    Assert.assertEquals(total, after.getOutlinks().length);
}
Also used : Metadata(org.apache.nutch.metadata.Metadata) Test(org.junit.Test)

Aggregations

Metadata (org.apache.nutch.metadata.Metadata): 42, Configuration (org.apache.hadoop.conf.Configuration): 20, NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 20, ParseData (org.apache.nutch.parse.ParseData): 19, Content (org.apache.nutch.protocol.Content): 18, Test (org.junit.Test): 17, Text (org.apache.hadoop.io.Text): 16, Parse (org.apache.nutch.parse.Parse): 16, ParseImpl (org.apache.nutch.parse.ParseImpl): 15, CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 14, Inlinks (org.apache.nutch.crawl.Inlinks): 11, Outlink (org.apache.nutch.parse.Outlink): 10, ParseStatus (org.apache.nutch.parse.ParseStatus): 9, NutchDocument (org.apache.nutch.indexer.NutchDocument): 7, ParseResult (org.apache.nutch.parse.ParseResult): 7, FileInputStream (java.io.FileInputStream): 5, IOException (java.io.IOException): 5, File (java.io.File): 4, ArrayList (java.util.ArrayList): 4, ParseUtil (org.apache.nutch.parse.ParseUtil): 4