Search in sources :

Example 36 with CrawlDatum

use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

the class TestLinksIndexingFilter method testFilterOutlinks.

/**
 * Enables the outlinks host option and checks that exactly one outlink is
 * indexed, and that it is the target URL of the first generated outlink.
 */
@Test
public void testFilterOutlinks() throws Exception {
    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
    filter.setConf(conf);

    Outlink[] generated = generateOutlinks();

    // Build the filter inputs step by step instead of nesting the constructors.
    ParseData parseData = new ParseData(new ParseStatus(), "title", generated, metadata);
    ParseImpl parse = new ParseImpl("text", parseData);
    Text pageUrl = new Text("http://www.example.com/");

    NutchDocument indexed = filter.filter(new NutchDocument(), parse, pageUrl, new CrawlDatum(), new Inlinks());

    int indexedOutlinkCount = indexed.getField("outlinks").getValues().size();
    Assert.assertEquals(1, indexedOutlinkCount);
    Assert.assertEquals("Filter outlinks, allow only those from a different host", generated[0].getToUrl(), indexed.getFieldValue("outlinks"));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Example 37 with CrawlDatum

use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

the class TestLinksIndexingFilter method testNoFilterInlinks.

/**
 * Disables the inlinks host option and checks that every supplied inlink is
 * indexed, including the one that comes from the page's own host.
 */
@Test
public void testNoFilterInlinks() throws Exception {
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
    filter.setConf(conf);

    // One inlink from a foreign host, one from the same host as the page URL.
    Inlinks incoming = new Inlinks();
    incoming.add(new Inlink("http://www.test.com", "test"));
    incoming.add(new Inlink("http://www.example.com", "example"));

    ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], metadata);
    ParseImpl parse = new ParseImpl("text", parseData);
    Text pageUrl = new Text("http://www.example.com/");

    NutchDocument indexed = filter.filter(new NutchDocument(), parse, pageUrl, new CrawlDatum(), incoming);

    int indexedInlinkCount = indexed.getField("inlinks").getValues().size();
    Assert.assertEquals("All inlinks must be indexed even those from the same host", incoming.size(), indexedInlinkCount);
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Inlink(org.apache.nutch.crawl.Inlink) Test(org.junit.Test)

Example 38 with CrawlDatum

use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

the class TestLinksIndexingFilter method testNoFilterOutlinks.

/**
 * Applies the configuration without setting any link option and checks that
 * the number of indexed outlinks equals the number of generated outlinks.
 */
@Test
public void testNoFilterOutlinks() throws Exception {
    filter.setConf(conf);

    Outlink[] generated = generateOutlinks();

    ParseData parseData = new ParseData(new ParseStatus(), "title", generated, metadata);
    ParseImpl parse = new ParseImpl("text", parseData);
    Text pageUrl = new Text("http://www.example.com/");

    NutchDocument indexed = filter.filter(new NutchDocument(), parse, pageUrl, new CrawlDatum(), new Inlinks());

    int indexedOutlinkCount = indexed.getField("outlinks").getValues().size();
    Assert.assertEquals("All outlinks must be indexed even those from the same host", generated.length, indexedOutlinkCount);
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Example 39 with CrawlDatum

use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

the class TestLinksIndexingFilter method testIndexOnlyHostPart.

/**
 * Turns on the inlinks-host, outlinks-host and only-hosts options together and
 * checks that links are indexed by host name only: the first outlink value and
 * the single remaining inlink value must both equal the host of
 * {@code http://www.test.com}.
 */
@Test
public void testIndexOnlyHostPart() throws Exception {
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
    filter.setConf(conf);

    Outlink[] generated = generateOutlinks(true);

    // Two inlinks share the www.test.com host; the third comes from the page's own host.
    Inlinks incoming = new Inlinks();
    incoming.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
    incoming.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
    incoming.add(new Inlink("http://www.example.com/my-first-awesome-example", "example"));

    ParseData parseData = new ParseData(new ParseStatus(), "title", generated, metadata);
    ParseImpl parse = new ParseImpl("text", parseData);
    Text pageUrl = new Text("http://www.example.com/");

    NutchDocument indexed = filter.filter(new NutchDocument(), parse, pageUrl, new CrawlDatum(), incoming);

    NutchField indexedOutlinks = indexed.getField("outlinks");
    // Both host assertions compare against the same expected host name.
    String expectedHost = new URL("http://www.test.com").getHost();

    Assert.assertEquals("Only the host portion of the outlink URL must be indexed", expectedHost, indexedOutlinks.getValues().get(0));

    int indexedInlinkCount = indexed.getField("inlinks").getValues().size();
    Assert.assertEquals("The inlinks coming from the same host must count only once", 1, indexedInlinkCount);
    Assert.assertEquals("Only the host portion of the inlinks URL must be indexed", expectedHost, indexed.getFieldValue("inlinks"));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchField(org.apache.nutch.indexer.NutchField) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Inlink(org.apache.nutch.crawl.Inlink) URL(java.net.URL) Test(org.junit.Test)

Example 40 with CrawlDatum

use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

the class TestIndexReplace method parseAndFilterFile.

/**
 * Run a test file through the Nutch parser and index filters.
 * <p>
 * The file is fetched through the protocol resolved for its {@code file:} URL,
 * parsed, and then passed through the basic, metadata and replace indexing
 * filters, in that order. Any exception raised along the way is printed and
 * fails the calling test via {@code Assert.fail}.
 *
 * @param fileName
 *          name of the sample file, appended to {@code sampleDir} to build the
 *          {@code file:} URL
 * @param conf
 *          configuration handed to the three indexing filters, the protocol
 *          factory and the parser
 * @return the Nutch document with the replace indexer applied
 */
public NutchDocument parseAndFilterFile(String fileName, Configuration conf) {
    NutchDocument doc = new NutchDocument();
    BasicIndexingFilter basicIndexer = new BasicIndexingFilter();
    basicIndexer.setConf(conf);
    Assert.assertNotNull(basicIndexer);
    MetadataIndexer metaIndexer = new MetadataIndexer();
    metaIndexer.setConf(conf);
    // Check the metadata indexer itself; the original re-checked basicIndexer here.
    Assert.assertNotNull(metaIndexer);
    ReplaceIndexer replaceIndexer = new ReplaceIndexer();
    replaceIndexer.setConf(conf);
    Assert.assertNotNull(replaceIndexer);
    try {
        String urlString = "file:" + sampleDir + fileSeparator + fileName;
        Text text = new Text(urlString);
        CrawlDatum crawlDatum = new CrawlDatum();
        Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
        Content content = protocol.getProtocolOutput(text, crawlDatum).getContent();
        Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
        // Fixed fetch time set before the filters run.
        crawlDatum.setFetchTime(100L);
        Inlinks inlinks = new Inlinks();
        // Each filter receives the document produced by the previous one.
        doc = basicIndexer.filter(doc, parse, text, crawlDatum, inlinks);
        doc = metaIndexer.filter(doc, parse, text, crawlDatum, inlinks);
        doc = replaceIndexer.filter(doc, parse, text, crawlDatum, inlinks);
    } catch (Exception e) {
        // Assert.fail only carries the message, so print the full trace first.
        e.printStackTrace();
        Assert.fail(e.toString());
    }
    return doc;
}
Also used : NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseUtil(org.apache.nutch.parse.ParseUtil) Parse(org.apache.nutch.parse.Parse) MetadataIndexer(org.apache.nutch.indexer.metadata.MetadataIndexer) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) ProtocolFactory(org.apache.nutch.protocol.ProtocolFactory) Content(org.apache.nutch.protocol.Content) BasicIndexingFilter(org.apache.nutch.indexer.basic.BasicIndexingFilter) Protocol(org.apache.nutch.protocol.Protocol)

Aggregations

CrawlDatum (org.apache.nutch.crawl.CrawlDatum)66 Text (org.apache.hadoop.io.Text)60 Test (org.junit.Test)31 Inlinks (org.apache.nutch.crawl.Inlinks)25 Configuration (org.apache.hadoop.conf.Configuration)24 ParseData (org.apache.nutch.parse.ParseData)22 ParseImpl (org.apache.nutch.parse.ParseImpl)21 NutchDocument (org.apache.nutch.indexer.NutchDocument)20 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)20 Content (org.apache.nutch.protocol.Content)19 Parse (org.apache.nutch.parse.Parse)15 Metadata (org.apache.nutch.metadata.Metadata)14 ParseStatus (org.apache.nutch.parse.ParseStatus)14 ParseUtil (org.apache.nutch.parse.ParseUtil)13 Protocol (org.apache.nutch.protocol.Protocol)13 ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory)13 URL (java.net.URL)11 Outlink (org.apache.nutch.parse.Outlink)11 IOException (java.io.IOException)7 ArrayList (java.util.ArrayList)5