Search in sources:

Example 11 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

From the class TestLinksIndexingFilter, method testFilterInlinks.

/**
 * With {@code LinksIndexingFilter.LINKS_INLINKS_HOST} set to "true", feeds the filter two
 * inlinks (one from www.test.com, one from www.example.com) for the document URL
 * http://www.example.com/ and expects only the www.test.com inlink in the "inlinks" field.
 */
@Test
public void testFilterInlinks() throws Exception {
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
    filter.setConf(conf);

    // One inlink from a foreign host, one from the document's own host.
    Inlinks incoming = new Inlinks();
    incoming.add(new Inlink("http://www.test.com", "test"));
    incoming.add(new Inlink("http://www.example.com", "example"));

    NutchDocument emptyDoc = new NutchDocument();
    ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], metadata);
    ParseImpl parse = new ParseImpl("text", parseData);
    Text docUrl = new Text("http://www.example.com/");
    CrawlDatum datum = new CrawlDatum();
    NutchDocument indexed = filter.filter(emptyDoc, parse, docUrl, datum, incoming);

    Assert.assertEquals(1, indexed.getField("inlinks").getValues().size());
    Assert.assertEquals("Filter inlinks, allow only those from a different host", "http://www.test.com", indexed.getFieldValue("inlinks"));
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Inlink(org.apache.nutch.crawl.Inlink) Test(org.junit.Test)

Example 12 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

From the class TestMoreIndexingFilter, method assertContentType.

/**
 * Runs a freshly configured {@code MoreIndexingFilter} over a parse whose
 * {@code Response.CONTENT_TYPE} metadata is {@code source} and asserts that the
 * "type" field of the resulting document equals {@code expected}.
 *
 * @param conf     configuration handed to the filter
 * @param source   value stored under the Content-Type metadata key
 * @param expected value the "type" field must have after filtering
 * @throws IndexingException declared for the {@code filter} call
 */
private void assertContentType(Configuration conf, String source, String expected) throws IndexingException {
    Metadata parseMeta = new Metadata();
    parseMeta.add(Response.CONTENT_TYPE, source);

    MoreIndexingFilter moreFilter = new MoreIndexingFilter();
    moreFilter.setConf(conf);

    NutchDocument emptyDoc = new NutchDocument();
    ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], parseMeta);
    ParseImpl parse = new ParseImpl("text", parseData);
    Text docUrl = new Text("http://www.example.com/");
    NutchDocument indexed = moreFilter.filter(emptyDoc, parse, docUrl, new CrawlDatum(), new Inlinks());

    Assert.assertEquals("mime type not detected", expected, indexed.getFieldValue("type"));
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks)

Example 13 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

From the class TestMoreIndexingFilter, method testContentDispositionTitle.

/**
 * Covers two cases for a parse carrying a Content-Disposition of
 * {@code filename=filename.ext}: an empty document is expected to get "filename.ext" as its
 * title, while a document that already has a title is expected to keep it.
 */
@Test
public void testContentDispositionTitle() throws IndexingException {
    Configuration conf = NutchConfiguration.create();

    Metadata parseMeta = new Metadata();
    parseMeta.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");

    MoreIndexingFilter moreFilter = new MoreIndexingFilter();
    moreFilter.setConf(conf);

    Text docUrl = new Text("http://www.example.com/");
    ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], parseMeta);
    ParseImpl parse = new ParseImpl("text", parseData);

    // Case 1: document without a title.
    NutchDocument untitled = new NutchDocument();
    NutchDocument result = moreFilter.filter(untitled, parse, docUrl, new CrawlDatum(), new Inlinks());
    Assert.assertEquals("content-disposition not detected", "filename.ext", result.getFieldValue("title"));

    /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
    NutchDocument titled = new NutchDocument();
    titled.add("title", "title");
    result = moreFilter.filter(titled, parse, docUrl, new CrawlDatum(), new Inlinks());
    Assert.assertEquals("do not add second title by content-disposition", "title", result.getFieldValue("title"));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) NutchDocument(org.apache.nutch.indexer.NutchDocument) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Example 14 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

From the class TestMoreIndexingFilter, method testNoParts.

/**
 * Checks that, with {@code moreIndexingFilter.indexMimeTypeParts} set to false, filtering
 * http://nutch.apache.org/index.html yields a document whose "type" field holds exactly one
 * value, "text/html".
 *
 * @throws Exception if the filter throws; propagated unchanged so the test runner reports
 *         the original exception with its full stack trace
 * @since NUTCH-901
 */
@Test
public void testNoParts() throws Exception {
    Configuration conf = NutchConfiguration.create();
    conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    // Assert on the document the filter returns, so the null check below is meaningful.
    NutchDocument doc = filter.filter(new NutchDocument(), parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
    Assert.assertNotNull(doc);
    Assert.assertTrue(doc.getFieldNames().contains("type"));
    Assert.assertEquals(1, doc.getField("type").getValues().size());
    Assert.assertEquals("text/html", doc.getFieldValue("type"));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) IndexingException(org.apache.nutch.indexer.IndexingException) Test(org.junit.Test)

Example 15 with ParseData

use of org.apache.nutch.parse.ParseData in project nutch by apache.

From the class MimeTypeIndexingFilterTest, method setUp.

/**
 * Populates {@code parses}: slot {@code i} receives a {@code ParseImpl} whose metadata
 * carries {@code MIME_TYPES[i]} under {@code Response.CONTENT_TYPE}.
 */
@Before
public void setUp() throws Exception {
    for (int idx = 0; idx < MIME_TYPES.length; idx++) {
        Metadata typeMeta = new Metadata();
        typeMeta.add(Response.CONTENT_TYPE, MIME_TYPES[idx]);
        ParseData parseData = new ParseData(new ParseStatus(), "title", new Outlink[0], typeMeta);
        parses[idx] = new ParseImpl("text", parseData);
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) ParseData(org.apache.nutch.parse.ParseData) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) Before(org.junit.Before)

Aggregations

ParseData (org.apache.nutch.parse.ParseData)37 ParseImpl (org.apache.nutch.parse.ParseImpl)29 Text (org.apache.hadoop.io.Text)23 ParseStatus (org.apache.nutch.parse.ParseStatus)23 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)22 Outlink (org.apache.nutch.parse.Outlink)22 Inlinks (org.apache.nutch.crawl.Inlinks)19 Metadata (org.apache.nutch.metadata.Metadata)19 Test (org.junit.Test)19 NutchDocument (org.apache.nutch.indexer.NutchDocument)16 Configuration (org.apache.hadoop.conf.Configuration)14 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)14 Parse (org.apache.nutch.parse.Parse)9 URL (java.net.URL)7 ArrayList (java.util.ArrayList)6 ParseResult (org.apache.nutch.parse.ParseResult)6 ByteArrayInputStream (java.io.ByteArrayInputStream)5 IOException (java.io.IOException)5 Inlink (org.apache.nutch.crawl.Inlink)5 Content (org.apache.nutch.protocol.Content)5