Search in sources :

Example 21 with ParseImpl

use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

the class TestLinksIndexingFilter method testNoFilterInlinks.

@Test
public void testNoFilterInlinks() throws Exception {
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "false");
    filter.setConf(conf);
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://www.test.com", "test"));
    inlinks.add(new Inlink("http://www.example.com", "example"));
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)), new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
    Assert.assertEquals("All inlinks must be indexed even those from the same host", inlinks.size(), doc.getField("inlinks").getValues().size());
}
Also used : ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Inlink(org.apache.nutch.crawl.Inlink) Test(org.junit.Test)

Example 22 with ParseImpl

use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

the class TestLinksIndexingFilter method testNoFilterOutlinks.

@Test
public void testNoFilterOutlinks() throws Exception {
    filter.setConf(conf);
    Outlink[] outlinks = generateOutlinks();
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", outlinks, metadata)), new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    Assert.assertEquals("All outlinks must be indexed even those from the same host", outlinks.length, doc.getField("outlinks").getValues().size());
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)

Example 23 with ParseImpl

use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

the class TestLinksIndexingFilter method testIndexOnlyHostPart.

@Test
public void testIndexOnlyHostPart() throws Exception {
    conf.set(LinksIndexingFilter.LINKS_INLINKS_HOST, "true");
    conf.set(LinksIndexingFilter.LINKS_OUTLINKS_HOST, "true");
    conf.set(LinksIndexingFilter.LINKS_ONLY_HOSTS, "true");
    filter.setConf(conf);
    Outlink[] outlinks = generateOutlinks(true);
    Inlinks inlinks = new Inlinks();
    inlinks.add(new Inlink("http://www.test.com/one-awesome-page", "test"));
    inlinks.add(new Inlink("http://www.test.com/other-awesome-page", "test"));
    inlinks.add(new Inlink("http://www.example.com/my-first-awesome-example", "example"));
    NutchDocument doc = filter.filter(new NutchDocument(), new ParseImpl("text", new ParseData(new ParseStatus(), "title", outlinks, metadata)), new Text("http://www.example.com/"), new CrawlDatum(), inlinks);
    NutchField docOutlinks = doc.getField("outlinks");
    Assert.assertEquals("Only the host portion of the outlink URL must be indexed", new URL("http://www.test.com").getHost(), docOutlinks.getValues().get(0));
    Assert.assertEquals("The inlinks coming from the same host must count only once", 1, doc.getField("inlinks").getValues().size());
    Assert.assertEquals("Only the host portion of the inlinks URL must be indexed", new URL("http://www.test.com").getHost(), doc.getFieldValue("inlinks"));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchField(org.apache.nutch.indexer.NutchField) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Inlink(org.apache.nutch.crawl.Inlink) URL(java.net.URL) Test(org.junit.Test)

Example 24 with ParseImpl

use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

the class TestStaticFieldIndexerTest method setUp.

@Before
public void setUp() throws Exception {
    conf = NutchConfiguration.create();
    parse = new ParseImpl();
    url = new Text("http://nutch.apache.org/index.html");
    crawlDatum = new CrawlDatum();
    inlinks = new Inlinks();
    filter = new StaticFieldIndexer();
}
Also used : ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Before(org.junit.Before)

Example 25 with ParseImpl

use of org.apache.nutch.parse.ParseImpl in project nutch by apache.

the class MimeTypeIndexingFilter method main.

/**
 * Main method for invoking this tool
 *
 * @throws IOException
 * @throws IndexingException
 */
public static void main(String[] args) throws IOException, IndexingException {
    Option helpOpt = new Option("h", "help", false, "show this help message");
    Option rulesOpt = OptionBuilder.withArgName("file").hasArg().withDescription("Rules file to be used in the tests relative to the conf directory").isRequired().create("rules");
    Options options = new Options();
    options.addOption(helpOpt).addOption(rulesOpt);
    CommandLineParser parser = new GnuParser();
    HelpFormatter formatter = new HelpFormatter();
    String rulesFile;
    try {
        CommandLine line = parser.parse(options, args);
        if (line.hasOption("help") || !line.hasOption("rules")) {
            formatter.printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter", options, true);
            return;
        }
        rulesFile = line.getOptionValue("rules");
    } catch (UnrecognizedOptionException e) {
        formatter.printHelp("org.apache.nutch.indexer.filter.MimeTypeIndexingFilter", options, true);
        return;
    } catch (Exception e) {
        LOG.error(StringUtils.stringifyException(e));
        e.printStackTrace();
        return;
    }
    MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter();
    Configuration conf = NutchConfiguration.create();
    conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, rulesFile);
    filter.setConf(conf);
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
    String line;
    while ((line = in.readLine()) != null && !line.isEmpty()) {
        Metadata metadata = new Metadata();
        metadata.set(Response.CONTENT_TYPE, line);
        ParseImpl parse = new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
        NutchDocument doc = filter.filter(new NutchDocument(), parse, new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
        if (doc != null) {
            System.out.print("+ ");
            System.out.println(line);
        } else {
            System.out.print("- ");
            System.out.println(line);
        }
    }
}
Also used : Outlink(org.apache.nutch.parse.Outlink) Options(org.apache.commons.cli.Options) Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) InputStreamReader(java.io.InputStreamReader) NutchDocument(org.apache.nutch.indexer.NutchDocument) GnuParser(org.apache.commons.cli.GnuParser) Metadata(org.apache.nutch.metadata.Metadata) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) UnrecognizedOptionException(org.apache.commons.cli.UnrecognizedOptionException) UnrecognizedOptionException(org.apache.commons.cli.UnrecognizedOptionException) IOException(java.io.IOException) IndexingException(org.apache.nutch.indexer.IndexingException) HelpFormatter(org.apache.commons.cli.HelpFormatter) ParseStatus(org.apache.nutch.parse.ParseStatus) CommandLine(org.apache.commons.cli.CommandLine) ParseData(org.apache.nutch.parse.ParseData) BufferedReader(java.io.BufferedReader) ParseImpl(org.apache.nutch.parse.ParseImpl) Option(org.apache.commons.cli.Option) CommandLineParser(org.apache.commons.cli.CommandLineParser)

Aggregations

ParseImpl (org.apache.nutch.parse.ParseImpl)31 ParseData (org.apache.nutch.parse.ParseData)29 Text (org.apache.hadoop.io.Text)21 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)21 ParseStatus (org.apache.nutch.parse.ParseStatus)21 Inlinks (org.apache.nutch.crawl.Inlinks)20 Outlink (org.apache.nutch.parse.Outlink)17 Test (org.junit.Test)17 NutchDocument (org.apache.nutch.indexer.NutchDocument)16 Metadata (org.apache.nutch.metadata.Metadata)15 Configuration (org.apache.hadoop.conf.Configuration)13 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)13 URL (java.net.URL)6 IOException (java.io.IOException)5 Inlink (org.apache.nutch.crawl.Inlink)5 Parse (org.apache.nutch.parse.Parse)5 ByteArrayInputStream (java.io.ByteArrayInputStream)4 ArrayList (java.util.ArrayList)4 ParseResult (org.apache.nutch.parse.ParseResult)4 MalformedURLException (java.net.MalformedURLException)3