Search in sources :

Example 1 with Generator

Use of org.apache.nutch.crawl.Generator in the Apache Nutch project.

The method testFetch of the class TestFetcher.

@Test
public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
    // Generate a seed list of pages served by the local test server.
    ArrayList<String> urls = new ArrayList<String>();
    addUrl(urls, "index.html");
    addUrl(urls, "pagea.html");
    addUrl(urls, "pageb.html");
    addUrl(urls, "dup_of_pagea.html");
    addUrl(urls, "nested_spider_trap.html");
    addUrl(urls, "exception.html");
    CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
    // Inject the seeds into the CrawlDb.
    Injector injector = new Injector(conf);
    injector.inject(crawldbPath, urlPath);
    // Generate a single segment containing all injected URLs.
    Generator g = new Generator(conf);
    Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE, false, false);
    long time = System.currentTimeMillis();
    // Enable parsing during the fetch. Set before the fetch job starts;
    // Fetcher holds the same shared conf instance, so the flag takes effect.
    conf.setBoolean("fetcher.parse", true);
    Fetcher fetcher = new Fetcher(conf);
    fetcher.fetch(generatedSegment[0], 1);
    time = System.currentTimeMillis() - time;
    // Verify politeness: time taken should exceed (num_of_pages + 1) * delay.
    int minimumTime = (int) ((urls.size() + 1) * 1000 * conf.getFloat("fetcher.server.delay", 5));
    Assert.assertTrue(time > minimumTime);
    // Verify content: collect every fetched page containing the test marker.
    Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME), "part-r-00000/data");
    ArrayList<String> handledurls = new ArrayList<String>();
    // try-with-resources guarantees the reader is closed even if an
    // assertion or read fails (the original suppressed the resource warning
    // and leaked the second reader entirely).
    try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content))) {
        Text key = new Text();
        Content value = new Content();
        while (reader.next(key, value)) {
            String contentString = new String(value.getContent());
            if (contentString.indexOf("Nutch fetcher test page") != -1) {
                handledurls.add(key.toString());
            }
        }
    }
    Collections.sort(urls);
    Collections.sort(handledurls);
    // Exactly the seeded pages — no more, no fewer — must have been fetched.
    Assert.assertEquals(urls.size(), handledurls.size());
    Assert.assertTrue(handledurls.containsAll(urls));
    Assert.assertTrue(urls.containsAll(handledurls));
    handledurls.clear();
    // Verify parse data: every record must carry the "nutch.segment.name"
    // and "nutch.content.digest" keys in its content metadata.
    Path parseData = new Path(new Path(generatedSegment[0], ParseData.DIR_NAME), "part-r-00000/data");
    try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData))) {
        Text key = new Text();
        ParseData value = new ParseData();
        while (reader.next(key, value)) {
            Metadata contentMeta = value.getContentMeta();
            if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
                handledurls.add(key.toString());
            }
        }
    }
    Collections.sort(handledurls);
    Assert.assertEquals(urls.size(), handledurls.size());
    Assert.assertTrue(handledurls.containsAll(urls));
    Assert.assertTrue(urls.containsAll(handledurls));
}
Also used : Path(org.apache.hadoop.fs.Path) ArrayList(java.util.ArrayList) Metadata(org.apache.nutch.metadata.Metadata) Text(org.apache.hadoop.io.Text) SequenceFile(org.apache.hadoop.io.SequenceFile) ParseData(org.apache.nutch.parse.ParseData) Injector(org.apache.nutch.crawl.Injector) Content(org.apache.nutch.protocol.Content) Generator(org.apache.nutch.crawl.Generator) Test(org.junit.Test)

Example 2 with Generator

Use of org.apache.nutch.crawl.Generator in the Apache Nutch project.

The method benchmark of the class Benchmark.

/**
 * Runs a timed crawl benchmark against a local HTTP proxy: injects a
 * generated seed list, then performs up to {@code depth} rounds of
 * generate / fetch / (parse if needed) / updatedb / invertlinks,
 * recording the elapsed time of each phase.
 *
 * @param seeds      number of seed URLs to create
 * @param depth      maximum number of generate/fetch cycles
 * @param threads    number of fetcher threads
 * @param maxPerHost generator limit on URLs selected per host
 * @param topN       maximum URLs per generated segment
 * @param delete     if true, delete each segment after it is processed
 * @param plugins    value for {@code plugin.includes}, or "default" to keep
 *                   the configured plugin set
 * @return the collected per-phase timings and run parameters
 * @throws Exception if any crawl phase fails
 */
public BenchmarkResults benchmark(int seeds, int depth, int threads, int maxPerHost, long topN, boolean delete, String plugins) throws Exception {
    Configuration conf = getConf();
    // Route all fetches through a local proxy so the run is self-contained.
    conf.set("http.proxy.host", "localhost");
    conf.setInt("http.proxy.port", 8181);
    conf.set("http.agent.name", "test");
    conf.set("http.robots.agents", "test,*");
    if (!plugins.equals("default")) {
        conf.set("plugin.includes", plugins);
    }
    conf.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
    conf.set(Generator.GENERATOR_COUNT_MODE, Generator.GENERATOR_COUNT_VALUE_HOST);
    // NOTE(review): the original also created an unused Job via
    // NutchJob.getInstance(getConf()); it was never referenced and is removed.
    FileSystem fs = FileSystem.get(conf);
    // All benchmark data lives under a unique temp directory per run.
    Path dir = new Path(getConf().get("hadoop.tmp.dir"), "bench-" + System.currentTimeMillis());
    fs.mkdirs(dir);
    Path rootUrlDir = new Path(dir, "seed");
    fs.mkdirs(rootUrlDir);
    createSeeds(fs, rootUrlDir, seeds);
    // Parameterized SLF4J logging instead of string concatenation.
    LOG.info("crawl started in: {}", dir);
    LOG.info("rootUrlDir = {}", rootUrlDir);
    LOG.info("threads = {}", threads);
    LOG.info("depth = {}", depth);
    BenchmarkResults res = new BenchmarkResults();
    res.delete = delete;
    res.depth = depth;
    res.plugins = plugins;
    res.seeds = seeds;
    res.threads = threads;
    res.topN = topN;
    Path crawlDb = new Path(dir + "/crawldb");
    Path linkDb = new Path(dir + "/linkdb");
    Path segments = new Path(dir + "/segments");
    res.elapsed = System.currentTimeMillis();
    Injector injector = new Injector(getConf());
    Generator generator = new Generator(getConf());
    Fetcher fetcher = new Fetcher(getConf());
    ParseSegment parseSegment = new ParseSegment(getConf());
    CrawlDb crawlDbTool = new CrawlDb(getConf());
    LinkDb linkDbTool = new LinkDb(getConf());
    // Initialize the CrawlDb from the seed list.
    long start = System.currentTimeMillis();
    injector.inject(crawlDb, rootUrlDir);
    long delta = System.currentTimeMillis() - start;
    res.addTiming("inject", "0", delta);
    int i;
    for (i = 0; i < depth; i++) {
        // Generate a new segment for this round.
        start = System.currentTimeMillis();
        Path[] segs = generator.generate(crawlDb, segments, -1, topN, System.currentTimeMillis());
        delta = System.currentTimeMillis() - start;
        res.addTiming("generate", String.valueOf(i), delta);
        if (segs == null) {
            LOG.info("Stopping at depth={} - no more URLs to fetch.", i);
            break;
        }
        // Fetch the segment.
        start = System.currentTimeMillis();
        fetcher.fetch(segs[0], threads);
        delta = System.currentTimeMillis() - start;
        res.addTiming("fetch", String.valueOf(i), delta);
        // Parse separately only when the fetcher did not parse inline.
        if (!Fetcher.isParsing(conf)) {
            start = System.currentTimeMillis();
            parseSegment.parse(segs[0]);
            delta = System.currentTimeMillis() - start;
            res.addTiming("parse", String.valueOf(i), delta);
        }
        // Update the CrawlDb with this round's fetch results.
        start = System.currentTimeMillis();
        crawlDbTool.update(crawlDb, segs, true, true);
        delta = System.currentTimeMillis() - start;
        res.addTiming("update", String.valueOf(i), delta);
        // Invert links into the LinkDb.
        start = System.currentTimeMillis();
        linkDbTool.invert(linkDb, segs, true, true, false);
        delta = System.currentTimeMillis() - start;
        res.addTiming("invert", String.valueOf(i), delta);
        // Optionally reclaim disk space once the segment is fully consumed.
        if (delete) {
            for (Path p : segs) {
                fs.delete(p, true);
            }
        }
    }
    if (i == 0) {
        LOG.warn("No URLs to fetch - check your seed list and URL filters.");
    }
    LOG.info("crawl finished: {}", dir);
    res.elapsed = System.currentTimeMillis() - res.elapsed;
    // Emit CrawlDb statistics for the finished run.
    CrawlDbReader dbreader = new CrawlDbReader();
    dbreader.processStatJob(crawlDb.toString(), conf, false);
    return res;
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) CrawlDbReader(org.apache.nutch.crawl.CrawlDbReader) ParseSegment(org.apache.nutch.parse.ParseSegment) LinkDb(org.apache.nutch.crawl.LinkDb) Injector(org.apache.nutch.crawl.Injector) FileSystem(org.apache.hadoop.fs.FileSystem) Fetcher(org.apache.nutch.fetcher.Fetcher) NutchJob(org.apache.nutch.util.NutchJob) Job(org.apache.hadoop.mapreduce.Job) CrawlDb(org.apache.nutch.crawl.CrawlDb) Generator(org.apache.nutch.crawl.Generator)

Aggregations

Path (org.apache.hadoop.fs.Path)2 Generator (org.apache.nutch.crawl.Generator)2 Injector (org.apache.nutch.crawl.Injector)2 ArrayList (java.util.ArrayList)1 Configuration (org.apache.hadoop.conf.Configuration)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 SequenceFile (org.apache.hadoop.io.SequenceFile)1 Text (org.apache.hadoop.io.Text)1 Job (org.apache.hadoop.mapreduce.Job)1 CrawlDb (org.apache.nutch.crawl.CrawlDb)1 CrawlDbReader (org.apache.nutch.crawl.CrawlDbReader)1 LinkDb (org.apache.nutch.crawl.LinkDb)1 Fetcher (org.apache.nutch.fetcher.Fetcher)1 Metadata (org.apache.nutch.metadata.Metadata)1 ParseData (org.apache.nutch.parse.ParseData)1 ParseSegment (org.apache.nutch.parse.ParseSegment)1 Content (org.apache.nutch.protocol.Content)1 NutchConfiguration (org.apache.nutch.util.NutchConfiguration)1 NutchJob (org.apache.nutch.util.NutchJob)1 Test (org.junit.Test)1