Examples with ParseSegment - org.apache.nutch.parse.ParseSegment

Example 1 with ParseSegment

Use of org.apache.nutch.parse.ParseSegment in the Apache Nutch project.

Taken from the method benchmark of the class Benchmark.

/**
 * Runs a complete crawl cycle (inject, then up to {@code depth} rounds of
 * generate / fetch / parse / update / invert) in a fresh working directory
 * and records how long each step took.
 *
 * <p>The working directory is created under the configured
 * {@code hadoop.tmp.dir} as {@code bench-<currentTimeMillis>} and holds the
 * seed list, crawldb, linkdb and segments.
 *
 * @param seeds      value passed to {@code createSeeds} when populating the seed directory
 * @param depth      maximum number of generate/fetch rounds
 * @param threads    number of threads passed to the fetcher
 * @param maxPerHost value stored under {@code Generator.GENERATOR_MAX_COUNT}
 *                   (count mode is set to per-host)
 * @param topN       top-N limit passed to the generator
 * @param delete     if {@code true}, the generated segments are deleted at the end of each round
 * @param plugins    value for {@code plugin.includes}; the literal {@code "default"}
 *                   leaves the configured value untouched
 * @return the collected timings together with the parameters of this run
 * @throws Exception if any of the crawl steps or file-system operations fails
 */
public BenchmarkResults benchmark(int seeds, int depth, int threads, int maxPerHost, long topN, boolean delete, String plugins) throws Exception {
    Configuration conf = getConf();
    // Route HTTP traffic through a proxy on localhost:8181 and use a fixed agent name.
    conf.set("http.proxy.host", "localhost");
    conf.setInt("http.proxy.port", 8181);
    conf.set("http.agent.name", "test");
    conf.set("http.robots.agents", "test,*");
    if (!plugins.equals("default")) {
        conf.set("plugin.includes", plugins);
    }
    conf.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
    conf.set(Generator.GENERATOR_COUNT_MODE, Generator.GENERATOR_COUNT_VALUE_HOST);
    // NOTE(review): the returned Job is never used; the call is kept only in
    // case obtaining the instance has side effects - confirm and remove if not.
    @SuppressWarnings("unused") Job job = NutchJob.getInstance(getConf());
    FileSystem fs = FileSystem.get(conf);
    Path dir = new Path(getConf().get("hadoop.tmp.dir"), "bench-" + System.currentTimeMillis());
    fs.mkdirs(dir);
    Path rootUrlDir = new Path(dir, "seed");
    fs.mkdirs(rootUrlDir);
    createSeeds(fs, rootUrlDir, seeds);
    if (LOG.isInfoEnabled()) {
        LOG.info("crawl started in: " + dir);
        LOG.info("rootUrlDir = " + rootUrlDir);
        LOG.info("threads = " + threads);
        LOG.info("depth = " + depth);
    }
    // Record the run parameters alongside the timings.
    BenchmarkResults res = new BenchmarkResults();
    res.delete = delete;
    res.depth = depth;
    res.plugins = plugins;
    res.seeds = seeds;
    res.threads = threads;
    res.topN = topN;
    Path crawlDb = new Path(dir + "/crawldb");
    Path linkDb = new Path(dir + "/linkdb");
    Path segments = new Path(dir + "/segments");
    // Temporarily holds the start timestamp; turned into a duration after the loop.
    res.elapsed = System.currentTimeMillis();
    Injector injector = new Injector(getConf());
    Generator generator = new Generator(getConf());
    Fetcher fetcher = new Fetcher(getConf());
    ParseSegment parseSegment = new ParseSegment(getConf());
    CrawlDb crawlDbTool = new CrawlDb(getConf());
    LinkDb linkDbTool = new LinkDb(getConf());
    // initialize crawlDb
    long start = System.currentTimeMillis();
    injector.inject(crawlDb, rootUrlDir);
    long delta = System.currentTimeMillis() - start;
    res.addTiming("inject", "0", delta);
    // Declared outside the loop so the number of completed rounds can be checked afterwards.
    int i;
    for (i = 0; i < depth; i++) {
        // generate new segment
        start = System.currentTimeMillis();
        Path[] segs = generator.generate(crawlDb, segments, -1, topN, System.currentTimeMillis());
        delta = System.currentTimeMillis() - start;
        res.addTiming("generate", i + "", delta);
        if (segs == null) {
            LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
            break;
        }
        start = System.currentTimeMillis();
        // fetch it (only the first generated segment is fetched and parsed)
        fetcher.fetch(segs[0], threads);
        delta = System.currentTimeMillis() - start;
        res.addTiming("fetch", i + "", delta);
        if (!Fetcher.isParsing(conf)) {
            start = System.currentTimeMillis();
            // parse it, if needed
            parseSegment.parse(segs[0]);
            delta = System.currentTimeMillis() - start;
            res.addTiming("parse", i + "", delta);
        }
        start = System.currentTimeMillis();
        // update crawldb
        crawlDbTool.update(crawlDb, segs, true, true);
        delta = System.currentTimeMillis() - start;
        res.addTiming("update", i + "", delta);
        start = System.currentTimeMillis();
        // invert links
        linkDbTool.invert(linkDb, segs, true, true, false);
        delta = System.currentTimeMillis() - start;
        res.addTiming("invert", i + "", delta);
        // delete data
        if (delete) {
            for (Path p : segs) {
                fs.delete(p, true);
            }
        }
    }
    // i == 0 means no round completed: either depth was 0 or the first generate returned nothing.
    if (i == 0) {
        LOG.warn("No URLs to fetch - check your seed list and URL filters.");
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("crawl finished: " + dir);
    }
    res.elapsed = System.currentTimeMillis() - res.elapsed;
    // Close the reader once the statistics job has run instead of leaking it;
    // the statistics run is deliberately excluded from the elapsed time above.
    try (CrawlDbReader dbreader = new CrawlDbReader()) {
        dbreader.processStatJob(crawlDb.toString(), conf, false);
    }
    return res;
}

Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) CrawlDbReader(org.apache.nutch.crawl.CrawlDbReader) ParseSegment(org.apache.nutch.parse.ParseSegment) LinkDb(org.apache.nutch.crawl.LinkDb) Injector(org.apache.nutch.crawl.Injector) FileSystem(org.apache.hadoop.fs.FileSystem) Fetcher(org.apache.nutch.fetcher.Fetcher) NutchJob(org.apache.nutch.util.NutchJob) Job(org.apache.hadoop.mapreduce.Job) CrawlDb(org.apache.nutch.crawl.CrawlDb) Generator(org.apache.nutch.crawl.Generator)

Aggregations

Configuration (org.apache.hadoop.conf.Configuration): 1, FileSystem (org.apache.hadoop.fs.FileSystem): 1, Path (org.apache.hadoop.fs.Path): 1, Job (org.apache.hadoop.mapreduce.Job): 1, CrawlDb (org.apache.nutch.crawl.CrawlDb): 1, CrawlDbReader (org.apache.nutch.crawl.CrawlDbReader): 1, Generator (org.apache.nutch.crawl.Generator): 1, Injector (org.apache.nutch.crawl.Injector): 1, LinkDb (org.apache.nutch.crawl.LinkDb): 1, Fetcher (org.apache.nutch.fetcher.Fetcher): 1, ParseSegment (org.apache.nutch.parse.ParseSegment): 1, NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 1, NutchJob (org.apache.nutch.util.NutchJob): 1