Use of org.apache.nutch.crawl.Injector in project nutch by apache.
The class TestFetcher, method testFetch:
@Test
public void testFetch() throws IOException, ClassNotFoundException, InterruptedException {
  // generate seed list
  ArrayList<String> urls = new ArrayList<String>();
  addUrl(urls, "index.html");
  addUrl(urls, "pagea.html");
  addUrl(urls, "pageb.html");
  addUrl(urls, "dup_of_pagea.html");
  addUrl(urls, "nested_spider_trap.html");
  addUrl(urls, "exception.html");
  CrawlDBTestUtil.generateSeedList(fs, urlPath, urls);
  // inject
  Injector injector = new Injector(conf);
  injector.inject(crawldbPath, urlPath);
  // generate
  Generator g = new Generator(conf);
  Path[] generatedSegment = g.generate(crawldbPath, segmentsPath, 1, Long.MAX_VALUE, Long.MAX_VALUE, false, false);
  long time = System.currentTimeMillis();
  // fetch, with fetcher.parse enabled so parse data is written during the fetch
  Fetcher fetcher = new Fetcher(conf);
  conf.setBoolean("fetcher.parse", true);
  fetcher.fetch(generatedSegment[0], 1);
  time = System.currentTimeMillis() - time;
  // verify politeness: the time taken should exceed (num_of_pages + 1) * delay
  // (fetcher.server.delay is in seconds, hence the factor of 1000)
  int minimumTime = (int) ((urls.size() + 1) * 1000 * conf.getFloat("fetcher.server.delay", 5));
  Assert.assertTrue(time > minimumTime);
  // verify content
  Path content = new Path(new Path(generatedSegment[0], Content.DIR_NAME), "part-r-00000/data");
  @SuppressWarnings("resource")
  SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(content));
  ArrayList<String> handledurls = new ArrayList<String>();
  READ_CONTENT: do {
    Text key = new Text();
    Content value = new Content();
    if (!reader.next(key, value))
      break READ_CONTENT;
    String contentString = new String(value.getContent());
    if (contentString.indexOf("Nutch fetcher test page") != -1) {
      handledurls.add(key.toString());
    }
  } while (true);
  reader.close();
  Collections.sort(urls);
  Collections.sort(handledurls);
  // verify that enough pages were handled
  Assert.assertEquals(urls.size(), handledurls.size());
  // verify that the correct pages were handled
  Assert.assertTrue(handledurls.containsAll(urls));
  Assert.assertTrue(urls.containsAll(handledurls));
  handledurls.clear();
  // verify parse data: every record should carry the "nutch.segment.name"
  // and "nutch.content.digest" keys in its content metadata
  Path parseData = new Path(new Path(generatedSegment[0], ParseData.DIR_NAME), "part-r-00000/data");
  reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(parseData));
  READ_PARSE_DATA: do {
    Text key = new Text();
    ParseData value = new ParseData();
    if (!reader.next(key, value))
      break READ_PARSE_DATA;
    Metadata contentMeta = value.getContentMeta();
    if (contentMeta.get(Nutch.SEGMENT_NAME_KEY) != null && contentMeta.get(Nutch.SIGNATURE_KEY) != null) {
      handledurls.add(key.toString());
    }
  } while (true);
  reader.close();
  Collections.sort(handledurls);
  Assert.assertEquals(urls.size(), handledurls.size());
  Assert.assertTrue(handledurls.containsAll(urls));
  Assert.assertTrue(urls.containsAll(handledurls));
}
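The helper addUrl and the fields fs, urlPath, crawldbPath, segmentsPath, and conf are supplied by the surrounding test class and are not part of this excerpt. A minimal sketch of what addUrl might look like, assuming the fixture pages are served by a local HTTP server whose port is kept in a hypothetical serverPort field:

private void addUrl(ArrayList<String> urls, String page) {
  // serverPort is a hypothetical field standing in for the port of the
  // local test server; each entry becomes one line of the seed list that
  // CrawlDBTestUtil.generateSeedList(fs, urlPath, urls) writes out.
  urls.add("http://127.0.0.1:" + serverPort + "/" + page);
}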
Use of org.apache.nutch.crawl.Injector in project nutch by apache.
The class Benchmark, method benchmark:
public BenchmarkResults benchmark(int seeds, int depth, int threads, int maxPerHost, long topN, boolean delete, String plugins) throws Exception {
  Configuration conf = getConf();
  conf.set("http.proxy.host", "localhost");
  conf.setInt("http.proxy.port", 8181);
  conf.set("http.agent.name", "test");
  conf.set("http.robots.agents", "test,*");
  if (!plugins.equals("default")) {
    conf.set("plugin.includes", plugins);
  }
  conf.setInt(Generator.GENERATOR_MAX_COUNT, maxPerHost);
  conf.set(Generator.GENERATOR_COUNT_MODE, Generator.GENERATOR_COUNT_VALUE_HOST);
  @SuppressWarnings("unused")
  Job job = NutchJob.getInstance(getConf());
  FileSystem fs = FileSystem.get(conf);
  Path dir = new Path(getConf().get("hadoop.tmp.dir"), "bench-" + System.currentTimeMillis());
  fs.mkdirs(dir);
  Path rootUrlDir = new Path(dir, "seed");
  fs.mkdirs(rootUrlDir);
  createSeeds(fs, rootUrlDir, seeds);
  if (LOG.isInfoEnabled()) {
    LOG.info("crawl started in: " + dir);
    LOG.info("rootUrlDir = " + rootUrlDir);
    LOG.info("threads = " + threads);
    LOG.info("depth = " + depth);
  }
  BenchmarkResults res = new BenchmarkResults();
  res.delete = delete;
  res.depth = depth;
  res.plugins = plugins;
  res.seeds = seeds;
  res.threads = threads;
  res.topN = topN;
  Path crawlDb = new Path(dir + "/crawldb");
  Path linkDb = new Path(dir + "/linkdb");
  Path segments = new Path(dir + "/segments");
  res.elapsed = System.currentTimeMillis();
  Injector injector = new Injector(getConf());
  Generator generator = new Generator(getConf());
  Fetcher fetcher = new Fetcher(getConf());
  ParseSegment parseSegment = new ParseSegment(getConf());
  CrawlDb crawlDbTool = new CrawlDb(getConf());
  LinkDb linkDbTool = new LinkDb(getConf());
  // initialize the crawldb
  long start = System.currentTimeMillis();
  injector.inject(crawlDb, rootUrlDir);
  long delta = System.currentTimeMillis() - start;
  res.addTiming("inject", "0", delta);
  int i;
  for (i = 0; i < depth; i++) {
    // generate a new segment
    start = System.currentTimeMillis();
    Path[] segs = generator.generate(crawlDb, segments, -1, topN, System.currentTimeMillis());
    delta = System.currentTimeMillis() - start;
    res.addTiming("generate", i + "", delta);
    if (segs == null) {
      LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
      break;
    }
    // fetch it
    start = System.currentTimeMillis();
    fetcher.fetch(segs[0], threads);
    delta = System.currentTimeMillis() - start;
    res.addTiming("fetch", i + "", delta);
    // parse it, if the fetcher did not already parse inline
    if (!Fetcher.isParsing(conf)) {
      start = System.currentTimeMillis();
      parseSegment.parse(segs[0]);
      delta = System.currentTimeMillis() - start;
      res.addTiming("parse", i + "", delta);
    }
    // update the crawldb
    start = System.currentTimeMillis();
    crawlDbTool.update(crawlDb, segs, true, true);
    delta = System.currentTimeMillis() - start;
    res.addTiming("update", i + "", delta);
    // invert links
    start = System.currentTimeMillis();
    linkDbTool.invert(linkDb, segs, true, true, false);
    delta = System.currentTimeMillis() - start;
    res.addTiming("invert", i + "", delta);
    // optionally delete the segment data to save space
    if (delete) {
      for (Path p : segs) {
        fs.delete(p, true);
      }
    }
  }
  if (i == 0) {
    LOG.warn("No URLs to fetch - check your seed list and URL filters.");
  }
  if (LOG.isInfoEnabled()) {
    LOG.info("crawl finished: " + dir);
  }
  res.elapsed = System.currentTimeMillis() - res.elapsed;
  @SuppressWarnings("resource")
  CrawlDbReader dbreader = new CrawlDbReader();
  dbreader.processStatJob(crawlDb.toString(), conf, false);
  return res;
}
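Since the method reads its settings through getConf(), a configuration can be injected with setConf before calling it. A minimal driver sketch, assuming NutchConfiguration.create() as the usual way to build a Nutch-aware Configuration; the argument values here are illustrative only, not taken from the source:

// Illustrative driver, not part of the original source.
Benchmark bench = new Benchmark();
bench.setConf(NutchConfiguration.create());
// 10 seeds, depth 3, 5 fetcher threads, at most 100 URLs per host,
// topN 1000, delete segments after each round, default plugin set
BenchmarkResults results = bench.benchmark(10, 3, 5, 100, 1000L, true, "default");
System.out.println(results);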