Use of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project.
The class MimeAdaptiveFetchSchedule, method main.
/**
 * Runs a small simulation of this fetch schedule against a single page and
 * logs how the page's fetch time and fetch interval evolve.
 *
 * <p>The simulated clock starts at 0 and advances one day per iteration for
 * 10000 iterations. The page is flagged as changed whenever more than 30 days
 * have elapsed since its last simulated modification. Each time the datum's
 * fetch time has been reached, the schedule is asked to adjust the datum and
 * the miss counters are updated. Totals are logged at the end.
 *
 * @param args command-line arguments; not read by this method
 * @throws Exception declared for anything the schedule or configuration throws
 */
public static void main(String[] args) throws Exception {
  FetchSchedule schedule = new MimeAdaptiveFetchSchedule();
  schedule.setConf(NutchConfiguration.create());

  // One simulation step: a single day, expressed in milliseconds.
  final long step = 1000L * 3600L * 24L;
  // The page is marked as changed once more than 30 days (in milliseconds)
  // have passed since its last simulated modification.
  final long changePeriod = 1000L * 3600L * 24L * 30L;

  // The datum starts with a fetch interval of 3600 * 24 * 30 (30 days when
  // read as seconds, which is how the log lines below interpret it).
  CrawlDatum datum = new CrawlDatum(1, 3600 * 24 * 30, 1.0f);
  // Attach a Content-Type so there is a MIME type to test with.
  org.apache.hadoop.io.MapWritable metadata = new org.apache.hadoop.io.MapWritable();
  metadata.put(HttpHeaders.WRITABLE_CONTENT_TYPE, new Text("text/html; charset=utf-8"));
  datum.setMetaData(metadata);
  datum.setFetchTime(0);
  LOG.info(datum.toString());

  // Simulation state. The clock starts at 0 for simplicity.
  long now = 0;
  long lastChange = 0;
  boolean pageChanged = true;
  int pendingMiss = 0;
  int missTotal = 0;
  int missMax = 0;
  int fetches = 0;
  int changes = 0;

  // Walk the timeline one step at a time.
  for (int tick = 0; tick < 10000; tick++, now += step) {
    if (now > lastChange + changePeriod) {
      pageChanged = true;
      changes++;
      lastChange = now;
    }

    LOG.info(tick + ". " + pageChanged
        + "\twill fetch at " + (datum.getFetchTime() / step)
        + "\tinterval " + (datum.getFetchInterval() / SECONDS_PER_DAY)
        + " days\t missed " + pendingMiss);

    if (datum.getFetchTime() <= now) {
      fetches++;
      schedule.setFetchSchedule(
          new Text("http://www.example.com"),
          datum,
          datum.getFetchTime(),
          datum.getModifiedTime(),
          now,
          lastChange,
          !pageChanged ? FetchSchedule.STATUS_NOTMODIFIED : FetchSchedule.STATUS_MODIFIED);
      LOG.info("\tfetched & adjusted: \twill fetch at " + (datum.getFetchTime() / step)
          + "\tinterval " + (datum.getFetchInterval() / SECONDS_PER_DAY) + " days");

      // A fetch that found the page unchanged is counted as one more miss.
      if (!pageChanged) {
        pendingMiss++;
      }
      missMax = Math.max(missMax, pendingMiss);
      missTotal += pendingMiss;
      // The fetch settles the pending state until the next simulated change.
      pendingMiss = 0;
      pageChanged = false;
    }

    // Every step on which a change is still waiting to be fetched is a miss.
    if (pageChanged) {
      pendingMiss++;
    }
  }

  LOG.info("Total missed: " + missTotal + ", max miss: " + missMax);
  LOG.info("Page changed " + changes + " times, fetched " + fetches + " times.");
}
Aggregations