Use of org.apache.nutch.scoring.ScoringFilter in the Apache Nutch project.
Class TestOrphanScoringFilter, method testOrphanScoringFilter.
/**
 * Drives one {@link CrawlDatum} through {@link OrphanScoringFilter} and checks
 * how its status and recorded orphan time evolve as inlinks disappear.
 *
 * <p>Scenario, in order:
 * <ol>
 * <li>Two updates with a non-empty inlink list: the orphan marker is written
 * to the datum's metadata and the recorded time advances between calls.</li>
 * <li>Updates with an empty inlink list: the recorded time stays frozen and
 * the status stays {@code db_notmodified} until the first threshold.</li>
 * <li>After {@code scoring.orphan.mark.gone.after} has elapsed the status
 * becomes {@code db_gone}; after {@code scoring.orphan.mark.orphan.after} it
 * becomes {@code db_orphan}.</li>
 * </ol>
 *
 * <p>NOTE(review): the thresholds (5 and 10) are presumably in seconds, judging
 * by the sleep durations used below - confirm against the filter.
 *
 * @throws Exception if the filter fails or the test thread is interrupted
 *           while waiting; an interrupt is propagated on purpose, because a
 *           shortened wait would invalidate the timing-based assertions
 */
@Test
public void testOrphanScoringFilter() throws Exception {
  Configuration conf = NutchConfiguration.create();
  conf.setInt("scoring.orphan.mark.gone.after", 5);
  conf.setInt("scoring.orphan.mark.orphan.after", 10);

  ScoringFilter filter = new OrphanScoringFilter();
  filter.setConf(conf);

  Text url = new Text("http://nutch.apache.org/");
  CrawlDatum datum = new CrawlDatum();
  datum.setStatus(CrawlDatum.STATUS_DB_NOTMODIFIED);

  List<CrawlDatum> emptyListOfInlinks = new ArrayList<CrawlDatum>();
  List<CrawlDatum> populatedListOfInlinks = new ArrayList<CrawlDatum>();
  // The datum doubles as its own inlink; presumably only the non-emptiness of
  // the list matters to the filter.
  populatedListOfInlinks.add(datum);

  // Act as if the record has inlinks: the orphan marker must be recorded
  filter.updateDbScore(url, null, datum, populatedListOfInlinks);
  int firstOrphanTime = getTime(datum);
  assertTrue(datum.getMetaData().containsKey(OrphanScoringFilter.ORPHAN_KEY_WRITABLE));

  // Wait a little bit so the next recorded time can differ
  Thread.sleep(1000);

  // Again with inlinks: the recorded time must now be later than the first
  filter.updateDbScore(url, null, datum, populatedListOfInlinks);
  int secondOrphanTime = getTime(datum);
  assertTrue(secondOrphanTime > firstOrphanTime);

  // Act as if there are no more inlinks: the recorded time must not change
  // and the status is still the same
  filter.updateDbScore(url, null, datum, emptyListOfInlinks);
  int thirdOrphanTime = getTime(datum);
  assertEquals(secondOrphanTime, thirdOrphanTime);
  assertEquals("Expected status db_notmodified but got " + CrawlDatum.getStatusName(datum.getStatus()), CrawlDatum.STATUS_DB_NOTMODIFIED, datum.getStatus());

  // Wait a little bit, still well within mark.gone.after
  Thread.sleep(1000);

  // Still no inlinks and the threshold has not been reached: status unchanged
  filter.updateDbScore(url, null, datum, emptyListOfInlinks);
  assertEquals("Expected status db_notmodified but got " + CrawlDatum.getStatusName(datum.getStatus()), CrawlDatum.STATUS_DB_NOTMODIFIED, datum.getStatus());

  // Wait until mark.gone.after has passed
  Thread.sleep(5000);

  // Again, but now mark.gone.after has expired and the record should be
  // DB_GONE; the recorded time stays frozen
  filter.updateDbScore(url, null, datum, emptyListOfInlinks);
  int fourthOrphanTime = getTime(datum);
  assertEquals(thirdOrphanTime, fourthOrphanTime);
  assertEquals("Expected status db_gone but got " + CrawlDatum.getStatusName(datum.getStatus()), CrawlDatum.STATUS_DB_GONE, datum.getStatus());

  // Wait until mark.orphan.after has passed
  Thread.sleep(5000);

  // Again, but now mark.orphan.after has expired and the record should be
  // DB_ORPHAN
  filter.updateDbScore(url, null, datum, emptyListOfInlinks);
  assertEquals("Expected status db_orphan but got " + CrawlDatum.getStatusName(datum.getStatus()), CrawlDatum.STATUS_DB_ORPHAN, datum.getStatus());
}
Aggregations