Example use of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project: class DepthScoringFilter, method updateDbScore.
@Override
public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked) throws ScoringFilterException {
  // Start from the configured maximum depth and shrink towards the
  // smallest depth recorded on the previous datum or on any inlink.
  int minDepth = DEFAULT_MAX_DEPTH;
  if (old != null) {
    IntWritable previousDepth = (IntWritable) old.getMetaData().get(DEPTH_KEY_W);
    if (previousDepth == null) {
      // Depth was never recorded on the old datum; initialize it now.
      initialScore(url, old);
    } else {
      minDepth = previousDepth.get();
    }
  }
  // An inlink with a smaller depth pulls this record closer to a seed.
  for (CrawlDatum inlink : inlinked) {
    IntWritable inlinkDepth = (IntWritable) inlink.getMetaData().get(DEPTH_KEY_W);
    if (inlinkDepth != null) {
      minDepth = Math.min(minDepth, inlinkDepth.get());
    }
  }
  // Record the minimum depth on the merged datum.
  datum.getMetaData().put(DEPTH_KEY_W, new IntWritable(minDepth));
}
Example use of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project: class TestOrphanScoringFilter, method testOrphanScoringFilter.
@Test
public void testOrphanScoringFilter() throws Exception {
  // Verify the orphan state machine: a record with inlinks stays in its
  // original status; once inlinks disappear, its last-inlink timestamp is
  // frozen and the record degrades to DB_GONE after
  // scoring.orphan.mark.gone.after seconds and to DB_ORPHAN after
  // scoring.orphan.mark.orphan.after seconds.
  Configuration conf = NutchConfiguration.create();
  conf.setInt("scoring.orphan.mark.gone.after", 5);
  conf.setInt("scoring.orphan.mark.orphan.after", 10);
  ScoringFilter filter = new OrphanScoringFilter();
  filter.setConf(conf);
  Text url = new Text("http://nutch.apache.org/");
  CrawlDatum datum = new CrawlDatum();
  datum.setStatus(CrawlDatum.STATUS_DB_NOTMODIFIED);
  List<CrawlDatum> emptyListOfInlinks = new ArrayList<CrawlDatum>();
  List<CrawlDatum> populatedListOfInlinks = new ArrayList<CrawlDatum>();
  populatedListOfInlinks.add(datum);
  // Act as if record has inlinks
  filter.updateDbScore(url, null, datum, populatedListOfInlinks);
  int firstOrphanTime = getTime(datum);
  assertTrue(datum.getMetaData().containsKey(OrphanScoringFilter.ORPHAN_KEY_WRITABLE));
  // Wait a little bit
  try {
    Thread.sleep(1000);
  } catch (InterruptedException e) {
    // Restore the interrupt flag instead of silently swallowing it.
    Thread.currentThread().interrupt();
  }
  // Again, this time orphan time must be increased by about 1000 ms
  filter.updateDbScore(url, null, datum, populatedListOfInlinks);
  int secondOrphanTime = getTime(datum);
  assertTrue(secondOrphanTime > firstOrphanTime);
  // Act as if no more inlinks, time will not increase, status is still the
  // same
  filter.updateDbScore(url, null, datum, emptyListOfInlinks);
  int thirdOrphanTime = getTime(datum);
  // assertEquals takes the expected value first.
  assertEquals(secondOrphanTime, thirdOrphanTime);
  assertEquals("Expected status db_notmodified but got " + CrawlDatum.getStatusName(datum.getStatus()), CrawlDatum.STATUS_DB_NOTMODIFIED, datum.getStatus());
  // Wait a little bit
  try {
    Thread.sleep(1000);
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
  }
  // Act as if no more inlinks, time will not increase, status is still the
  // same
  filter.updateDbScore(url, null, datum, emptyListOfInlinks);
  assertEquals("Expected status db_notmodified but got " + CrawlDatum.getStatusName(datum.getStatus()), CrawlDatum.STATUS_DB_NOTMODIFIED, datum.getStatus());
  // Wait until mark.gone.after
  try {
    Thread.sleep(5000);
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
  }
  // Again, but now mark.gone.after has expired and record should be DB_GONE
  filter.updateDbScore(url, null, datum, emptyListOfInlinks);
  int fourthOrphanTime = getTime(datum);
  assertEquals(thirdOrphanTime, fourthOrphanTime);
  assertEquals("Expected status db_gone but got " + CrawlDatum.getStatusName(datum.getStatus()), CrawlDatum.STATUS_DB_GONE, datum.getStatus());
  // Wait until mark.orphan.after
  try {
    Thread.sleep(5000);
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
  }
  // Again, but now mark.orphan.after has expired and record should be DB_ORPHAN
  filter.updateDbScore(url, null, datum, emptyListOfInlinks);
  assertEquals("Expected status db_orphan but got " + CrawlDatum.getStatusName(datum.getStatus()), CrawlDatum.STATUS_DB_ORPHAN, datum.getStatus());
}
Example use of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project: class URLMetaScoringFilter, method distributeScoreToOutlinks.
/**
 * Takes the metatags listed in the "urlmeta.tags" property and looks them up
 * in the given {@link ParseData}. Every tag that is present is copied into
 * the metadata of each outlink target in {@code targets}.
 *
 * <p>The score adjustment datum is returned unchanged; this filter only
 * propagates metadata.</p>
 *
 * @see ScoringFilter#distributeScoreToOutlinks
 */
public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount) throws ScoringFilterException {
  if (urlMetaTags == null || targets == null || parseData == null)
    return adjust;
  // Loop over tags first: parseData.getMeta(metatag) is identical for every
  // target, so resolving it once per tag avoids redundant lookups per target.
  for (String metatag : urlMetaTags) {
    String metaFromParse = parseData.getMeta(metatag);
    if (metaFromParse == null)
      continue;
    for (Entry<Text, CrawlDatum> target : targets) {
      target.getValue().getMetaData().put(new Text(metatag), new Text(metaFromParse));
    }
  }
  return adjust;
}
Example use of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project: class MimeTypeIndexingFilterTest, method testAllowOnlyImages.
@Test
public void testAllowOnlyImages() throws Exception {
  // With the allow-images rule file, only image mime types pass the filter.
  conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt");
  filter.setConf(conf);
  for (int idx = 0; idx < parses.length; idx++) {
    NutchDocument result = filter.filter(new NutchDocument(), parses[idx], new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    boolean isImage = MIME_TYPES[idx].contains("image");
    if (isImage) {
      // Image documents must survive filtering.
      Assert.assertNotNull("Allow only images", result);
    } else {
      // Everything else must be dropped (filter returns null).
      Assert.assertNull("Block everything else", result);
    }
  }
}
Example use of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project: class MimeTypeIndexingFilterTest, method testBlockHTML.
@Test
public void testBlockHTML() throws Exception {
  // With the block-html rule file, HTML mime types are rejected and all
  // other mime types pass through.
  conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt");
  filter.setConf(conf);
  for (int idx = 0; idx < parses.length; idx++) {
    NutchDocument result = filter.filter(new NutchDocument(), parses[idx], new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
    boolean isHtml = MIME_TYPES[idx].contains("html");
    if (isHtml) {
      // HTML documents must be dropped (filter returns null).
      Assert.assertNull("Block only HTML documents", result);
    } else {
      // Everything else must survive filtering.
      Assert.assertNotNull("Allow everything else", result);
    }
  }
}
Aggregations