Usage of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project:
the method distributeScoreToOutlinks of the class DepthScoringFilter.
@Override
public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
    Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount)
    throws ScoringFilterException {
  // Propagates the crawl depth to all outlink targets and enforces the
  // (possibly per-URL overridden) maximum depth by dropping outlinks once
  // the limit is reached. Returns the unchanged adjust datum.
  String depthString = parseData.getMeta(DEPTH_KEY);
  if (depthString == null) {
    LOG.warn("Missing depth, removing all outlinks from url " + fromUrl);
    targets.clear();
    return adjust;
  }
  int curDepth;
  try {
    curDepth = Integer.parseInt(depthString);
  } catch (NumberFormatException e) {
    // Corrupt depth metadata: treat it like a missing depth instead of
    // letting the unchecked exception fail the whole task.
    LOG.warn("Invalid depth '" + depthString
        + "', removing all outlinks from url " + fromUrl);
    targets.clear();
    return adjust;
  }
  int curMaxDepth = defaultMaxDepth;
  IntWritable customMaxDepth = null;
  // allow overrides from injector
  String maxDepthString = parseData.getMeta(MAX_DEPTH_KEY);
  if (maxDepthString != null) {
    try {
      curMaxDepth = Integer.parseInt(maxDepthString);
      customMaxDepth = new IntWritable(curMaxDepth);
    } catch (NumberFormatException e) {
      // Fall back to the configured default on a malformed override.
      LOG.warn("Invalid maxDepth '" + maxDepthString + "' for url " + fromUrl
          + ", using default " + defaultMaxDepth);
    }
  }
  if (curDepth >= curMaxDepth) {
    // depth exceeded - throw away
    LOG.info("Depth limit (" + curMaxDepth + ") reached, ignoring outlinks for "
        + fromUrl);
    targets.clear();
    return adjust;
  }
  // Enhanced for is sufficient here: the original manual Iterator never
  // called remove(), so no structural modification happens during iteration.
  for (Entry<Text, CrawlDatum> target : targets) {
    // record increased depth
    target.getValue().getMetaData().put(DEPTH_KEY_W, new IntWritable(curDepth + 1));
    // record maxDepth if any
    if (customMaxDepth != null) {
      target.getValue().getMetaData().put(MAX_DEPTH_KEY_W, customMaxDepth);
    }
  }
  return adjust;
}
Usage of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project:
the method updateDbScore of the class OPICScoringFilter.
/**
 * Increase the score by a sum of inlinked scores.
 *
 * @param url the URL whose CrawlDb entry is being updated
 * @param old the previous CrawlDb entry, or null if the URL is new
 * @param datum the entry being written back; its score is updated in place
 * @param inlinked scores of all pages linking to this URL
 */
public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
    List<CrawlDatum> inlinked) throws ScoringFilterException {
  // Sum the contributions of every inlink pointing at this URL.
  float inlinkSum = 0.0f;
  for (CrawlDatum linked : inlinked) {
    inlinkSum += linked.getScore();
  }
  // A first-seen URL has no previous entry; base the score on the new datum.
  CrawlDatum base = (old == null) ? datum : old;
  datum.setScore(base.getScore() + inlinkSum);
}
Usage of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project:
the method distributeScoreToOutlinks of the class OPICScoringFilter.
/**
 * Get a float value from Fetcher.SCORE_KEY, divide it by the number of
 * outlinks and apply.
 *
 * <p>Each outlink receives either the internal or the external share of the
 * page's score, depending on whether it points to the same host as the
 * source URL.</p>
 */
public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData,
    Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount)
    throws ScoringFilterException {
  float score = scoreInjected;
  String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY);
  if (scoreString != null) {
    try {
      score = Float.parseFloat(scoreString);
    } catch (NumberFormatException e) {
      // Keep the injected default on a malformed score value. Narrowed from
      // catch (Exception): parseFloat only throws NumberFormatException here.
      LOG.error("Error: ", e);
    }
  }
  int validCount = targets.size();
  if (countFiltered) {
    if (allCount == 0) {
      // Guard against 0-division producing NaN/Infinity scores.
      return adjust;
    }
    score /= allCount;
  } else {
    if (validCount == 0) {
      // no outlinks to distribute score, so just return adjust
      return adjust;
    }
    score /= validCount;
  }
  // internal and external score factor
  float internalScore = score * internalScoreFactor;
  float externalScore = score * externalScoreFactor;
  // Parse the source host once: it is loop-invariant. If the source URL is
  // malformed, all targets get the external score (same net effect as
  // re-parsing per target, without repeated work and repeated log entries).
  String fromHost = null;
  try {
    fromHost = new URL(fromUrl.toString()).getHost();
  } catch (MalformedURLException e) {
    LOG.error("Error: ", e);
  }
  for (Entry<Text, CrawlDatum> target : targets) {
    float targetScore = externalScore;
    if (fromHost != null) {
      try {
        String toHost = new URL(target.getKey().toString()).getHost();
        // getHost() may be null for authority-less URLs; treat as external.
        if (toHost != null && toHost.equalsIgnoreCase(fromHost)) {
          targetScore = internalScore;
        }
      } catch (MalformedURLException e) {
        LOG.error("Error: ", e);
      }
    }
    target.getValue().setScore(targetScore);
  }
  // XXX linked pages...
  return adjust;
}
Usage of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project:
the method testCrawlDbStatTransitionInject of the class TestCrawlDbStates.
/**
 * Test states after inject: inject must not modify the status of CrawlDatums
 * already in CrawlDb. Newly injected elements have status "db_unfetched".
 * Inject is simulated by calling {@link Injector.InjectReducer#reduce()}.
 */
@Test
public void testCrawlDbStatTransitionInject() {
LOG.info("Test CrawlDatum states in Injector after inject");
Configuration conf = CrawlDBTestUtil.createContext().getConfiguration();
Injector.InjectReducer injector = new Injector.InjectReducer();
CrawlDbUpdateTestDriver<Injector.InjectReducer> injectDriver = new CrawlDbUpdateTestDriver<Injector.InjectReducer>(injector, conf);
ScoringFilters scfilters = new ScoringFilters(conf);
// Repeat the whole state-transition matrix once per fetch-schedule class.
for (String sched : schedules) {
LOG.info("Testing inject with " + sched);
conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
FetchSchedule schedule = FetchScheduleFactory.getFetchSchedule(conf);
// "values" holds the reducer input for one URL: an optional pre-existing
// CrawlDb entry plus the newly injected datum.
List<CrawlDatum> values = new ArrayList<CrawlDatum>();
for (int i = 0; i < fetchDbStatusPairs.length; i++) {
// Column [1] of each pair is the CrawlDb status; -1 means the URL is
// not yet in the CrawlDb. NOTE(review): assumes that convention holds
// wherever fetchDbStatusPairs is defined -- confirm against its declaration.
byte fromDbStatus = fetchDbStatusPairs[i][1];
// Expected outcome: an existing entry keeps its status; a new URL
// becomes db_unfetched.
byte toDbStatus = fromDbStatus;
if (fromDbStatus == -1) {
toDbStatus = STATUS_DB_UNFETCHED;
} else {
// Simulate the pre-existing CrawlDb entry for this URL.
CrawlDatum fromDb = new CrawlDatum();
fromDb.setStatus(fromDbStatus);
schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb);
values.add(fromDb);
}
LOG.info("inject " + (fromDbStatus == -1 ? "<not in CrawlDb>" : CrawlDatum.getStatusName(fromDbStatus)) + " + " + getStatusName(STATUS_INJECTED) + " => " + getStatusName(toDbStatus));
// Build the injected datum the same way Injector would: default fetch
// interval and an initial score of 0.1.
CrawlDatum injected = new CrawlDatum(STATUS_INJECTED, conf.getInt("db.fetch.interval.default", 2592000), 0.1f);
schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, injected);
try {
scfilters.injectedScore(CrawlDbUpdateUtil.dummyURL, injected);
} catch (ScoringFilterException e) {
// Scoring failure is logged but must not abort the state check.
LOG.error(StringUtils.stringifyException(e));
}
values.add(injected);
// Run the reducer: it must collapse the inputs into exactly one datum.
List<CrawlDatum> res = injectDriver.update(values);
if (res.size() != 1) {
fail("Inject didn't result in one single CrawlDatum per URL");
continue;
}
byte status = res.get(0).getStatus();
if (status != toDbStatus) {
fail("Inject for " + (fromDbStatus == -1 ? "" : getStatusName(fromDbStatus) + " and ") + getStatusName(STATUS_INJECTED) + " results in " + getStatusName(status) + " (expected: " + getStatusName(toDbStatus) + ")");
}
// Reuse the list for the next status pair.
values.clear();
}
}
}
Usage of org.apache.nutch.crawl.CrawlDatum in the Apache Nutch project:
the method testNutchDocumentNullIndexingFilter of the class TestIndexingFilters.
/**
 * Test behaviour when the NutchDocument passed to the filter chain is null:
 * the chain must return null rather than throw.
 */
@Test
public void testNutchDocumentNullIndexingFilter() throws IndexingException {
  Configuration conf = NutchConfiguration.create();
  conf.addResource("nutch-default.xml");
  conf.addResource("crawl-tests.xml");
  IndexingFilters filters = new IndexingFilters(conf);
  // Build a minimal but valid parse so that only the document argument is null.
  ParseData parseData = new ParseData(new ParseStatus(), "title",
      new Outlink[0], new Metadata());
  ParseImpl parse = new ParseImpl("text", parseData);
  NutchDocument result = filters.filter(null, parse,
      new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks());
  Assert.assertNull(result);
}
Aggregations