Use of org.apache.nutch.scoring.ScoringFilterException in the Apache Nutch project.
Class FetcherThread, method handleRedirect.
/**
 * Decides what to do with a redirect from {@code urlString} to {@code newUrl}.
 *
 * <p>The target is normalized (fetcher scope) and passed through the URL
 * filters. A target that is filtered out, or that equals the source URL, is
 * skipped. When {@code ignoreAlsoRedirects} is set together with
 * {@code ignoreExternalLinks} or {@code ignoreInternalLinks}, the same
 * host/domain rules are applied to the redirect. A surviving target is
 * either returned for an immediate fetch ({@code maxRedirect > 0}) or
 * emitted via {@code output(...)} as a {@code STATUS_LINKED} datum.
 *
 * <p>NOTE(review): this listing has lost its closing braces and apparently
 * some statements, so the nesting of the blocks below cannot be determined
 * from here - confirm against the upstream file before relying on it.
 *
 * @param url       key of the page being fetched; reassigned to the redirect target
 * @param datum     datum of the redirecting page; supplies fetch interval and score
 *                  for the datum created when the redirect is recorded for later
 * @param urlString the source URL as a string
 * @param newUrl    the redirect target before normalization and filtering
 * @param temp      forwarded to {@code URLUtil.chooseRepr}
 * @param redirType label used only in the log messages
 * @return the redirect target when it is to be fetched now, otherwise {@code null}
 * @throws MalformedURLException declared by the signature
 * @throws URLFilterException    declared by the signature
 * @throws InterruptedException  declared by the signature
 */
private Text handleRedirect(Text url, CrawlDatum datum, String urlString, String newUrl, boolean temp, String redirType) throws MalformedURLException, URLFilterException, InterruptedException {
// Normalize first, then filter; the filter result is null-checked right below.
newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
newUrl = urlFilters.filter(newUrl);
// Skip redirects that were filtered out or that point back to the same URL.
if (newUrl == null || newUrl.equals(urlString)) {
LOG.debug(" - {} redirect skipped: {}", redirType, (newUrl != null ? "to same url" : "filtered"));
return null;
// Optionally apply the ignore-external / ignore-internal link rules to redirects too.
if (ignoreAlsoRedirects && (ignoreExternalLinks || ignoreInternalLinks)) {
try {
URL origUrl = new URL(urlString);
URL redirUrl = new URL(newUrl);
if (ignoreExternalLinks) {
String origHostOrDomain, newHostOrDomain;
// Compare by domain name in "bydomain" mode, otherwise by host name.
if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
origHostOrDomain = URLUtil.getDomainName(origUrl).toLowerCase();
newHostOrDomain = URLUtil.getDomainName(redirUrl).toLowerCase();
} else {
// byHost
origHostOrDomain = origUrl.getHost().toLowerCase();
newHostOrDomain = redirUrl.getHost().toLowerCase();
if (!origHostOrDomain.equals(newHostOrDomain)) {
LOG.debug(" - ignoring redirect {} from {} to {} because external links are ignored", redirType, urlString, newUrl);
return null;
// Internal links are always compared by host name.
if (ignoreInternalLinks) {
String origHost = origUrl.getHost().toLowerCase();
String newHost = redirUrl.getHost().toLowerCase();
if (origHost.equals(newHost)) {
LOG.debug(" - ignoring redirect {} from {} to {} because internal links are ignored", redirType, urlString, newUrl);
return null;
// A URL that cannot be parsed causes the redirect to be dropped without logging.
} catch (MalformedURLException e) {
return null;
// Update the representative URL and switch the key to the redirect target.
reprUrl = URLUtil.chooseRepr(reprUrl, newUrl, temp);
url = new Text(newUrl);
if (maxRedirect > 0) {
// Follow the redirect immediately: flag the thread as redirecting and hand the URL back.
redirecting = true;
LOG.debug(" - {} redirect to {} (fetching now)", redirType, url);
return url;
} else {
// Record the target as a linked datum instead of fetching it now.
CrawlDatum newDatum = new CrawlDatum(CrawlDatum.STATUS_LINKED, datum.getFetchInterval(), datum.getScore());
// transfer existing metadata
// NOTE(review): no statement copying metadata is visible after the comment above - confirm against upstream.
try {
scfilters.initialScore(url, newDatum);
} catch (ScoringFilterException e) {
// NOTE(review): nothing is visible inside this catch in the listing; a scoring failure looks ignored here - confirm.
if (reprUrl != null) {
newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
output(url, newDatum, null, null, CrawlDatum.STATUS_LINKED);
LOG.debug(" - {} redirect to {} (fetching later)", redirType, url);
return null;
Class FetcherThread, method createRedirDatum.
/**
 * Builds the {@link CrawlDatum} for a redirect target.
 *
 * <p>The new datum takes the given status and the fetch interval and score
 * of {@code fit.datum}, is passed to the scoring filters for an initial
 * score, and gets the current {@code reprUrl} stored in its metadata when
 * one is set.
 *
 * <p>NOTE(review): closing braces (and possibly statements) are missing
 * from this listing, so the exact nesting is not visible here.
 *
 * @param redirUrl URL of the redirect target, passed to the scoring filters
 * @param fit      fetch item whose datum supplies fetch interval and score
 * @param status   status assigned to the new datum
 * @return the newly created datum
 */
private CrawlDatum createRedirDatum(Text redirUrl, FetchItem fit, byte status) {
CrawlDatum newDatum = new CrawlDatum(status, fit.datum.getFetchInterval(), fit.datum.getScore());
// transfer existing metadata
// NOTE(review): no statement copying metadata is visible after the comment above - confirm against upstream.
try {
scfilters.initialScore(redirUrl, newDatum);
} catch (ScoringFilterException e) {
// A scoring failure is logged; no rethrow is visible in this listing.
LOG.error("Scoring filtering failed for {}: ", redirUrl, e);
// Carry the representative URL along in the datum metadata when one is known.
if (reprUrl != null) {
newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
return newDatum;
Class ParseOutputFormat, method getRecordWriter.
/**
 * Creates the {@link RecordWriter} that stores parse results.
 *
 * <p>Three outputs are opened below the job output path, each in a file
 * named by {@code getUniqueFile(context, "part")}: a parse-text map file
 * under {@code ParseText.DIR_NAME} (only when {@code storeText} is true),
 * a parse-data map file under {@code ParseData.DIR_NAME}, and a sequence
 * file of {@link CrawlDatum} entries under {@code CrawlDatum.PARSE_DIR_NAME}.
 * The returned writer appends to these for every (URL, parse) pair.
 *
 * <p>NOTE(review): this listing has lost its closing braces, several
 * configuration keys appear as empty strings, and some statements seem to
 * be missing, so nesting and parts of the behaviour cannot be confirmed
 * from here - check against the upstream file.
 *
 * @param context task context supplying the configuration and output path
 * @return a writer for (URL, parse) pairs
 * @throws IOException declared by the signature
 */
public RecordWriter<Text, Parse> getRecordWriter(TaskAttemptContext context) throws IOException {
Configuration conf = context.getConfiguration();
String name = getUniqueFile(context, "part");
Path dir = FileOutputFormat.getOutputPath(context);
FileSystem fs = dir.getFileSystem(context.getConfiguration());
// URL filters and normalizers are created depending on the two parse.* switches.
if (conf.getBoolean("parse.filter.urls", true)) {
filters = new URLFilters(conf);
exemptionFilters = new URLExemptionFilters(conf);
if (conf.getBoolean("parse.normalize.urls", true)) {
normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_OUTLINK);
this.scfilters = new ScoringFilters(conf);
final int interval = conf.getInt("db.fetch.interval.default", 2592000);
final boolean ignoreInternalLinks = conf.getBoolean("db.ignore.internal.links", false);
final boolean ignoreExternalLinks = conf.getBoolean("db.ignore.external.links", false);
final String ignoreExternalLinksMode = conf.get("db.ignore.external.links.mode", "byHost");
// NUTCH-2435 - parameter "" allowing to choose whether to
// store 'parse_text' directory or not:
// NOTE(review): the configuration keys on the next two lines are empty strings in this listing; they look elided - confirm the real property names.
final boolean storeText = conf.getBoolean("", true);
int maxOutlinksPerPage = conf.getInt("", 100);
// A negative limit means "no limit" for both the outlink count and the outlink length.
final int maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE : maxOutlinksPerPage;
int maxOutlinkL = conf.getInt("db.max.outlink.length", 4096);
final int maxOutlinkLength = (maxOutlinkL < 0) ? Integer.MAX_VALUE : maxOutlinkL;
final boolean isParsing = conf.getBoolean("fetcher.parse", true);
final CompressionType compType = SequenceFileOutputFormat.getOutputCompressionType(context);
Path out = FileOutputFormat.getOutputPath(context);
Path text = new Path(new Path(out, ParseText.DIR_NAME), name);
Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);
// Names of parse-metadata entries to copy into the CrawlDb entry (comma separated).
// NOTE(review): the configuration key is an empty string in this listing - confirm the real property name.
final String[] parseMDtoCrawlDB = conf.get("", "").split(" *, *");
// textOut Options
final MapFile.Writer textOut;
if (storeText) {
// Parse text is written with RECORD compression regardless of the job's compression type.
Option tKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class); tValClassOpt = SequenceFile.Writer.valueClass(ParseText.class); tProgressOpt = SequenceFile.Writer.progressable((Progressable) context); tCompOpt = SequenceFile.Writer.compression(CompressionType.RECORD);
textOut = new MapFile.Writer(conf, text, tKeyClassOpt, tValClassOpt, tCompOpt, tProgressOpt);
} else {
textOut = null;
// dataOut Options
Option dKeyClassOpt = (Option) MapFile.Writer.keyClass(Text.class); dValClassOpt = SequenceFile.Writer.valueClass(ParseData.class); dProgressOpt = SequenceFile.Writer.progressable((Progressable) context); dCompOpt = SequenceFile.Writer.compression(compType);
final MapFile.Writer dataOut = new MapFile.Writer(conf, data, dKeyClassOpt, dValClassOpt, dCompOpt, dProgressOpt);
final SequenceFile.Writer crawlOut = SequenceFile.createWriter(conf, SequenceFile.Writer.file(crawl), SequenceFile.Writer.keyClass(Text.class), SequenceFile.Writer.valueClass(CrawlDatum.class), SequenceFile.Writer.bufferSize(fs.getConf().getInt("io.file.buffer.size", 4096)), SequenceFile.Writer.replication(fs.getDefaultReplication(crawl)), SequenceFile.Writer.blockSize(1073741824), SequenceFile.Writer.compression(compType, new DefaultCodec()), SequenceFile.Writer.progressable((Progressable) context), SequenceFile.Writer.metadata(new Metadata()));
return new RecordWriter<Text, Parse>() {
// Writes one parsed page: its text, CrawlDb-bound datums, outlink targets and parse data.
public void write(Text key, Parse parse) throws IOException {
String fromUrl = key.toString();
// host or domain name of the source URL
String origin = null;
// Parse text is stored only when a text writer was opened.
if (textOut != null) {
textOut.append(key, new ParseText(parse.getText()));
ParseData parseData = parse.getData();
// recover the signature prepared by Fetcher or ParseSegment
String sig = parseData.getContentMeta().get(Nutch.SIGNATURE_KEY);
if (sig != null) {
byte[] signature = StringUtil.fromHexString(sig);
if (signature != null) {
// append a CrawlDatum with a signature
// NOTE(review): no statement attaching the decoded signature to the datum is visible in this listing - confirm against upstream.
CrawlDatum d = new CrawlDatum(CrawlDatum.STATUS_SIGNATURE, 0);
crawlOut.append(key, d);
// see if the parse metadata contain things that we'd like
// to pass to the metadata of the crawlDB entry
CrawlDatum parseMDCrawlDatum = null;
for (String mdname : parseMDtoCrawlDB) {
String mdvalue = parse.getData().getParseMeta().get(mdname);
if (mdvalue != null) {
// The carrier datum is created lazily, on the first metadata value found.
if (parseMDCrawlDatum == null)
parseMDCrawlDatum = new CrawlDatum(CrawlDatum.STATUS_PARSE_META, 0);
parseMDCrawlDatum.getMetaData().put(new Text(mdname), new Text(mdvalue));
if (parseMDCrawlDatum != null)
crawlOut.append(key, parseMDCrawlDatum);
// need to determine origin (once for all outlinks)
if (ignoreExternalLinks || ignoreInternalLinks) {
URL originURL = new URL(fromUrl.toString());
// based on domain?
if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
origin = URLUtil.getDomainName(originURL).toLowerCase();
} else // use host
origin = originURL.getHost().toLowerCase();
// A successful parse with minor code SUCCESS_REDIRECT carries a redirect target in its message.
ParseStatus pstatus = parseData.getStatus();
if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
String newUrl = pstatus.getMessage();
// The second status argument is parsed as the refresh time.
int refreshTime = Integer.parseInt(pstatus.getArgs()[1]);
newUrl = filterNormalize(fromUrl, newUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, exemptionFilters, normalizers, URLNormalizers.SCOPE_FETCHER);
if (newUrl != null) {
// The third argument to chooseRepr is true when the refresh time is below Fetcher.PERM_REFRESH_TIME.
String reprUrl = URLUtil.chooseRepr(fromUrl, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME);
CrawlDatum newDatum = new CrawlDatum();
// NOTE(review): no status is set on newDatum in the visible lines - confirm against upstream.
if (reprUrl != null && !reprUrl.equals(newUrl)) {
newDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, new Text(reprUrl));
crawlOut.append(new Text(newUrl), newDatum);
// collect outlinks for subsequent db update
Outlink[] links = parseData.getOutlinks();
int outlinksToStore = Math.min(maxOutlinks, links.length);
int validCount = 0;
CrawlDatum adjust = null;
List<Entry<Text, CrawlDatum>> targets = new ArrayList<>(outlinksToStore);
List<Outlink> outlinkList = new ArrayList<>(outlinksToStore);
// NOTE(review): validCount is never incremented and outlinkList is never filled in the visible lines; those statements appear to be missing from the listing.
for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
String toUrl = links[i].getToUrl();
// only normalize and filter if fetcher.parse = false
if (!isParsing) {
// NOTE(review): the bodies of the length check and the null check below are not visible in this listing.
if (toUrl.length() > maxOutlinkLength) {
toUrl = ParseOutputFormat.filterNormalize(fromUrl, toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, filters, exemptionFilters, normalizers);
if (toUrl == null) {
CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval);
Text targetUrl = new Text(toUrl);
// see if the outlink has any metadata attached
// and if so pass that to the crawldatum so that
// the initial score or distribution can use that
MapWritable outlinkMD = links[i].getMetadata();
if (outlinkMD != null) {
try {
scfilters.initialScore(targetUrl, target);
} catch (ScoringFilterException e) {
// On a scoring failure the target keeps the score it was created with; only a warning is logged.
LOG.warn("Cannot filter init score for url " + key + ", using default: " + e.getMessage());
targets.add(new SimpleEntry(targetUrl, target));
// overwrite URL in Outlink object with normalized URL (NUTCH-1174)
try {
// compute score contributions and adjustment to the original score
adjust = scfilters.distributeScoreToOutlinks(key, parseData, targets, null, links.length);
} catch (ScoringFilterException e) {
LOG.warn("Cannot distribute score from " + key + ": " + e.getMessage());
// Emit every collected outlink target, then the score adjustment for the source page if any.
for (Entry<Text, CrawlDatum> target : targets) {
crawlOut.append(target.getKey(), target.getValue());
if (adjust != null)
crawlOut.append(key, adjust);
// Parse data is rewritten with the links gathered in outlinkList before being stored.
Outlink[] filteredLinks = outlinkList.toArray(new Outlink[outlinkList.size()]);
parseData = new ParseData(parseData.getStatus(), parseData.getTitle(), filteredLinks, parseData.getContentMeta(), parseData.getParseMeta());
dataOut.append(key, parseData);
// For a non-canonical parse an additional datum is appended under the same key.
if (!parse.isCanonical()) {
CrawlDatum datum = new CrawlDatum();
String timeString = parse.getData().getContentMeta().get(Nutch.FETCH_TIME_KEY);
// NOTE(review): the try body is empty in this listing, so timeString is never used here - the statement reading it appears to be missing.
try {
} catch (Exception e) {
LOG.warn("Can't read fetch time for: " + key);
crawlOut.append(key, datum);
// NOTE(review): the body of close() is cut off in this listing after the first null check.
public void close(TaskAttemptContext context) throws IOException {
if (textOut != null)
Class OPICScoringFilter, method distributeScoreToOutlinks.
* Get a float value from Nutch.SCORE_KEY, divide it by the number of
* outlinks and apply.
/**
 * Splits the score of the source page among its outlink targets.
 *
 * <p>The score starts as {@code scoreInjected} and is replaced by the value
 * stored under {@code Nutch.SCORE_KEY} in the content metadata when that
 * value parses as a float. It is divided by {@code allCount} when
 * {@code countFiltered} is set, otherwise by the number of targets; with no
 * targets in the latter case {@code adjust} is returned immediately. The
 * per-link share is then scaled by the internal and external score factors,
 * and each target's host is compared with the host of {@code fromUrl}.
 *
 * <p>NOTE(review): the bodies of the same-host / other-host branches, and
 * all closing braces, are missing from this listing, so how the shares are
 * applied to the targets is not visible here.
 *
 * @param fromUrl   URL of the source page
 * @param parseData parse data whose content metadata may carry the score
 * @param targets   outlink targets to receive a share of the score
 * @param adjust    adjustment datum, returned unchanged by the visible code
 * @param allCount  divisor used when {@code countFiltered} is set
 * @return the {@code adjust} argument
 * @throws ScoringFilterException declared by the signature
 */
public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount) throws ScoringFilterException {
// Default to the injected score; override it with the page's stored score when available.
float score = scoreInjected;
String scoreString = parseData.getContentMeta().get(Nutch.SCORE_KEY);
if (scoreString != null) {
try {
score = Float.parseFloat(scoreString);
} catch (Exception e) {
// An unparsable score is logged and the default is kept.
LOG.error("Error: ", e);
int validCount = targets.size();
if (countFiltered) {
// NOTE(review): no guard against allCount == 0 is visible before this division.
score /= allCount;
} else {
if (validCount == 0) {
// no outlinks to distribute score, so just return adjust
return adjust;
score /= validCount;
// internal and external score factor
float internalScore = score * internalScoreFactor;
float externalScore = score * externalScoreFactor;
for (Entry<Text, CrawlDatum> target : targets) {
try {
// Hosts are compared case-insensitively to tell internal from external links.
String toHost = new URL(target.getKey().toString()).getHost();
String fromHost = new URL(fromUrl.toString()).getHost();
// NOTE(review): both branch bodies are empty in this listing; presumably internalScore / externalScore are applied to the target here - confirm against upstream.
if (toHost.equalsIgnoreCase(fromHost)) {
} else {
} catch (MalformedURLException e) {
LOG.error("Error: ", e);
// XXX linked pages...
return adjust;
Class TestCrawlDbStates, method testCrawlDbStatTransitionInject.
* Test states after inject: inject must not modify the status of CrawlDatums
* already in CrawlDb. Newly injected elements have status "db_unfetched".
* Inject is simulated by calling {@link Injector.InjectReducer#reduce()}.
public void testCrawlDbStatTransitionInject() {"Test CrawlDatum states in Injector after inject");
Configuration conf = CrawlDBTestUtil.createContext().getConfiguration();
Injector.InjectReducer injector = new Injector.InjectReducer();
CrawlDbUpdateTestDriver<Injector.InjectReducer> injectDriver = new CrawlDbUpdateTestDriver<Injector.InjectReducer>(injector, conf);
ScoringFilters scfilters = new ScoringFilters(conf);
for (String sched : schedules) {"Testing inject with " + sched);
conf.set("db.fetch.schedule.class", "org.apache.nutch.crawl." + sched);
FetchSchedule schedule = FetchScheduleFactory.getFetchSchedule(conf);
List<CrawlDatum> values = new ArrayList<CrawlDatum>();
for (int i = 0; i < fetchDbStatusPairs.length; i++) {
byte fromDbStatus = fetchDbStatusPairs[i][1];
byte toDbStatus = fromDbStatus;
if (fromDbStatus == -1) {
} else {
CrawlDatum fromDb = new CrawlDatum();
schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, fromDb);
}"inject " + (fromDbStatus == -1 ? "<not in CrawlDb>" : CrawlDatum.getStatusName(fromDbStatus)) + " + " + getStatusName(STATUS_INJECTED) + " => " + getStatusName(toDbStatus));
CrawlDatum injected = new CrawlDatum(STATUS_INJECTED, conf.getInt("db.fetch.interval.default", 2592000), 0.1f);
schedule.initializeSchedule(CrawlDbUpdateUtil.dummyURL, injected);
try {
scfilters.injectedScore(CrawlDbUpdateUtil.dummyURL, injected);
} catch (ScoringFilterException e) {
List<CrawlDatum> res = injectDriver.update(values);
if (res.size() != 1) {
fail("Inject didn't result in one single CrawlDatum per URL");
byte status = res.get(0).getStatus();
if (status != toDbStatus) {
fail("Inject for " + (fromDbStatus == -1 ? "" : getStatusName(fromDbStatus) + " and ") + getStatusName(STATUS_INJECTED) + " results in " + getStatusName(status) + " (expected: " + getStatusName(toDbStatus) + ")");