use of org.apache.nutch.util.domain.DomainSuffix in project nutch by apache.
the class DomainBlacklistURLFilter method filter.
/**
 * Rejects URLs whose domain suffix, domain name, or host is contained in
 * {@code domainSet}; every other URL is passed through unchanged.
 *
 * @param url the URL string to check
 * @return {@code url} when neither its suffix, domain, nor host is in
 *         {@code domainSet}; {@code null} when one of them matches or when
 *         the URL could not be processed
 */
public String filter(String url) {
  try {
    // match for suffix, domain, and host in that order. more general will
    // override more specific
    // Locale.ROOT keeps the lower-casing independent of the JVM default
    // locale (e.g. Turkish maps 'I' to a dotless 'ı', which would break
    // the lookup against domainSet).
    String domain = URLUtil.getDomainName(url).toLowerCase(java.util.Locale.ROOT).trim();
    String host = URLUtil.getHost(url);
    String suffix = null;
    DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
    if (domainSuffix != null) {
      suffix = domainSuffix.getDomain();
    }
    if (domainSet.contains(suffix) || domainSet.contains(domain) || domainSet.contains(host)) {
      // Matches, filter!
      return null;
    }
    // doesn't match, allow
    return url;
  } catch (Exception e) {
    // if an error happens the url is rejected: null is returned after logging.
    // NOTE(review): confirm that rejecting (rather than passing) is the
    // intended outcome for a URL that cannot be processed.
    LOG.error("Could not apply filter on url: " + url + "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
    return null;
  }
}
use of org.apache.nutch.util.domain.DomainSuffix in project nutch by apache.
the class URLUtil method getDomainSuffix.
/**
 * Looks up the {@link DomainSuffix} for the host of the given URL.
 * <p>
 * Hosts matching {@code IP_PATTERN} have no suffix. Otherwise the host is
 * shortened one dot-separated label at a time from the left and each
 * remaining tail is looked up in {@link DomainSuffixes}; the first tail that
 * is registered wins. A host without any dot is looked up as a whole.
 *
 * @param url a {@link URL} to extract the domain suffix from
 * @return the matching {@link org.apache.nutch.util.domain.DomainSuffix}, or
 *         {@code null} if the host looks like an IP address or no tail of
 *         the host is registered
 */
public static DomainSuffix getDomainSuffix(URL url) {
  DomainSuffixes knownSuffixes = DomainSuffixes.getInstance();
  String hostName = url.getHost();
  if (IP_PATTERN.matcher(hostName).matches()) {
    return null;
  }

  String remaining = hostName;
  int dotPos;
  do {
    // Drop everything up to and including the first dot; when there is no
    // dot (dotPos == -1) the substring is the unchanged remainder.
    dotPos = remaining.indexOf('.');
    remaining = remaining.substring(dotPos + 1);
    DomainSuffix match = knownSuffixes.get(remaining);
    if (match != null) {
      return match;
    }
  } while (dotPos >= 0);

  return null;
}
use of org.apache.nutch.util.domain.DomainSuffix in project nutch by apache.
the class TLDScoringFilter method indexerScore.
/**
 * Scales the initial score by the boost of every value found in the
 * document's "tld" field.
 * <p>
 * Each field value is looked up in {@code tldEntries}; values without an
 * entry leave the score untouched. A document without a "tld" field keeps
 * the neutral boost of 1.0.
 *
 * @param initScore the score to scale
 * @return {@code initScore} multiplied by the accumulated boost
 */
@Override
public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException {
  float multiplier = 1.0f;
  NutchField tldField = doc.getField("tld");
  if (tldField == null) {
    return initScore * multiplier;
  }
  for (Object value : tldField.getValues()) {
    DomainSuffix suffix = tldEntries.get(value.toString());
    if (suffix == null) {
      continue;
    }
    multiplier *= suffix.getBoost();
  }
  return initScore * multiplier;
}
use of org.apache.nutch.util.domain.DomainSuffix in project nutch by apache.
the class DomainURLFilter method filter.
/**
 * Passes only URLs whose domain suffix, domain name, or host is contained in
 * {@code domainSet}. When {@code domainSet} is empty every URL is passed.
 *
 * @param url the URL string to check
 * @return {@code url} when {@code domainSet} is empty or contains the URL's
 *         suffix, domain, or host; {@code null} when nothing matches or when
 *         the URL could not be processed
 */
@Override
public String filter(String url) {
  // https://issues.apache.org/jira/browse/NUTCH-2189
  if (domainSet.size() == 0)
    return url;
  try {
    // match for suffix, domain, and host in that order. more general will
    // override more specific
    // Locale.ROOT keeps the lower-casing independent of the JVM default
    // locale (e.g. Turkish maps 'I' to a dotless 'ı', which would break
    // the lookup against domainSet).
    String domain = URLUtil.getDomainName(url).toLowerCase(java.util.Locale.ROOT).trim();
    String host = URLUtil.getHost(url);
    String suffix = null;
    DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
    if (domainSuffix != null) {
      suffix = domainSuffix.getDomain();
    }
    if (domainSet.contains(suffix) || domainSet.contains(domain) || domainSet.contains(host)) {
      return url;
    }
    // doesn't match, don't allow
    return null;
  } catch (Exception e) {
    // if an error happens the url is rejected: null is returned after logging
    LOG.error("Could not apply filter on url: " + url + "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
    return null;
  }
}
use of org.apache.nutch.util.domain.DomainSuffix in project nutch by apache.
the class TLDIndexingFilter method filter.
/**
 * Adds the domain suffix of the document's URL to the document as the
 * field "tld".
 * <p>
 * The field is only added when a suffix can be determined. Any failure
 * (for example a malformed URL) is logged and the document is returned
 * without the field.
 *
 * @param doc the document to enrich
 * @param urlText the URL of the document
 * @return the same {@code doc} instance, possibly with a "tld" field added
 */
@Override
public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText, CrawlDatum datum, Inlinks inlinks) throws IndexingException {
  try {
    URL url = new URL(urlText.toString());
    DomainSuffix d = URLUtil.getDomainSuffix(url);
    // getDomainSuffix returns null for IP-address hosts and for hosts
    // without a known suffix; skip the field instead of provoking a
    // NullPointerException that would only end up as a warning in the log.
    if (d != null) {
      doc.add("tld", d.getDomain());
    }
  } catch (Exception ex) {
    // best effort: indexing continues without the "tld" field
    LOG.warn(ex.toString());
  }
  return doc;
}
Aggregations