Use of org.apache.nutch.util.domain.DomainSuffixes in the Apache Nutch project.
Class URLUtil, method getDomainName.
/**
 * Extracts the domain name of a URL, i.e. the part of the URL's host name
 * that remains once sub-domain labels are removed. For example <br>
 * <code>
 * getDomainName(new URL("http://lucene.apache.org/"))
 * </code><br>
 * yields <br>
 * <code> apache.org</code>
 * <p>
 * A single trailing dot on the host is dropped before any matching, and a
 * host matching {@code IP_PATTERN} is returned unchanged.
 *
 * @param url the URL whose host is examined
 * @return the longest trailing part of the host whose remainder after its
 *         first label is reported as a domain suffix by
 *         {@code DomainSuffixes}; the last label of the host when no such
 *         suffix is found
 */
public static String getDomainName(URL url) {
  final DomainSuffixes suffixes = DomainSuffixes.getInstance();

  String hostName = url.getHost();
  // URL.getHost() may hand back a host name that still ends with '.'
  if (hostName.endsWith(".")) {
    hostName = hostName.substring(0, hostName.length() - 1);
  }
  if (IP_PATTERN.matcher(hostName).matches()) {
    return hostName;
  }

  // Peel off one leading label per pass until what is left behind the first
  // dot is a known suffix, or until no dot remains.
  String remaining = hostName;
  int dot;
  do {
    dot = remaining.indexOf('.');
    // With dot == -1 this is substring(0), i.e. the remaining text itself.
    final String parent = remaining.substring(dot + 1);
    if (suffixes.isDomainSuffix(parent)) {
      return remaining;
    }
    remaining = parent;
  } while (dot >= 0);

  return remaining;
}
Use of org.apache.nutch.util.domain.DomainSuffixes in the Apache Nutch project.
Class URLUtil, method getDomainSuffix.
/**
 * Returns the {@link DomainSuffix} corresponding to the last public part of
 * the hostname.
 * <p>
 * The host is normalised the same way as in {@code getDomainName}: a single
 * trailing dot is removed before any lookup, so that a host such as
 * {@code apache.org.} is matched like {@code apache.org}.
 * <p>
 * Lookup proceeds from the longest trailing part of the host to the
 * shortest: the leading label is removed and the remainder is passed to
 * {@code DomainSuffixes.get}; the first non-null result is returned. The
 * complete host itself is only looked up when it contains no dot.
 *
 * @param url the URL whose host is examined
 * @return the first matching {@link DomainSuffix}, or {@code null} when the
 *         host matches {@code IP_PATTERN} or no trailing part of the host
 *         yields a suffix
 */
public static DomainSuffix getDomainSuffix(URL url) {
  DomainSuffixes tlds = DomainSuffixes.getInstance();
  String host = url.getHost();
  // Drop a trailing '.', as getDomainName does; otherwise the lookups would
  // be made with keys such as "org." and "" and never match.
  if (host.endsWith(".")) {
    host = host.substring(0, host.length() - 1);
  }
  if (IP_PATTERN.matcher(host).matches()) {
    return null;
  }
  String candidate = host;
  int index;
  do {
    index = candidate.indexOf('.');
    // With index == -1 this is substring(0), i.e. the candidate itself.
    String subCandidate = candidate.substring(index + 1);
    DomainSuffix d = tlds.get(subCandidate);
    if (d != null) {
      return d;
    }
    candidate = subCandidate;
  } while (index >= 0);
  return null;
}
Aggregations