Use of crawlercommons.robots.BaseRobotRules in project Nutch by Apache.
The class HttpRobotRulesParser, method getRobotRulesSet.
/**
* Get the rules from robots.txt which apply to the given {@code url}.
* Robot rules are cached for a unique combination of host, protocol, and
* port. If no rules are found in the cache, an HTTP request is sent to fetch
* {@code protocol://host:port/robots.txt}. The robots.txt is then parsed and
* the rules are cached to avoid re-fetching and re-parsing it.
*
* @param http
* The {@link Protocol} object
* @param url
* URL
* @param robotsTxtContent
* container to store responses when fetching the robots.txt file for
* debugging or archival purposes. Instead of a robots.txt file, it
* may include redirects or an error page (404, etc.). Response
* {@link Content} is appended to the passed list. If null is passed
* nothing is stored.
*
* @return robotRules A {@link BaseRobotRules} object for the rules
*/
@Override
public BaseRobotRules getRobotRulesSet(Protocol http, URL url, List<Content> robotsTxtContent) {
if (LOG.isTraceEnabled() && isWhiteListed(url)) {
LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
}
String cacheKey = getCacheKey(url);
BaseRobotRules robotRules = CACHE.get(cacheKey);
if (robotRules != null) {
// cached rule
return robotRules;
} else if (LOG.isTraceEnabled()) {
LOG.trace("cache miss " + url);
}
boolean cacheRule = true;
URL redir = null;
if (isWhiteListed(url)) {
// check in advance whether a host is whitelisted
// (we do not need to fetch robots.txt)
robotRules = EMPTY_RULES;
LOG.info("Whitelisted host found for: {}", url);
LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", url.getHost());
} else {
try {
URL robotsUrl = new URL(url, "/robots.txt");
Response response = ((HttpBase) http).getResponse(robotsUrl, new CrawlDatum(), true);
if (robotsTxtContent != null) {
addRobotsContent(robotsTxtContent, robotsUrl, response);
}
// try one level of redirection ?
if (response.getCode() == 301 || response.getCode() == 302) {
String redirection = response.getHeader("Location");
if (redirection == null) {
// some versions of MS IIS are known to mangle this header
redirection = response.getHeader("location");
}
if (redirection != null) {
if (!redirection.startsWith("http")) {
// RFC says it should be absolute, but apparently it isn't
redir = new URL(url, redirection);
} else {
redir = new URL(redirection);
}
response = ((HttpBase) http).getResponse(redir, new CrawlDatum(), true);
if (robotsTxtContent != null) {
addRobotsContent(robotsTxtContent, redir, response);
}
}
}
if (response.getCode() == 200) {
// found rules: parse them
robotRules = parseRules(url.toString(), response.getContent(), response.getHeader("Content-Type"), agentNames);
} else if ((response.getCode() == 403) && (!allowForbidden)) {
// use forbid-all rules
robotRules = FORBID_ALL_RULES;
} else if (response.getCode() >= 500) {
// try again later to fetch robots.txt
cacheRule = false;
robotRules = EMPTY_RULES;
} else {
// use default rules
robotRules = EMPTY_RULES;
}
} catch (Throwable t) {
if (LOG.isInfoEnabled()) {
LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
}
// try again later to fetch robots.txt
cacheRule = false;
robotRules = EMPTY_RULES;
}
}
if (cacheRule) {
// cache rules for host
CACHE.put(cacheKey, robotRules);
if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
// cache also for the redirected host
CACHE.put(getCacheKey(redir), robotRules);
}
}
return robotRules;
}
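The parseRules() helper called above delegates to crawler-commons' SimpleRobotRulesParser. As a minimal, self-contained sketch of that underlying call (the URL, agent name, and robots.txt content are made up; in crawler-commons 1.4+ the last argument of parseContent is a Collection<String> rather than a comma-separated String):

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotRulesDemo {
  public static void main(String[] args) {
    // made-up robots.txt content for illustration
    byte[] robotsTxt = ("User-agent: *\n"
        + "Disallow: /private/\n"
        + "Crawl-delay: 5\n").getBytes(StandardCharsets.UTF_8);

    // the same parser that backs Nutch's parseRules(); the signature shown here
    // is the pre-1.4 crawler-commons one with a comma-separated agent-name String
    BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
        "http://www.example.com/robots.txt", robotsTxt, "text/plain", "mybot");

    System.out.println(rules.isAllowed("http://www.example.com/private/page.html")); // false
    System.out.println(rules.isAllowed("http://www.example.com/index.html"));        // true
    System.out.println(rules.getCrawlDelay()); // 5000 - crawler-commons stores the delay in milliseconds
  }
}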
Use of crawlercommons.robots.BaseRobotRules in project Nutch by Apache.
The class FtpRobotRulesParser, method getRobotRulesSet.
/**
* Get the rules from robots.txt which apply to the given {@code url}.
* Robot rules are cached per protocol and host. If no rules are found in the
* cache, an FTP request is sent to the host of the given {@link URL} to fetch
* the robots.txt file; the rules are parsed and the resulting rules object is
* cached to avoid re-fetching and re-parsing in the future.
*
* @param ftp
* The {@link Protocol} object
* @param url
* URL
* @param robotsTxtContent
* container to store responses when fetching the robots.txt file for
* debugging or archival purposes. Instead of a robots.txt file, it
* may include redirects or an error page (404, etc.). Response
* {@link Content} is appended to the passed list. If null is passed
* nothing is stored.
*
* @return robotRules A {@link BaseRobotRules} object for the rules
*/
@Override
public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url, List<Content> robotsTxtContent) {
// normalize protocol and host to lower case
String protocol = url.getProtocol().toLowerCase();
String host = url.getHost().toLowerCase();
if (LOG.isTraceEnabled() && isWhiteListed(url)) {
LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
}
BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
if (robotRules != null) {
// cached rule
return robotRules;
} else if (LOG.isTraceEnabled()) {
LOG.trace("cache miss " + url);
}
boolean cacheRule = true;
if (isWhiteListed(url)) {
// check in advance whether a host is whitelisted
// (we do not need to fetch robots.txt)
robotRules = EMPTY_RULES;
LOG.info("Whitelisted host found for: {}", url);
LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
} else {
try {
Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl, new CrawlDatum());
ProtocolStatus status = output.getStatus();
if (robotsTxtContent != null) {
robotsTxtContent.add(output.getContent());
}
if (status.getCode() == ProtocolStatus.SUCCESS) {
robotRules = parseRules(url.toString(), output.getContent().getContent(), CONTENT_TYPE, agentNames);
} else {
// use default rules
robotRules = EMPTY_RULES;
}
} catch (Throwable t) {
if (LOG.isInfoEnabled()) {
LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
}
// try again later to fetch robots.txt
cacheRule = false;
robotRules = EMPTY_RULES;
}
}
if (cacheRule)
// cache rules for host
CACHE.put(protocol + ":" + host, robotRules);
return robotRules;
}
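Both parsers fall back to the shared EMPTY_RULES and FORBID_ALL_RULES constants defined in their common RobotRulesParser base class. A hedged sketch of how such defaults can be expressed with crawler-commons' SimpleRobotRules; the class and field names below are illustrative, and only the RobotRulesMode enum and constructor are crawler-commons API:

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;

public class DefaultRules {
  // "allow everything" fallback used when no usable robots.txt is found
  public static final BaseRobotRules EMPTY_RULES =
      new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);

  // "deny everything" rules, e.g. when the server answers 403 for robots.txt
  public static final BaseRobotRules FORBID_ALL_RULES =
      new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);

  public static void main(String[] args) {
    System.out.println(EMPTY_RULES.isAllowed("http://example.com/any/path"));      // true
    System.out.println(FORBID_ALL_RULES.isAllowed("http://example.com/any/path")); // false
  }
}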
Use of crawlercommons.robots.BaseRobotRules in project Nutch by Apache.
The class FetcherThread, method run.
@SuppressWarnings("fallthrough")
public void run() {
// count threads
activeThreads.incrementAndGet();
Text url = new Text();
FetchItem fit = null;
try {
// checking for the server to be running and fetcher.parse to be true
if (parsing && NutchServer.getInstance().isRunning())
reportToNutchServer = true;
while (true) {
// creating FetchNode for storing in FetchNodeDb
if (reportToNutchServer)
this.fetchNode = new FetchNode();
else
this.fetchNode = null;
// check whether must be stopped
if (isHalted()) {
LOG.debug(getName() + " set to halted");
fit = null;
return;
}
fit = ((FetchItemQueues) fetchQueues).getFetchItem();
if (fit != null) {
URL u = fit.u;
String temp_url = u.getProtocol() + "://" + u.getHost() + ":" + u.getPort() + u.getFile();
url = new Text(temp_url);
}
if (fit == null) {
if (feeder.isAlive() || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) {
LOG.debug(getName() + " spin-waiting ...");
// spin-wait.
((AtomicInteger) spinWaiting).incrementAndGet();
try {
Thread.sleep(500);
} catch (Exception e) {
}
((AtomicInteger) spinWaiting).decrementAndGet();
continue;
} else {
// all done, finish this thread
LOG.info(getName() + " " + Thread.currentThread().getId() + " has no more work available");
return;
}
}
lastRequestStart.set(System.currentTimeMillis());
Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
if (reprUrlWritable == null) {
setReprUrl(url.toString());
} else {
setReprUrl(reprUrlWritable.toString());
}
try {
// fetch the page
redirecting = false;
redirectCount = 0;
// Publisher event
if (activatePublisher) {
FetcherThreadEvent startEvent = new FetcherThreadEvent(PublishEventType.START, fit.getUrl().toString());
publisher.publish(startEvent, conf);
}
do {
if (LOG.isInfoEnabled()) {
LOG.info(getName() + " " + Thread.currentThread().getId() + " fetching " + url + " (queue crawl delay=" + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay + "ms)");
}
if (LOG.isDebugEnabled()) {
LOG.debug("redirectCount=" + redirectCount);
}
redirecting = false;
Protocol protocol = this.protocolFactory.getProtocol(url.toString());
BaseRobotRules rules = protocol.getRobotRules(url, fit.datum, robotsTxtContent);
if (robotsTxtContent != null) {
outputRobotsTxt(robotsTxtContent);
robotsTxtContent.clear();
}
if (!rules.isAllowed(fit.u.toString())) {
// unblock
((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
if (LOG.isDebugEnabled()) {
LOG.debug("Denied by robots.txt: " + url);
}
output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
context.getCounter("FetcherStatus", "robots_denied").increment(1);
continue;
}
if (rules.getCrawlDelay() > 0) {
if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
// unblock
((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
LOG.debug("Crawl-Delay for " + url + " too long (" + rules.getCrawlDelay() + "), skipping");
output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
context.getCounter("FetcherStatus", "robots_denied_maxcrawldelay").increment(1);
continue;
} else {
FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
fiq.crawlDelay = rules.getCrawlDelay();
if (LOG.isDebugEnabled()) {
LOG.debug("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + url);
}
}
}
ProtocolOutput output = protocol.getProtocolOutput(url, fit.datum);
ProtocolStatus status = output.getStatus();
Content content = output.getContent();
ParseStatus pstatus = null;
// unblock queue
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
String urlString = url.toString();
// used for FetchNode
if (fetchNode != null) {
fetchNode.setStatus(status.getCode());
fetchNode.setFetchTime(System.currentTimeMillis());
fetchNode.setUrl(url);
}
// Publish fetch finish event
if (activatePublisher) {
FetcherThreadEvent endEvent = new FetcherThreadEvent(PublishEventType.END, fit.getUrl().toString());
endEvent.addEventData("status", status.getName());
publisher.publish(endEvent, conf);
}
context.getCounter("FetcherStatus", status.getName()).increment(1);
switch(status.getCode()) {
case ProtocolStatus.WOULDBLOCK:
// retry ?
((FetchItemQueues) fetchQueues).addFetchItem(fit);
break;
case ProtocolStatus.SUCCESS:
// got a page
pstatus = output(url, fit.datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
updateStatus(content.getContent().length);
if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
String newUrl = pstatus.getMessage();
int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR);
if (redirUrl != null) {
fit = queueRedirect(redirUrl, fit);
}
}
break;
// redirect
case ProtocolStatus.MOVED:
case ProtocolStatus.TEMP_MOVED:
int code;
boolean temp;
if (status.getCode() == ProtocolStatus.MOVED) {
code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
temp = false;
} else {
code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
temp = true;
}
output(url, fit.datum, content, status, code);
String newUrl = status.getMessage();
Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR);
if (redirUrl != null) {
fit = queueRedirect(redirUrl, fit);
} else {
// stop redirecting
redirecting = false;
}
break;
case ProtocolStatus.EXCEPTION:
logError(url, status.getMessage());
int killedURLs = ((FetchItemQueues) fetchQueues).checkExceptionThreshold(fit.getQueueID());
if (killedURLs != 0)
context.getCounter("FetcherStatus", "AboveExceptionThresholdInQueue").increment(killedURLs);
// retry
case ProtocolStatus.RETRY:
case ProtocolStatus.BLOCKED:
output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
break;
// gone
case ProtocolStatus.GONE:
case ProtocolStatus.NOTFOUND:
case ProtocolStatus.ACCESS_DENIED:
case ProtocolStatus.ROBOTS_DENIED:
output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
break;
case ProtocolStatus.NOTMODIFIED:
output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
break;
default:
if (LOG.isWarnEnabled()) {
LOG.warn(getName() + " " + Thread.currentThread().getId() + " Unknown ProtocolStatus: " + status.getCode());
}
output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
}
if (redirecting && redirectCount > maxRedirect) {
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
if (LOG.isInfoEnabled()) {
LOG.info(getName() + " " + Thread.currentThread().getId() + " - redirect count exceeded " + url);
}
output(url, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, CrawlDatum.STATUS_FETCH_GONE);
}
} while (redirecting && (redirectCount <= maxRedirect));
} catch (Throwable t) {
// unexpected exception
// unblock
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
logError(url, StringUtils.stringifyException(t));
output(url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY);
}
}
} catch (Throwable e) {
if (LOG.isErrorEnabled()) {
LOG.error("fetcher caught:" + e.toString());
}
} finally {
if (fit != null)
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
// count threads
activeThreads.decrementAndGet();
LOG.info(getName() + " " + Thread.currentThread().getId() + " -finishing thread " + getName() + ", activeThreads=" + activeThreads);
}
}
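Stripped of queueing and counters, the robots handling inside the loop above reduces to two checks on the BaseRobotRules object: is the URL allowed, and is the advertised crawl delay within the configured maximum. A standalone sketch of that gate; mayFetch and the maxCrawlDelayMs parameter are illustrative names, not Fetcher internals:

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;

public class RobotsGateSketch {

  /** Mirrors the two checks in FetcherThread.run(): robots denial and crawl-delay cap. */
  static boolean mayFetch(BaseRobotRules rules, String url, long maxCrawlDelayMs) {
    if (!rules.isAllowed(url)) {
      System.out.println("Denied by robots.txt: " + url);
      return false;
    }
    long crawlDelay = rules.getCrawlDelay();
    if (crawlDelay > 0 && maxCrawlDelayMs >= 0 && crawlDelay > maxCrawlDelayMs) {
      System.out.println("Crawl-Delay for " + url + " too long (" + crawlDelay + " ms), skipping");
      return false;
    }
    return true;
  }

  public static void main(String[] args) {
    BaseRobotRules allowAll = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
    System.out.println(mayFetch(allowAll, "http://example.com/page.html", 30000L)); // true
  }
}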
Use of crawlercommons.robots.BaseRobotRules in project Nutch by Apache.
The class RobotRulesParser, method run.
@Override
public int run(String[] args) {
if (args.length < 2) {
String[] help = {
"Usage: RobotRulesParser [ -Dproperty=... ] <robots-file-or-url> <url-file> [<agent-names>]",
"",
"<robots-file-or-url>\tlocal file or URL parsed as robots.txt file",
"\tIf <robots-file-or-url> starts with a protocol specification",
"\t(`http', `https', `ftp' or `file'), robots.txt is fetched",
"\tusing the specified protocol. Otherwise, a local file is assumed.",
"",
"<url-file>\tlocal file with URLs (one per line), for every URL",
"\tthe path part (including the query) is checked whether",
"\tit is allowed by the robots.txt rules. Other parts of the URLs",
"\t(mainly the host) are ignored.",
"",
"<agent-names>\tcomma-separated list of agent names",
"\tused to select rules from the robots.txt file.",
"\tIf no agent name is given the property http.agent.name is used.",
"\tIf http.agent.name is empty, robots.txt is checked for rules",
"\tassigned to the user agent `*' (meaning any other).",
"",
"Important properties:",
" -D fetcher.store.robotstxt=true",
"\toutput content and HTTP meta data of fetched robots.txt (if not a local file)",
" -D http.agent.name=...\tsame as argument <agent-names>",
" -D http.robots.agents=...\tadditional agent names",
" -D http.robot.rules.whitelist=..." };
for (String s : help) {
System.err.println(s);
}
return -1;
}
Protocol protocol = null;
URL robotsTxtUrl = null;
if (args[0].matches("^(?:https?|ftp|file)://?.*")) {
try {
robotsTxtUrl = new URL(args[0]);
} catch (MalformedURLException e) {
LOG.warn("Not a valid URL, assuming local file: {}", args[0]);
}
ProtocolFactory factory = new ProtocolFactory(conf);
try {
protocol = factory.getProtocol(robotsTxtUrl.toString());
} catch (ProtocolNotFound e) {
LOG.error("No protocol found for {}: {}", args[0], StringUtils.stringifyException(e));
return -1;
}
}
if (robotsTxtUrl == null) {
// try as local file
File robotsFile = new File(args[0]);
if (!robotsFile.exists()) {
LOG.error("File does not exist: {}", args[0]);
return -1;
} else {
try {
robotsTxtUrl = robotsFile.toURI().toURL();
} catch (MalformedURLException e) {
}
}
}
File urlFile = new File(args[1]);
if (args.length > 2) {
// set agent name from command-line in configuration and update parser
String agents = args[2];
conf.set("http.agent.name", agents);
setConf(conf);
}
List<Content> robotsTxtContent = null;
if (getConf().getBoolean("fetcher.store.robotstxt", false)) {
robotsTxtContent = new LinkedList<>();
}
try {
BaseRobotRules rules = getRobotRulesSet(protocol, robotsTxtUrl, robotsTxtContent);
if (robotsTxtContent != null) {
for (Content robotsTxt : robotsTxtContent) {
LOG.info("fetched robots.txt {}:", robotsTxt.getUrl());
LOG.info(robotsTxt.toString());
}
}
System.out.println("Testing robots.txt for agent names: " + agentNames);
LineNumberReader testsIn = new LineNumberReader(new FileReader(urlFile));
String testPath;
testPath = testsIn.readLine();
while (testPath != null) {
testPath = testPath.trim();
try {
// testPath can be just a path or a complete URL
URL url = new URL(testPath);
String status;
if (isWhiteListed(url)) {
status = "whitelisted";
} else if (rules.isAllowed(testPath)) {
status = "allowed";
} else {
status = "not allowed";
}
System.out.println(status + ":\t" + testPath);
} catch (MalformedURLException e) {
LOG.warn("Not a valid URL: {}", testPath);
}
testPath = testsIn.readLine();
}
testsIn.close();
} catch (IOException e) {
LOG.error("Failed to run: " + StringUtils.stringifyException(e));
return -1;
}
return 0;
}