Use of org.apache.nutch.protocol.ProtocolStatus in project nutch by apache: class CrawlDatum, method evaluate().
public boolean evaluate(Expression expr, String url) {
  if (expr != null && url != null) {
    // Create a context and add data
    JexlContext jcontext = new MapContext();
    // https://issues.apache.org/jira/browse/NUTCH-2229
    jcontext.set("url", url);
    jcontext.set("status", getStatusName(getStatus()));
    jcontext.set("fetchTime", (long) (getFetchTime()));
    jcontext.set("modifiedTime", (long) (getModifiedTime()));
    jcontext.set("retries", getRetriesSinceFetch());
    jcontext.set("interval", Integer.valueOf(getFetchInterval()));
    jcontext.set("score", getScore());
    jcontext.set("signature", StringUtil.toHexString(getSignature()));
    // Set metadata variables
    for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
      Object value = entry.getValue();
      Text tkey = (Text) entry.getKey();
      if (value instanceof FloatWritable) {
        FloatWritable fvalue = (FloatWritable) value;
        jcontext.set(tkey.toString(), fvalue.get());
      }
      if (value instanceof IntWritable) {
        IntWritable ivalue = (IntWritable) value;
        jcontext.set(tkey.toString(), ivalue.get());
      }
      if (value instanceof Text) {
        Text tvalue = (Text) value;
        jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString());
      }
      if (value instanceof ProtocolStatus) {
        ProtocolStatus pvalue = (ProtocolStatus) value;
        jcontext.set(tkey.toString().replace("-", "_"), pvalue.toString());
      }
    }
    try {
      if (Boolean.TRUE.equals(expr.evaluate(jcontext))) {
        return true;
      }
    } catch (Exception e) {
      // Treat evaluation errors as a non-match and fall through to false
    }
  }
  return false;
}
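A minimal caller sketch (not part of the Nutch sources) showing how such an expression might be built and passed in. It assumes the Expression parameter above is the Commons JEXL 2 type (org.apache.commons.jexl2.Expression), which matches the expr.evaluate(jcontext) call; the class name, expression text and URL are illustrative only.

import org.apache.commons.jexl2.Expression;
import org.apache.commons.jexl2.JexlEngine;
import org.apache.nutch.crawl.CrawlDatum;

// Hypothetical caller, not part of Nutch itself.
public class CrawlDatumFilterSketch {
  public static boolean keep(CrawlDatum datum, String url) {
    JexlEngine jexl = new JexlEngine();
    // Variable names match those bound into the JexlContext above
    // (url, status, fetchTime, retries, interval, score, signature, metadata keys).
    Expression expr = jexl.createExpression("status == 'db_fetched' && score > 0.5");
    return datum.evaluate(expr, url);
  }
}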
Use of org.apache.nutch.protocol.ProtocolStatus in project nutch by apache: class UpdateHostDbMapper, method map().
/**
 * Mapper ingesting records from the HostDB, CrawlDB and plaintext host
 * scores file. Statistics and scores are passed on.
 *
 * @param key record {@link org.apache.hadoop.io.Text} key
 * @param value associated {@link org.apache.hadoop.io.Writable} object
 * @param context {@link org.apache.hadoop.mapreduce.Mapper.Context} for
 *          writing custom counters and output.
 */
@Override
public void map(Text key, Writable value, Context context)
    throws IOException, InterruptedException {
  // Get the key!
  String keyStr = key.toString();
  // Check if we process records from the CrawlDB
  if (key instanceof Text && value instanceof CrawlDatum) {
    // Get the normalized and filtered host of this URL
    buffer = filterNormalize(URLUtil.getHost(keyStr));
    // Filtered out?
    if (buffer == null) {
      context.getCounter("UpdateHostDb", "filtered_records").increment(1);
      LOG.info("UpdateHostDb: " + URLUtil.getHost(keyStr) + " crawldatum has been filtered");
      return;
    }
    // Set the host of this URL
    host.set(buffer);
    crawlDatum = (CrawlDatum) value;
    hostDatum = new HostDatum();
    // Do not resolve homepages when the root URL is unfetched
    if (crawlDatum.getStatus() != CrawlDatum.STATUS_DB_UNFETCHED) {
      // Get the protocol
      String protocol = URLUtil.getProtocol(keyStr);
      // Get the proposed homepage URL
      String homepage = protocol + "://" + buffer + "/";
      // Check if the current key equals the homepage URL
      if (keyStr.equals(homepage)) {
        // Check if this is a redirect to the real home page
        if (crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_PERM
            || crawlDatum.getStatus() == CrawlDatum.STATUS_DB_REDIR_TEMP) {
          // Obtain the repr url for this redirect via the ProtocolStatus from the metadata
          ProtocolStatus z = (ProtocolStatus) crawlDatum.getMetaData()
              .get(Nutch.WRITABLE_PROTO_STATUS_KEY);
          // Get the protocol status' arguments
          args = z.getArgs();
          // ..and the possible redirect URL
          reprUrl = args[0];
          // Am I a redirect?
          if (reprUrl != null) {
            LOG.info("UpdateHostDb: homepage: " + keyStr + " redirects to: " + args[0]);
            context.write(host, new NutchWritable(hostDatum));
            hostDatum.setHomepageUrl(reprUrl);
          } else {
            LOG.info("UpdateHostDb: homepage: " + keyStr + " redirects to: " + args[0]
                + " but has been filtered out");
          }
        } else {
          hostDatum.setHomepageUrl(homepage);
          context.write(host, new NutchWritable(hostDatum));
          LOG.info("UpdateHostDb: homepage: " + homepage);
        }
      }
    }
    // Always emit crawl datum
    context.write(host, new NutchWritable(crawlDatum));
  }
  // Check if we got a record from the HostDB
  if (key instanceof Text && value instanceof HostDatum) {
    buffer = filterNormalize(keyStr);
    // Filtered out?
    if (buffer == null) {
      context.getCounter("UpdateHostDb", "filtered_records").increment(1);
      LOG.info("UpdateHostDb: {} hostdatum has been filtered", keyStr);
      return;
    }
    // Get a HostDatum
    hostDatum = (HostDatum) value;
    key.set(buffer);
    // Reset db_* statistics; we're aggregating them from the CrawlDB anyway
    if (readingCrawlDb) {
      hostDatum.resetStatistics();
    }
    context.write(key, new NutchWritable(hostDatum));
  }
  // Check if we got a record with host scores
  if (key instanceof Text && value instanceof Text) {
    buffer = filterNormalize(keyStr);
    // Filtered out?
    if (buffer == null) {
      context.getCounter("UpdateHostDb", "filtered_records").increment(1);
      LOG.info("UpdateHostDb: {} score has been filtered", keyStr);
      return;
    }
    key.set(buffer);
    context.write(key,
        new NutchWritable(new FloatWritable(Float.parseFloat(value.toString()))));
  }
}
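The three instanceof branches exist because the same mapper receives CrawlDatum records from the CrawlDB, HostDatum records from the HostDB and plain Text scores from a text file. A rough driver sketch under that assumption; the class name, paths, input formats and remaining job wiring are placeholders and not the actual UpdateHostDb job configuration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;

// Hypothetical driver fragment illustrating why map() branches on the value type.
public class UpdateHostDbDriverSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "UpdateHostDb");
    // CrawlDb and HostDb parts are SequenceFiles of <Text, CrawlDatum> and
    // <Text, HostDatum>; the plaintext score file is read as <Text, Text>.
    MultipleInputs.addInputPath(job, new Path("crawldb/current"),
        SequenceFileInputFormat.class, UpdateHostDbMapper.class);
    MultipleInputs.addInputPath(job, new Path("hostdb/current"),
        SequenceFileInputFormat.class, UpdateHostDbMapper.class);
    MultipleInputs.addInputPath(job, new Path("host-scores.txt"),
        KeyValueTextInputFormat.class, UpdateHostDbMapper.class);
    // Remaining job settings (output types, reducer, output path) omitted in this sketch.
  }
}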
Use of org.apache.nutch.protocol.ProtocolStatus in project nutch by apache: class FetcherThread, method run().
@SuppressWarnings("fallthrough")
public void run() {
  // count threads
  activeThreads.incrementAndGet();
  FetchItem fit = null;
  try {
    // checking for the server to be running and fetcher.parse to be true
    if (parsing && NutchServer.getInstance().isRunning())
      reportToNutchServer = true;
    while (true) {
      // creating FetchNode for storing in FetchNodeDb
      if (reportToNutchServer)
        this.fetchNode = new FetchNode();
      else
        this.fetchNode = null;
      // check whether must be stopped
      if (isHalted()) {
        LOG.debug("{} set to halted", getName());
        fit = null;
        return;
      }
      fit = fetchQueues.getFetchItem();
      if (fit == null) {
        if (feeder.isAlive() || fetchQueues.getTotalSize() > 0) {
          LOG.debug("{} spin-waiting ...", getName());
          // spin-wait.
          spinWaiting.incrementAndGet();
          try {
            Thread.sleep(500);
          } catch (Exception e) {
            // interrupted sleep is harmless; just re-check the queue
          }
          spinWaiting.decrementAndGet();
          continue;
        } else {
          // all done, finish this thread
          LOG.info("{} {} has no more work available", getName(),
              Thread.currentThread().getId());
          return;
        }
      }
      lastRequestStart.set(System.currentTimeMillis());
      Text reprUrlWritable = (Text) fit.datum.getMetaData()
          .get(Nutch.WRITABLE_REPR_URL_KEY);
      if (reprUrlWritable == null) {
        setReprUrl(fit.url.toString());
      } else {
        setReprUrl(reprUrlWritable.toString());
      }
      try {
        // fetch the page
        redirecting = false;
        redirectCount = 0;
        // Publisher event
        if (activatePublisher) {
          FetcherThreadEvent startEvent = new FetcherThreadEvent(
              PublishEventType.START, fit.getUrl().toString());
          publisher.publish(startEvent, conf);
        }
        do {
          if (LOG.isInfoEnabled()) {
            LOG.info("{} {} fetching {} (queue crawl delay={}ms)", getName(),
                Thread.currentThread().getId(), fit.url,
                fetchQueues.getFetchItemQueue(fit.queueID).crawlDelay);
          }
          LOG.debug("redirectCount={}", redirectCount);
          redirecting = false;
          Protocol protocol = this.protocolFactory.getProtocol(fit.u);
          BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum,
              robotsTxtContent);
          if (robotsTxtContent != null) {
            outputRobotsTxt(robotsTxtContent);
            robotsTxtContent.clear();
          }
          if (rules.isDeferVisits()) {
            LOG.info("Defer visits for queue {} : {}", fit.queueID, fit.url);
            // retry the fetch item
            if (fetchQueues.timelimitExceeded()) {
              fetchQueues.finishFetchItem(fit, true);
            } else {
              fetchQueues.addFetchItem(fit);
            }
            // but check whether it's time to cancel the queue
            int killedURLs = fetchQueues.checkExceptionThreshold(fit.getQueueID(),
                this.robotsDeferVisitsRetries + 1, this.robotsDeferVisitsDelay);
            if (killedURLs != 0) {
              context.getCounter("FetcherStatus", "robots_defer_visits_dropped")
                  .increment(killedURLs);
            }
            continue;
          }
          if (!rules.isAllowed(fit.url.toString())) {
            // unblock
            fetchQueues.finishFetchItem(fit, true);
            LOG.info("Denied by robots.txt: {}", fit.url);
            output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED,
                CrawlDatum.STATUS_FETCH_GONE);
            context.getCounter("FetcherStatus", "robots_denied").increment(1);
            continue;
          }
          if (rules.getCrawlDelay() > 0) {
            if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
              // unblock
              fetchQueues.finishFetchItem(fit, true);
              LOG.info("Crawl-Delay for {} too long ({} ms), skipping", fit.url,
                  rules.getCrawlDelay());
              output(fit.url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED,
                  CrawlDatum.STATUS_FETCH_GONE);
              context.getCounter("FetcherStatus", "robots_denied_maxcrawldelay")
                  .increment(1);
              continue;
            } else {
              FetchItemQueue fiq = fetchQueues.getFetchItemQueue(fit.queueID);
              long crawlDelay = rules.getCrawlDelay();
              if (crawlDelay < minCrawlDelay) {
                LOG.info("Crawl-Delay for {} too short ({} ms), adjusting to {} ms",
                    fit.url, rules.getCrawlDelay(), minCrawlDelay);
                crawlDelay = minCrawlDelay;
              }
              fiq.crawlDelay = crawlDelay;
              LOG.debug("Crawl delay for queue: {} is set to {} as per robots.txt. url: {}",
                  fit.queueID, fiq.crawlDelay, fit.url);
            }
          }
          ProtocolOutput output = protocol.getProtocolOutput(fit.url, fit.datum);
          ProtocolStatus status = output.getStatus();
          Content content = output.getContent();
          ParseStatus pstatus = null;
          // unblock queue
          fetchQueues.finishFetchItem(fit);
          // used for FetchNode
          if (fetchNode != null) {
            fetchNode.setStatus(status.getCode());
            fetchNode.setFetchTime(System.currentTimeMillis());
            fetchNode.setUrl(fit.url);
          }
          // Publish fetch finish event
          if (activatePublisher) {
            FetcherThreadEvent endEvent = new FetcherThreadEvent(
                PublishEventType.END, fit.getUrl().toString());
            endEvent.addEventData("status", status.getName());
            publisher.publish(endEvent, conf);
          }
          context.getCounter("FetcherStatus", status.getName()).increment(1);
          switch (status.getCode()) {
            case ProtocolStatus.WOULDBLOCK:
              // retry ?
              fetchQueues.addFetchItem(fit);
              break;
            case ProtocolStatus.SUCCESS: // got a page
              pstatus = output(fit.url, fit.datum, content, status,
                  CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
              updateStatus(content.getContent().length);
              if (pstatus != null && pstatus.isSuccess()
                  && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
                String newUrl = pstatus.getMessage();
                int refreshTime = Integer.parseInt(pstatus.getArgs()[1]);
                Text redirUrl = handleRedirect(fit, newUrl,
                    refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR);
                if (redirUrl != null) {
                  fit = queueRedirect(redirUrl, fit);
                }
              }
              break;
            // redirect
            case ProtocolStatus.MOVED:
            case ProtocolStatus.TEMP_MOVED:
              int code;
              boolean temp;
              if (status.getCode() == ProtocolStatus.MOVED) {
                code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
                temp = false;
              } else {
                code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
                temp = true;
              }
              output(fit.url, fit.datum, content, status, code);
              String newUrl = status.getMessage();
              Text redirUrl = handleRedirect(fit, newUrl, temp, Fetcher.PROTOCOL_REDIR);
              if (redirUrl != null) {
                fit = queueRedirect(redirUrl, fit);
              } else {
                // stop redirecting
                redirecting = false;
              }
              break;
            case ProtocolStatus.EXCEPTION:
              logError(fit.url, status.getMessage());
              int killedURLs = fetchQueues.checkExceptionThreshold(fit.getQueueID());
              if (killedURLs != 0)
                context.getCounter("FetcherStatus", "AboveExceptionThresholdInQueue")
                    .increment(killedURLs);
              // retry
            case ProtocolStatus.RETRY:
            case ProtocolStatus.BLOCKED:
              output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
              break;
            // gone
            case ProtocolStatus.GONE:
            case ProtocolStatus.NOTFOUND:
            case ProtocolStatus.ACCESS_DENIED:
            case ProtocolStatus.ROBOTS_DENIED:
              output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
              break;
            case ProtocolStatus.NOTMODIFIED:
              output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
              break;
            default:
              if (LOG.isWarnEnabled()) {
                LOG.warn("{} {} Unknown ProtocolStatus: {}", getName(),
                    Thread.currentThread().getId(), status.getCode());
              }
              output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
          }
          if (redirecting && redirectCount > maxRedirect) {
            fetchQueues.finishFetchItem(fit);
            context.getCounter("FetcherStatus", "redirect_count_exceeded").increment(1);
            if (LOG.isInfoEnabled()) {
              LOG.info("{} {} - redirect count exceeded {} ({})", getName(),
                  Thread.currentThread().getId(), fit.url,
                  maxRedirectExceededSkip ? "skipped" : "linked");
            }
            if (maxRedirectExceededSkip) {
              // skip redirect target when redirect count is exceeded
            } else {
              Text newUrl = new Text(status.getMessage());
              CrawlDatum newDatum = createRedirDatum(newUrl, fit, CrawlDatum.STATUS_LINKED);
              output(newUrl, newDatum, null, null, CrawlDatum.STATUS_LINKED);
            }
          }
        } while (redirecting && (redirectCount <= maxRedirect));
      } catch (Throwable t) {
        // unexpected exception
        // unblock
        fetchQueues.finishFetchItem(fit);
        String message;
        if (LOG.isDebugEnabled()) {
          message = StringUtils.stringifyException(t);
        } else if (logUtil.logShort(t)) {
          message = t.getClass().getName();
        } else {
          message = StringUtils.stringifyException(t);
        }
        logError(fit.url, message);
        output(fit.url, fit.datum, null, ProtocolStatus.STATUS_FAILED,
            CrawlDatum.STATUS_FETCH_RETRY);
      }
    }
  } catch (Throwable e) {
    if (LOG.isErrorEnabled()) {
      LOG.error("fetcher caught:", e);
    }
  } finally {
    if (fit != null) {
      fetchQueues.finishFetchItem(fit);
    }
    // count threads
    activeThreads.decrementAndGet();
    LOG.info("{} {} -finishing thread {}, activeThreads={}", getName(),
        Thread.currentThread().getId(), getName(), activeThreads);
  }
}
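For context, the ProtocolStatus that UpdateHostDbMapper (above) and SegmentHandler (below) read back from Nutch.WRITABLE_PROTO_STATUS_KEY is assumed to be written on the fetcher side roughly as in the following sketch; the class and method names are hypothetical, and the real write happens inside FetcherThread's output handling, which is not shown in this excerpt.

import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.ProtocolStatus;

// Hypothetical helper illustrating the assumed write side of the metadata key.
class ProtocolStatusMetadata {
  static void record(CrawlDatum datum, ProtocolStatus status) {
    // ProtocolStatus implements Writable, so it can be stored directly in the
    // CrawlDatum's metadata map under the well-known key read back elsewhere.
    datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, status);
  }
}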
Use of org.apache.nutch.protocol.ProtocolStatus in project nutch by apache: class SegmentHandler, method handle().
@Override
public void handle(Request req, HttpServletResponse res, String target,
    int dispatch) throws IOException, ServletException {
  try {
    String uri = req.getUri().toString();
    LOG.info("URI: " + uri);
    addMyHeader(res, "URI", uri);
    Text url = new Text(uri);
    CrawlDatum cd = seg.getCrawlDatum(url);
    if (cd != null) {
      addMyHeader(res, "Res", "found");
      LOG.info("-got " + cd.toString());
      ProtocolStatus ps = (ProtocolStatus) cd.getMetaData()
          .get(Nutch.WRITABLE_PROTO_STATUS_KEY);
      if (ps != null) {
        Integer trCode = protoCodes.get(ps.getCode());
        if (trCode != null) {
          res.setStatus(trCode.intValue());
        } else {
          res.setStatus(HttpServletResponse.SC_OK);
        }
        addMyHeader(res, "ProtocolStatus", ps.toString());
      } else {
        res.setStatus(HttpServletResponse.SC_OK);
      }
      Content c = seg.getContent(url);
      if (c == null) {
        // missing content
        req.setHandled(true);
        res.addHeader("X-Handled-By", getClass().getSimpleName());
        return;
      }
      byte[] data = c.getContent();
      LOG.debug("-data len=" + data.length);
      Metadata meta = c.getMetadata();
      String[] names = meta.names();
      LOG.debug("- " + names.length + " meta");
      for (int i = 0; i < names.length; i++) {
        boolean my = true;
        char ch = names[i].charAt(0);
        if (Character.isLetter(ch) && Character.isUpperCase(ch)) {
          // pretty good chance it's a standard header
          my = false;
        }
        String[] values = meta.getValues(names[i]);
        for (int k = 0; k < values.length; k++) {
          if (my) {
            addMyHeader(res, names[i], values[k]);
          } else {
            res.addHeader(names[i], values[k]);
          }
        }
      }
      req.setHandled(true);
      res.addHeader("X-Handled-By", getClass().getSimpleName());
      res.setContentType(meta.get(Metadata.CONTENT_TYPE));
      res.setContentLength(data.length);
      OutputStream os = res.getOutputStream();
      os.write(data, 0, data.length);
      res.flushBuffer();
    } else {
      addMyHeader(res, "Res", "not found");
      LOG.info(" -not found " + url);
    }
  } catch (Exception e) {
    e.printStackTrace();
    LOG.warn(StringUtils.stringifyException(e));
    addMyHeader(res, "Res", "Exception: " + StringUtils.stringifyException(e));
  }
}
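protoCodes is the handler's lookup from ProtocolStatus codes to HTTP response codes. Below is a hypothetical initialization, kept to status codes that appear in the FetcherThread switch above; the actual mapping in SegmentHandler may differ.

import java.util.HashMap;
import java.util.Map;
import javax.servlet.http.HttpServletResponse;
import org.apache.nutch.protocol.ProtocolStatus;

// Hypothetical lookup table translating ProtocolStatus codes to servlet status codes.
class ProtoCodesSketch {
  static final Map<Integer, Integer> protoCodes = new HashMap<>();
  static {
    protoCodes.put(ProtocolStatus.SUCCESS, HttpServletResponse.SC_OK);
    protoCodes.put(ProtocolStatus.NOTMODIFIED, HttpServletResponse.SC_NOT_MODIFIED);
    protoCodes.put(ProtocolStatus.NOTFOUND, HttpServletResponse.SC_NOT_FOUND);
    protoCodes.put(ProtocolStatus.GONE, HttpServletResponse.SC_GONE);
    protoCodes.put(ProtocolStatus.MOVED, HttpServletResponse.SC_MOVED_PERMANENTLY);
    protoCodes.put(ProtocolStatus.TEMP_MOVED, HttpServletResponse.SC_MOVED_TEMPORARILY);
    protoCodes.put(ProtocolStatus.ACCESS_DENIED, HttpServletResponse.SC_FORBIDDEN);
    protoCodes.put(ProtocolStatus.ROBOTS_DENIED, HttpServletResponse.SC_FORBIDDEN);
    protoCodes.put(ProtocolStatus.EXCEPTION, HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
  }
}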
Use of org.apache.nutch.protocol.ProtocolStatus in project nutch by apache: class CrawlDatum, method execute().
public boolean execute(JexlScript expr, String url) {
  if (expr != null && url != null) {
    // Create a context and add data
    JexlContext jcontext = new MapContext();
    // https://issues.apache.org/jira/browse/NUTCH-2229
    jcontext.set("url", url);
    jcontext.set("status", getStatusName(getStatus()));
    jcontext.set("fetchTime", (long) (getFetchTime()));
    jcontext.set("modifiedTime", (long) (getModifiedTime()));
    jcontext.set("retries", getRetriesSinceFetch());
    jcontext.set("interval", Integer.valueOf(getFetchInterval()));
    jcontext.set("score", getScore());
    jcontext.set("signature", StringUtil.toHexString(getSignature()));
    // Set metadata variables
    for (Map.Entry<Writable, Writable> entry : getMetaData().entrySet()) {
      Object value = entry.getValue();
      Text tkey = (Text) entry.getKey();
      if (value instanceof FloatWritable) {
        FloatWritable fvalue = (FloatWritable) value;
        jcontext.set(tkey.toString(), fvalue.get());
      }
      if (value instanceof IntWritable) {
        IntWritable ivalue = (IntWritable) value;
        jcontext.set(tkey.toString(), ivalue.get());
      }
      if (value instanceof Text) {
        Text tvalue = (Text) value;
        jcontext.set(tkey.toString().replace("-", "_"), tvalue.toString());
      }
      if (value instanceof ProtocolStatus) {
        ProtocolStatus pvalue = (ProtocolStatus) value;
        jcontext.set(tkey.toString().replace("-", "_"), pvalue.toString());
      }
    }
    try {
      if (Boolean.TRUE.equals(expr.execute(jcontext))) {
        return true;
      }
    } catch (Exception e) {
      // Treat script errors as a non-match and fall through to false
    }
  }
  return false;
}
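A minimal caller sketch (again not part of the Nutch sources) for the JexlScript variant, assuming Commons JEXL 3 (org.apache.commons.jexl3), which matches the expr.execute(jcontext) call above; the class name and script text are illustrative only.

import org.apache.commons.jexl3.JexlBuilder;
import org.apache.commons.jexl3.JexlEngine;
import org.apache.commons.jexl3.JexlScript;
import org.apache.nutch.crawl.CrawlDatum;

// Hypothetical caller, not part of Nutch itself.
public class CrawlDatumScriptSketch {
  public static boolean keep(CrawlDatum datum, String url) {
    JexlEngine jexl = new JexlBuilder().silent(true).strict(true).create();
    // The script can use the same variables the method binds above; here:
    // keep anything unfetched or retried fewer than three times.
    JexlScript script = jexl.createScript("status == 'db_unfetched' || retries < 3");
    return datum.execute(script, url);
  }
}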