Use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.
The class TestProtocolHttp, method fetchPage.
/**
* Fetches the specified <code>page</code> from the local Jetty server and
* checks whether the HTTP response status code matches the expected code.
* JSP pages are also used to exercise redirection.
*
* @param page
* Page to be fetched.
* @param expectedCode
* HTTP response status code expected while fetching the page.
*/
private void fetchPage(String page, int expectedCode) throws Exception {
URL url = new URL("http", "127.0.0.1", port, page);
CrawlDatum crawlDatum = new CrawlDatum();
Response response = http.getResponse(url, crawlDatum, true);
ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()), crawlDatum);
Content content = out.getContent();
assertEquals("HTTP Status Code for " + url, expectedCode, response.getCode());
if (page.compareTo("/nonexists.html") != 0 && page.compareTo("/brokenpage.jsp") != 0 && page.compareTo("/redirection") != 0) {
assertEquals("ContentType " + url, "text/html", content.getContentType());
}
}
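For context, the helper above depends on the test's local Jetty server (bound to port) and a pre-configured Http plugin instance named http. A minimal standalone sketch of the same ProtocolOutput pattern, using only the generic ProtocolFactory API that also appears in the ParserChecker snippet further down, might look like this (the URL is a placeholder assumption, not taken from the test):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.util.NutchConfiguration;

public class ProtocolOutputCheck {
  public static void main(String[] args) throws Exception {
    Configuration conf = NutchConfiguration.create();
    String url = "http://127.0.0.1:8080/index.html"; // placeholder URL
    // pick the protocol plugin registered for the URL scheme
    Protocol protocol = new ProtocolFactory(conf).getProtocol(url);
    ProtocolOutput out = protocol.getProtocolOutput(new Text(url), new CrawlDatum());
    Content content = out.getContent();
    // the two things fetchPage() asserts on: status and content type
    System.out.println("status: " + out.getStatus());
    if (content != null) { // content may be null when the fetch failed
      System.out.println("content type: " + content.getContentType());
    }
  }
}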
Use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.
The class File, method main.
/**
* Quick way for running this class. Useful for debugging.
*/
public static void main(String[] args) throws Exception {
int maxContentLength = Integer.MIN_VALUE;
boolean dumpContent = false;
String urlString = null;
String usage = "Usage: File [-maxContentLength L] [-dumpContent] url";
if (args.length == 0) {
System.err.println(usage);
System.exit(-1);
}
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-maxContentLength")) {
maxContentLength = Integer.parseInt(args[++i]);
} else if (args[i].equals("-dumpContent")) {
dumpContent = true;
} else if (i != args.length - 1) {
System.err.println(usage);
System.exit(-1);
} else
urlString = args[i];
}
File file = new File();
file.setConf(NutchConfiguration.create());
// set maxContentLength
if (maxContentLength != Integer.MIN_VALUE) {
  file.setMaxContentLength(maxContentLength);
}
// set log level
// LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
ProtocolOutput output = file.getProtocolOutput(new Text(urlString), new CrawlDatum());
Content content = output.getContent();
System.err.println("URL: " + content.getUrl());
System.err.println("Status: " + output.getStatus());
System.err.println("Content-Type: " + content.getContentType());
System.err.println("Content-Length: " + content.getMetadata().get(Response.CONTENT_LENGTH));
System.err.println("Last-Modified: " + content.getMetadata().get(Response.LAST_MODIFIED));
String redirectLocation = content.getMetadata().get("Location");
if (redirectLocation != null) {
System.err.println("Location: " + redirectLocation);
}
if (dumpContent) {
System.out.print(new String(content.getContent()));
}
file = null;
}
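Note that main() above prints the status, headers, and (optionally) the content without first checking whether the fetch succeeded. A hypothetical, more defensive dump helper might gate on the protocol status instead; this is only a sketch and is not part of the File class:

// Hypothetical defensive variant of the dump step in main(): only touch the
// content when the fetch reported success. "File" here is the Nutch protocol
// class from this snippet, not java.io.File.
private static void dumpIfSuccessful(File file, String urlString) {
  ProtocolOutput output = file.getProtocolOutput(new Text(urlString), new CrawlDatum());
  if (output.getStatus().isSuccess()) {
    System.out.print(new String(output.getContent().getContent()));
  } else {
    System.err.println("Fetch failed with status: " + output.getStatus());
  }
}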
Use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.
The class FtpRobotRulesParser, method getRobotRulesSet.
/**
* For hosts whose robots rules have not been cached yet, this method sends an
* FTP request to the host of the given {@link URL}, retrieves the robots.txt
* file, parses the rules, and caches the resulting rules object so the work is
* not repeated later.
*
* @param ftp
* The {@link Protocol} object
* @param url
* URL
* @param robotsTxtContent
* container to store responses when fetching the robots.txt file for
* debugging or archival purposes. Instead of a robots.txt file, it
* may include redirects or an error page (404, etc.). Response
* {@link Content} is appended to the passed list. If null is passed
* nothing is stored.
*
* @return a {@link BaseRobotRules} object holding the parsed rules
*/
@Override
public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url, List<Content> robotsTxtContent) {
// normalize to lower case
String protocol = url.getProtocol().toLowerCase();
// normalize to lower case
String host = url.getHost().toLowerCase();
if (LOG.isTraceEnabled() && isWhiteListed(url)) {
LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
}
BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
if (robotRules != null) {
// cached rule
return robotRules;
} else if (LOG.isTraceEnabled()) {
LOG.trace("cache miss " + url);
}
boolean cacheRule = true;
if (isWhiteListed(url)) {
// check in advance whether a host is whitelisted
// (we do not need to fetch robots.txt)
robotRules = EMPTY_RULES;
LOG.info("Whitelisted host found for: {}", url);
LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
} else {
try {
Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl, new CrawlDatum());
ProtocolStatus status = output.getStatus();
if (robotsTxtContent != null) {
robotsTxtContent.add(output.getContent());
}
if (status.getCode() == ProtocolStatus.SUCCESS) {
robotRules = parseRules(url.toString(), output.getContent().getContent(), CONTENT_TYPE, agentNames);
} else {
// use default rules
robotRules = EMPTY_RULES;
}
} catch (Throwable t) {
if (LOG.isInfoEnabled()) {
LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
}
// try again later to fetch robots.txt
cacheRule = false;
robotRules = EMPTY_RULES;
}
}
// cache rules for host
if (cacheRule) {
  CACHE.put(protocol + ":" + host, robotRules);
}
return robotRules;
}
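For reference, the BaseRobotRules object returned here is typically consumed by checking isAllowed() and getCrawlDelay(), exactly as FetcherThread.run() does in the next snippet. A hypothetical helper illustrating that consumption (not part of FtpRobotRulesParser) could look like this:

// Hypothetical helper showing how the returned rules are typically consumed;
// it mirrors the checks FetcherThread.run() performs in the next snippet.
private boolean isFetchAllowed(Ftp ftp, URL url) {
  BaseRobotRules rules = getRobotRulesSet(ftp, url, null);
  if (!rules.isAllowed(url.toString())) {
    return false; // disallowed by robots.txt
  }
  long crawlDelay = rules.getCrawlDelay();
  if (crawlDelay > 0) {
    LOG.debug("Host requests a crawl delay of {} ms", crawlDelay);
  }
  return true;
}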
Use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.
The class FetcherThread, method run.
@SuppressWarnings("fallthrough")
public void run() {
// count threads
activeThreads.incrementAndGet();
Text url = new Text();
FetchItem fit = null;
try {
// checking for the server to be running and fetcher.parse to be true
if (parsing && NutchServer.getInstance().isRunning())
reportToNutchServer = true;
while (true) {
// creating FetchNode for storing in FetchNodeDb
if (reportToNutchServer)
this.fetchNode = new FetchNode();
else
this.fetchNode = null;
// check whether must be stopped
if (isHalted()) {
LOG.debug(getName() + " set to halted");
fit = null;
return;
}
fit = ((FetchItemQueues) fetchQueues).getFetchItem();
if (fit != null) {
URL u = fit.u;
String temp_url = u.getProtocol() + "://" + u.getHost() + ":" + u.getPort() + u.getFile();
url = new Text(temp_url);
}
if (fit == null) {
if (feeder.isAlive() || ((FetchItemQueues) fetchQueues).getTotalSize() > 0) {
LOG.debug(getName() + " spin-waiting ...");
// spin-wait.
((AtomicInteger) spinWaiting).incrementAndGet();
try {
Thread.sleep(500);
} catch (Exception e) {
}
((AtomicInteger) spinWaiting).decrementAndGet();
continue;
} else {
// all done, finish this thread
LOG.info(getName() + " " + Thread.currentThread().getId() + " has no more work available");
return;
}
}
lastRequestStart.set(System.currentTimeMillis());
Text reprUrlWritable = (Text) fit.datum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
if (reprUrlWritable == null) {
setReprUrl(url.toString());
} else {
setReprUrl(reprUrlWritable.toString());
}
try {
// fetch the page
redirecting = false;
redirectCount = 0;
// Publisher event
if (activatePublisher) {
FetcherThreadEvent startEvent = new FetcherThreadEvent(PublishEventType.START, fit.getUrl().toString());
publisher.publish(startEvent, conf);
}
do {
if (LOG.isInfoEnabled()) {
LOG.info(getName() + " " + Thread.currentThread().getId() + " fetching " + url + " (queue crawl delay=" + ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID).crawlDelay + "ms)");
}
if (LOG.isDebugEnabled()) {
LOG.debug("redirectCount=" + redirectCount);
}
redirecting = false;
Protocol protocol = this.protocolFactory.getProtocol(url.toString());
BaseRobotRules rules = protocol.getRobotRules(url, fit.datum, robotsTxtContent);
if (robotsTxtContent != null) {
outputRobotsTxt(robotsTxtContent);
robotsTxtContent.clear();
}
if (!rules.isAllowed(fit.u.toString())) {
// unblock
((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
if (LOG.isDebugEnabled()) {
LOG.debug("Denied by robots.txt: " + url);
}
output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
context.getCounter("FetcherStatus", "robots_denied").increment(1);
continue;
}
if (rules.getCrawlDelay() > 0) {
if (rules.getCrawlDelay() > maxCrawlDelay && maxCrawlDelay >= 0) {
// unblock
((FetchItemQueues) fetchQueues).finishFetchItem(fit, true);
LOG.debug("Crawl-Delay for " + url + " too long (" + rules.getCrawlDelay() + "), skipping");
output(url, fit.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, CrawlDatum.STATUS_FETCH_GONE);
context.getCounter("FetcherStatus", "robots_denied_maxcrawldelay").increment(1);
continue;
} else {
FetchItemQueue fiq = ((FetchItemQueues) fetchQueues).getFetchItemQueue(fit.queueID);
fiq.crawlDelay = rules.getCrawlDelay();
if (LOG.isDebugEnabled()) {
LOG.debug("Crawl delay for queue: " + fit.queueID + " is set to " + fiq.crawlDelay + " as per robots.txt. url: " + url);
}
}
}
ProtocolOutput output = protocol.getProtocolOutput(url, fit.datum);
ProtocolStatus status = output.getStatus();
Content content = output.getContent();
ParseStatus pstatus = null;
// unblock queue
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
String urlString = url.toString();
// used for FetchNode
if (fetchNode != null) {
fetchNode.setStatus(status.getCode());
fetchNode.setFetchTime(System.currentTimeMillis());
fetchNode.setUrl(url);
}
// Publish fetch finish event
if (activatePublisher) {
FetcherThreadEvent endEvent = new FetcherThreadEvent(PublishEventType.END, fit.getUrl().toString());
endEvent.addEventData("status", status.getName());
publisher.publish(endEvent, conf);
}
context.getCounter("FetcherStatus", status.getName()).increment(1);
switch(status.getCode()) {
case ProtocolStatus.WOULDBLOCK:
// retry ?
((FetchItemQueues) fetchQueues).addFetchItem(fit);
break;
// got a page
case ProtocolStatus.SUCCESS:
pstatus = output(url, fit.datum, content, status, CrawlDatum.STATUS_FETCH_SUCCESS, fit.outlinkDepth);
updateStatus(content.getContent().length);
if (pstatus != null && pstatus.isSuccess() && pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
String newUrl = pstatus.getMessage();
int refreshTime = Integer.valueOf(pstatus.getArgs()[1]);
Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, refreshTime < Fetcher.PERM_REFRESH_TIME, Fetcher.CONTENT_REDIR);
if (redirUrl != null) {
fit = queueRedirect(redirUrl, fit);
}
}
break;
// redirect
case ProtocolStatus.MOVED:
case ProtocolStatus.TEMP_MOVED:
int code;
boolean temp;
if (status.getCode() == ProtocolStatus.MOVED) {
code = CrawlDatum.STATUS_FETCH_REDIR_PERM;
temp = false;
} else {
code = CrawlDatum.STATUS_FETCH_REDIR_TEMP;
temp = true;
}
output(url, fit.datum, content, status, code);
String newUrl = status.getMessage();
Text redirUrl = handleRedirect(url, fit.datum, urlString, newUrl, temp, Fetcher.PROTOCOL_REDIR);
if (redirUrl != null) {
fit = queueRedirect(redirUrl, fit);
} else {
// stop redirecting
redirecting = false;
}
break;
case ProtocolStatus.EXCEPTION:
logError(url, status.getMessage());
int killedURLs = ((FetchItemQueues) fetchQueues).checkExceptionThreshold(fit.getQueueID());
if (killedURLs != 0)
context.getCounter("FetcherStatus", "AboveExceptionThresholdInQueue").increment(killedURLs);
// retry
case ProtocolStatus.RETRY:
case ProtocolStatus.BLOCKED:
output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
break;
// gone
case ProtocolStatus.GONE:
case ProtocolStatus.NOTFOUND:
case ProtocolStatus.ACCESS_DENIED:
case ProtocolStatus.ROBOTS_DENIED:
output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_GONE);
break;
case ProtocolStatus.NOTMODIFIED:
output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_NOTMODIFIED);
break;
default:
if (LOG.isWarnEnabled()) {
LOG.warn(getName() + " " + Thread.currentThread().getId() + " Unknown ProtocolStatus: " + status.getCode());
}
output(url, fit.datum, null, status, CrawlDatum.STATUS_FETCH_RETRY);
}
if (redirecting && redirectCount > maxRedirect) {
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
if (LOG.isInfoEnabled()) {
LOG.info(getName() + " " + Thread.currentThread().getId() + " - redirect count exceeded " + url);
}
output(url, fit.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, CrawlDatum.STATUS_FETCH_GONE);
}
} while (redirecting && (redirectCount <= maxRedirect));
} catch (Throwable t) {
// unexpected exception
// unblock
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
logError(url, StringUtils.stringifyException(t));
output(url, fit.datum, null, ProtocolStatus.STATUS_FAILED, CrawlDatum.STATUS_FETCH_RETRY);
}
}
} catch (Throwable e) {
if (LOG.isErrorEnabled()) {
LOG.error("fetcher caught:" + e.toString());
}
} finally {
if (fit != null)
((FetchItemQueues) fetchQueues).finishFetchItem(fit);
// count threads
activeThreads.decrementAndGet();
LOG.info(getName() + " " + Thread.currentThread().getId() + " -finishing thread " + getName() + ", activeThreads=" + activeThreads);
}
}
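The long switch in the middle of run() boils down to a mapping from ProtocolStatus codes to CrawlDatum fetch statuses (WOULDBLOCK is the exception: the item is simply re-queued rather than output). A condensed, illustrative helper capturing that mapping, not part of FetcherThread, would be:

// Condensed, illustrative mapping extracted from the switch above; WOULDBLOCK
// is omitted because the fetch item is re-queued instead of being output.
private static byte mapProtocolStatus(int protocolStatusCode) {
  switch (protocolStatusCode) {
    case ProtocolStatus.SUCCESS:
      return CrawlDatum.STATUS_FETCH_SUCCESS;
    case ProtocolStatus.MOVED:
      return CrawlDatum.STATUS_FETCH_REDIR_PERM;
    case ProtocolStatus.TEMP_MOVED:
      return CrawlDatum.STATUS_FETCH_REDIR_TEMP;
    case ProtocolStatus.EXCEPTION:
    case ProtocolStatus.RETRY:
    case ProtocolStatus.BLOCKED:
      return CrawlDatum.STATUS_FETCH_RETRY;
    case ProtocolStatus.GONE:
    case ProtocolStatus.NOTFOUND:
    case ProtocolStatus.ACCESS_DENIED:
    case ProtocolStatus.ROBOTS_DENIED:
      return CrawlDatum.STATUS_FETCH_GONE;
    case ProtocolStatus.NOTMODIFIED:
      return CrawlDatum.STATUS_FETCH_NOTMODIFIED;
    default:
      return CrawlDatum.STATUS_FETCH_RETRY; // unknown code: retry later
  }
}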
Use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.
The class ParserChecker, method run.
public int run(String[] args) throws Exception {
boolean dumpText = false;
boolean force = false;
String contentType = null;
String url = null;
String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] [-md key=value] url";
if (args.length == 0) {
LOG.error(usage);
return (-1);
}
// used to simulate the metadata propagated from injection
HashMap<String, String> metadata = new HashMap<>();
for (int i = 0; i < args.length; i++) {
if (args[i].equals("-forceAs")) {
force = true;
contentType = args[++i];
} else if (args[i].equals("-dumpText")) {
dumpText = true;
} else if (args[i].equals("-md")) {
String k = null, v = null;
String nextOne = args[++i];
int firstEquals = nextOne.indexOf("=");
if (firstEquals != -1) {
k = nextOne.substring(0, firstEquals);
v = nextOne.substring(firstEquals + 1);
} else
k = nextOne;
metadata.put(k, v);
} else if (i != args.length - 1) {
LOG.error(usage);
System.exit(-1);
} else {
url = URLUtil.toASCII(args[i]);
}
}
if (LOG.isInfoEnabled()) {
LOG.info("fetching: " + url);
}
CrawlDatum cd = new CrawlDatum();
Iterator<String> iter = metadata.keySet().iterator();
while (iter.hasNext()) {
String key = iter.next();
String value = metadata.get(key);
if (value == null)
value = "";
cd.getMetaData().put(new Text(key), new Text(value));
}
ProtocolFactory factory = new ProtocolFactory(conf);
Protocol protocol = factory.getProtocol(url);
Text turl = new Text(url);
ProtocolOutput output = protocol.getProtocolOutput(turl, cd);
// if the configuration permits, handle redirects until we either run
// out of allowed redirects or we stop getting redirect statuses.
int maxRedirects = conf.getInt("http.redirect.max", 0);
int numRedirects = 0;
while (output.getStatus().isRedirect() && numRedirects < maxRedirects) {
String newURL = URLUtil.toASCII(output.getStatus().getArgs()[0]);
LOG.info("Handling redirect to " + newURL);
protocol = factory.getProtocol(newURL);
turl = new Text(newURL);
output = protocol.getProtocolOutput(turl, cd);
numRedirects++;
}
if (!output.getStatus().isSuccess()) {
System.err.println("Fetch failed with protocol status: " + output.getStatus());
if (output.getStatus().isRedirect()) {
System.err.println("Redirect(s) not handled due to configuration.");
System.err.println("Max Redirects to handle per config: " + maxRedirects);
System.err.println("Number of Redirects handled: " + numRedirects);
}
return (-1);
}
Content content = output.getContent();
if (content == null) {
LOG.error("No content for " + url);
return (-1);
}
if (force) {
content.setContentType(contentType);
} else {
contentType = content.getContentType();
}
if (contentType == null) {
LOG.error("Failed to determine content type!");
return (-1);
}
if (ParseSegment.isTruncated(content)) {
LOG.warn("Content is truncated, parse may fail!");
}
ScoringFilters scfilters = new ScoringFilters(conf);
// call the scoring filters
try {
scfilters.passScoreBeforeParsing(turl, cd, content);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Couldn't pass score before parsing, url " + turl + " (" + e + ")");
LOG.warn(StringUtils.stringifyException(e));
}
}
ParseResult parseResult = new ParseUtil(conf).parse(content);
if (parseResult == null) {
LOG.error("Parsing content failed!");
return (-1);
}
// calculate the signature
byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parseResult.get(new Text(url)));
if (LOG.isInfoEnabled()) {
LOG.info("parsing: " + url);
LOG.info("contentType: " + contentType);
LOG.info("signature: " + StringUtil.toHexString(signature));
}
Parse parse = parseResult.get(turl);
if (parse == null) {
LOG.error("Failed to get parse from parse result");
LOG.error("Available parses in parse result (by URL key):");
for (Map.Entry<Text, Parse> entry : parseResult) {
LOG.error(" " + entry.getKey());
}
LOG.error("Parse result does not contain a parse for URL to be checked:");
LOG.error(" " + turl);
return -1;
}
// call the scoring filters
try {
scfilters.passScoreAfterParsing(turl, content, parse);
} catch (Exception e) {
if (LOG.isWarnEnabled()) {
LOG.warn("Couldn't pass score after parsing, url " + turl + " (" + e + ")");
LOG.warn(StringUtils.stringifyException(e));
}
}
for (Map.Entry<Text, Parse> entry : parseResult) {
parse = entry.getValue();
LOG.info("---------\nUrl\n---------------\n");
System.out.print(entry.getKey());
LOG.info("\n---------\nParseData\n---------\n");
System.out.print(parse.getData().toString());
if (dumpText) {
LOG.info("---------\nParseText\n---------\n");
System.out.print(parse.getText());
}
}
return 0;
}
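Since run(String[]) and getConf() follow Hadoop's Tool contract, the checker is normally launched through ToolRunner. A minimal driver sketch, assuming ParserChecker implements Tool and lives in org.apache.nutch.parse (the driver class name and argument values are hypothetical):

import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.parse.ParserChecker;
import org.apache.nutch.util.NutchConfiguration;

public class ParserCheckerDriver {
  public static void main(String[] args) throws Exception {
    // example arguments; any URL accepted by ParserChecker works here
    String[] checkerArgs = { "-dumpText", "http://example.com/" };
    int result = ToolRunner.run(NutchConfiguration.create(), new ParserChecker(), checkerArgs);
    System.exit(result);
  }
}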