Use of org.apache.nutch.protocol.ProtocolStatus in the Apache Nutch project.
Class HttpBase, method getProtocolOutput.
/**
 * Issues a single request for {@code url} and translates the response code
 * into a {@link ProtocolOutput}.
 *
 * <p>
 * The response code is always recorded in the datum's metadata under
 * {@code Nutch.PROTOCOL_STATUS_CODE_KEY}; the elapsed request time is recorded
 * under {@code RESPONSE_TIME} only when {@code responseTime} is set. Redirects
 * are not followed here: the resolved target is reported in the returned
 * status so that the caller can decide what to do with it.
 *
 * @param url
 *          Text containing the url to fetch
 * @param datum
 *          The CrawlDatum object corresponding to the url; its metadata is
 *          updated as described above
 *
 * @return {@link ProtocolOutput} carrying the fetched content and a status
 *         derived from the response code, or a {@code null} content with a
 *         status wrapping the failure if anything is thrown while fetching
 */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
  final String urlString = url.toString();
  try {
    URL target = new URL(urlString);

    // Time only the request itself.
    final long requestStart = System.currentTimeMillis();
    final Response response = getResponse(target, datum, false);
    if (this.responseTime) {
      final int elapsedMillis = (int) (System.currentTimeMillis() - requestStart);
      datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedMillis));
    }

    final int status = response.getCode();
    datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text(String.valueOf(status)));

    // A missing body is replaced by the shared empty-content constant.
    byte[] body = response.getContent();
    if (body == null) {
      body = EMPTY_CONTENT;
    }
    final String fetchedUrl = target.toString();
    final Content fetched = new Content(fetchedUrl, fetchedUrl, body,
        response.getHeader("Content-Type"), response.getHeaders(), this.conf);

    // Success: plain output, no explicit status.
    if (status == 200) {
      return new ProtocolOutput(fetched);
    }

    // 3xx: resolve the target and report it; following it is left to the caller.
    if (status >= 300 && status < 400) {
      String location = response.getHeader("Location");
      if (location == null) {
        // some broken servers, such as MS IIS, use a lowercase header name
        location = response.getHeader("location");
      }
      if (location == null) {
        location = "";
      }
      target = new URL(target, location);

      final int redirectStatus;
      switch (status) {
        case 302: // found (temporarily moved)
        case 303: // see other (redirect after POST)
        case 307: // temporary redirect
          redirectStatus = ProtocolStatus.TEMP_MOVED;
          break;
        case 304: // not modified
          redirectStatus = ProtocolStatus.NOTMODIFIED;
          break;
        default:
          // 300 (multiple choices), 301 (moved permanently), 305 (use proxy)
          // and every other 3xx code
          redirectStatus = ProtocolStatus.MOVED;
          break;
      }
      return new ProtocolOutput(fetched, new ProtocolStatus(redirectStatus, target));
    }

    // Everything else: 4xx codes with a dedicated status, then the catch-all.
    switch (status) {
      case 400:
        // bad request, mark as GONE
        if (logger.isTraceEnabled()) {
          logger.trace("400 Bad request: " + target);
        }
        return new ProtocolOutput(fetched, new ProtocolStatus(ProtocolStatus.GONE, target));
      case 401:
        // the server asked for authentication
        if (logger.isTraceEnabled()) {
          logger.trace("401 Authentication Required");
        }
        return new ProtocolOutput(fetched,
            new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: " + urlString));
      case 404:
        return new ProtocolOutput(fetched, new ProtocolStatus(ProtocolStatus.NOTFOUND, target));
      case 410:
        // permanently GONE
        return new ProtocolOutput(fetched,
            new ProtocolStatus(ProtocolStatus.GONE, "Http: " + status + " url=" + target));
      default:
        return new ProtocolOutput(fetched,
            new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + status + ", url=" + target));
    }
  } catch (Throwable e) {
    // Boundary catch: any failure is logged and reported through the status.
    logger.error("Failed to get protocol output", e);
    return new ProtocolOutput(null, new ProtocolStatus(e));
  }
}
Use of org.apache.nutch.protocol.ProtocolStatus in the Apache Nutch project.
Class File, method getProtocolOutput.
/**
 * Creates a {@link FileResponse} object corresponding to the url and returns a
 * {@link ProtocolOutput} object as per the content received.
 *
 * <p>
 * Response codes 200, 304, 401 and 404 are returned immediately with the
 * matching status. Any other 3xx code is treated as a redirect: it is either
 * reported to the caller (when {@code symlinksAsRedirects} is set) or followed
 * in a loop until {@code MAX_REDIRECTS} is reached. Every other code is
 * converted to a {@code FileError}.
 *
 * @param url
 *          Text containing the url
 * @param datum
 *          The CrawlDatum object corresponding to the url
 *
 * @return {@link ProtocolOutput} object for the content of the file indicated
 *         by url; if an exception is raised while fetching, it is logged and
 *         returned wrapped in a {@link ProtocolStatus} with {@code null}
 *         content
 */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
  String urlString = url.toString();
  try {
    URL u = new URL(urlString);
    int redirects = 0;
    while (true) {
      // make a request
      FileResponse response = new FileResponse(u, datum, this, getConf());
      int code = response.getCode();
      if (code == 200) {
        // got a good response, return it
        return new ProtocolOutput(response.toContent());
      } else if (code == 304) {
        // got not modified
        return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTMODIFIED);
      } else if (code == 401) {
        // access denied / no read permissions
        return new ProtocolOutput(response.toContent(), new ProtocolStatus(ProtocolStatus.ACCESS_DENIED));
      } else if (code == 404) {
        // no such file
        return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTFOUND);
      } else if (code >= 300 && code < 400) {
        // handle redirect: the Location header must hold an absolute URL here,
        // otherwise the URL constructor throws and the catch below reports it
        u = new URL(response.getHeader("Location"));
        LOG.trace("redirect to {}", u);
        if (symlinksAsRedirects) {
          // report the redirect instead of following it
          return new ProtocolOutput(response.toContent(), new ProtocolStatus(ProtocolStatus.MOVED, u));
        } else if (redirects == MAX_REDIRECTS) {
          LOG.trace("Too many redirects: {}", url);
          return new ProtocolOutput(response.toContent(), new ProtocolStatus(ProtocolStatus.REDIR_EXCEEDED, u));
        }
        // follow the redirect with another request
        redirects++;
      } else {
        // convert to exception
        throw new FileError(code);
      }
    }
  } catch (Exception e) {
    // Report through the logger (with stack trace) rather than dumping the
    // trace to stderr, so the failure ends up in the regular log output.
    LOG.error("Could not get protocol output for " + urlString, e);
    return new ProtocolOutput(null, new ProtocolStatus(e));
  }
}
Use of org.apache.nutch.protocol.ProtocolStatus in the Apache Nutch project.
Class Ftp, method getProtocolOutput.
/**
 * Builds an {@link FtpResponse} for the given url and turns it into a
 * {@link ProtocolOutput}, following redirects until a final response is
 * obtained.
 *
 * <p>
 * The response code of every request is stored in the datum's metadata under
 * {@code Nutch.PROTOCOL_STATUS_CODE_KEY}. A 200 response is returned as is;
 * a 3xx response is followed (at most {@code MAX_REDIRECTS} times); any other
 * code is raised as an {@code FtpError}.
 *
 * @param url
 *          Text containing the ftp url
 * @param datum
 *          The CrawlDatum object corresponding to the url
 *
 * @return {@link ProtocolOutput} object for the url; on failure the content
 *         is {@code null} and the status wraps the exception
 */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
  final String urlString = url.toString();
  try {
    URL current = new URL(urlString);
    // "followed" counts the redirects taken so far.
    for (int followed = 0;; followed++) {
      // make a request
      final FtpResponse response = new FtpResponse(current, datum, this, getConf());
      final int status = response.getCode();
      datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text(String.valueOf(status)));

      if (status == 200) {
        // final answer, hand it back
        return new ProtocolOutput(response.toContent());
      }
      if (status < 300 || status >= 400) {
        // neither success nor redirect: convert to exception
        throw new FtpError(status);
      }

      // From here on the response is a redirect.
      if (followed == MAX_REDIRECTS) {
        throw new FtpException("Too many redirects: " + url);
      }
      final String redirectTarget = response.getHeader("Location");
      try {
        // resolve the target relative to the URL just requested
        current = new URL(current, redirectTarget);
      } catch (MalformedURLException badTarget) {
        LOG.error("Could not create redirectURL for {} with {}", url, redirectTarget);
        return new ProtocolOutput(null, new ProtocolStatus(badTarget));
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace("redirect to " + current);
      }
    }
  } catch (Exception e) {
    LOG.error("Could not get protocol output for {}: {}", url, e.getMessage());
    return new ProtocolOutput(null, new ProtocolStatus(e));
  }
}
Aggregations