use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.
the class HttpBase method main.
/**
 * Command-line driver: parses the arguments, fetches the given url through
 * {@code http} and prints the protocol status and, when content was
 * returned, its content type, content length header and body to stdout.
 *
 * <p>Accepted arguments: {@code [-verbose] [-timeout N] url}. The url must be
 * the last argument. {@code N} is multiplied by 1000 before being stored in
 * {@code http.timeout} (presumably seconds converted to milliseconds).
 * {@code -verbose} is accepted but has no effect in this method.
 *
 * <p>On any malformed command line (no arguments, {@code -timeout} without a
 * value, a non-option argument that is not last, or no url at all) the usage
 * string is printed to stderr and the JVM exits with status -1.
 *
 * @param http
 *          protocol implementation used to fetch the url
 * @param args
 *          command-line arguments
 * @throws Exception
 *           if parsing the timeout value or fetching the url fails
 */
protected static void main(HttpBase http, String[] args) throws Exception {
  String url = null;
  String usage = "Usage: Http [-verbose] [-timeout N] url";
  if (args.length == 0) {
    System.err.println(usage);
    System.exit(-1);
  }
  for (int i = 0; i < args.length; i++) {
    // parse command line
    if (args[i].equals("-timeout")) {
      // found -timeout option; it needs a value after it, otherwise
      // args[++i] would run past the end of the array
      if (i + 1 >= args.length) {
        System.err.println(usage);
        System.exit(-1);
      }
      http.timeout = Integer.parseInt(args[++i]) * 1000;
    } else if (args[i].equals("-verbose")) {
      // found -verbose option: accepted, nothing to configure here
    } else if (i != args.length - 1) {
      // a non-option argument is only allowed in the last position
      System.err.println(usage);
      System.exit(-1);
    } else {
      // url is the required, final parameter
      url = args[i];
    }
  }
  if (url == null) {
    // the last argument was an option (or an option value), so no url was
    // given; fail with the usage message instead of passing null on
    System.err.println(usage);
    System.exit(-1);
  }
  ProtocolOutput out = http.getProtocolOutput(new Text(url), new CrawlDatum());
  Content content = out.getContent();
  System.out.println("Status: " + out.getStatus());
  if (content != null) {
    System.out.println("Content Type: " + content.getContentType());
    System.out.println("Content Length: " + content.getMetadata().get(Response.CONTENT_LENGTH));
    System.out.println("Content:");
    String text = new String(content.getContent());
    System.out.println(text);
  }
}
use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.
the class HttpBase method getProtocolOutput.
/**
 * Fetches {@code url} via {@code getResponse} and wraps the result in a
 * {@link ProtocolOutput} whose status is derived from the HTTP response code.
 *
 * <p>The response code is always recorded in the datum's metadata under
 * {@code Nutch.PROTOCOL_STATUS_CODE_KEY}; the elapsed request time is recorded
 * under {@code RESPONSE_TIME} when {@code responseTime} is set. Any
 * {@link Throwable} raised while fetching is logged and returned as a
 * {@link ProtocolStatus} wrapping it, with {@code null} content.
 *
 * @param url
 *          Text containing the url to fetch
 * @param datum
 *          The CrawlDatum object corresponding to the url
 * @return the fetched content together with the mapped protocol status
 */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
  String urlString = url.toString();
  try {
    URL u = new URL(urlString);

    // issue the request, timing it
    long requestStart = System.currentTimeMillis();
    Response response = getResponse(u, datum, false);
    if (this.responseTime) {
      int millis = (int) (System.currentTimeMillis() - requestStart);
      datum.getMetaData().put(RESPONSE_TIME, new IntWritable(millis));
    }

    int code = response.getCode();
    datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text(Integer.toString(code)));

    // a missing body is replaced by the shared empty array
    byte[] body = response.getContent();
    if (body == null) {
      body = EMPTY_CONTENT;
    }
    Content c = new Content(u.toString(), u.toString(), body, response.getHeader("Content-Type"), response.getHeaders(), this.conf);

    if (code >= 300 && code < 400) {
      // redirect: resolve the target, the higher layer decides what to do
      String location = response.getHeader("Location");
      if (location == null) {
        // some broken servers, such as MS IIS, use lowercase header name...
        location = response.getHeader("location");
      }
      if (location == null) {
        location = "";
      }
      URL target = new URL(u, location);

      int redirectStatus;
      if (code == 302 || code == 303 || code == 307) {
        // found / see other / temporary redirect
        redirectStatus = ProtocolStatus.TEMP_MOVED;
      } else if (code == 304) {
        // not modified
        redirectStatus = ProtocolStatus.NOTMODIFIED;
      } else {
        // 300, 301, 305 and every other 3xx code
        redirectStatus = ProtocolStatus.MOVED;
      }
      return new ProtocolOutput(c, new ProtocolStatus(redirectStatus, target));
    }

    switch (code) {
    case 200:
      // success: content only
      return new ProtocolOutput(c);
    case 400:
      // bad request, mark as GONE
      if (logger.isTraceEnabled()) {
        logger.trace("400 Bad request: " + u);
      }
      return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
    case 401:
      // authentication required: reported as access denied
      if (logger.isTraceEnabled()) {
        logger.trace("401 Authentication Required");
      }
      return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: " + urlString));
    case 404:
      return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
    case 410:
      // permanently GONE
      return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + u));
    default:
      // every other code is reported as an exception status
      return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u));
    }
  } catch (Throwable e) {
    logger.error("Failed to get protocol output", e);
    return new ProtocolOutput(null, new ProtocolStatus(e));
  }
}
use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.
the class File method getProtocolOutput.
/**
 * Creates a {@link FileResponse} object corresponding to the url and returns a
 * {@link ProtocolOutput} object as per the content received.
 *
 * <p>Response codes are mapped as follows: 200 returns the content alone; 304,
 * 401 and 404 return the content with a not-modified, access-denied or
 * not-found status; any other 3xx code is treated as a redirect to the
 * {@code Location} header. A redirect is either reported as
 * {@link ProtocolStatus#MOVED} (when {@code symlinksAsRedirects} is set) or
 * followed, up to {@code MAX_REDIRECTS} times. Every other code raises a
 * {@link FileError}, which, like any other exception, is logged and returned
 * as a {@link ProtocolStatus} wrapping it, with {@code null} content.
 *
 * @param url
 *          Text containing the url
 * @param datum
 *          The CrawlDatum object corresponding to the url
 *
 * @return {@link ProtocolOutput} object for the content of the file indicated
 *         by url
 */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
  String urlString = url.toString();
  try {
    URL u = new URL(urlString);
    int redirects = 0;
    while (true) {
      // make a request
      FileResponse response = new FileResponse(u, datum, this, getConf());
      int code = response.getCode();
      if (code == 200) {
        // got a good response, return it
        return new ProtocolOutput(response.toContent());
      } else if (code == 304) {
        // got not modified
        return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTMODIFIED);
      } else if (code == 401) {
        // access denied / no read permissions
        return new ProtocolOutput(response.toContent(), new ProtocolStatus(ProtocolStatus.ACCESS_DENIED));
      } else if (code == 404) {
        // no such file
        return new ProtocolOutput(response.toContent(), ProtocolStatus.STATUS_NOTFOUND);
      } else if (code >= 300 && code < 400) {
        // handle redirect
        u = new URL(response.getHeader("Location"));
        if (LOG.isTraceEnabled()) {
          LOG.trace("redirect to " + u);
        }
        if (symlinksAsRedirects) {
          // report the redirect instead of following it
          return new ProtocolOutput(response.toContent(), new ProtocolStatus(ProtocolStatus.MOVED, u));
        } else if (redirects == MAX_REDIRECTS) {
          LOG.trace("Too many redirects: {}", url);
          return new ProtocolOutput(response.toContent(), new ProtocolStatus(ProtocolStatus.REDIR_EXCEEDED, u));
        }
        // follow the redirect on the next loop iteration
        redirects++;
      } else {
        // convert to exception
        throw new FileError(code);
      }
    }
  } catch (Exception e) {
    // report through the class logger rather than dumping the stack trace
    // on stderr; the exception is passed last so its trace is kept
    LOG.error("Could not get protocol output for {}", url, e);
    return new ProtocolOutput(null, new ProtocolStatus(e));
  }
}
use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.
the class TestProtocolFile method setContentType.
/**
 * Tests the setting of the <code>Response.CONTENT_TYPE</code> metadata field.
 *
 * <p>Fetches the given sample file through the protocol resolved for its
 * <code>file:</code> url and checks that the fetch succeeded and that both the
 * content type of the returned content and its
 * <code>Response.CONTENT_TYPE</code> metadata entry equal
 * <code>expectedMimeType</code>.
 *
 * @param testTextFile
 *          name of the file below <code>sampleDir</code> to fetch
 * @throws ProtocolException
 *           declared for the protocol lookup and fetch
 *
 * @since NUTCH-384
 */
public void setContentType(String testTextFile) throws ProtocolException {
  String urlString = "file:" + sampleDir + fileSeparator + testTextFile;
  Assert.assertNotNull(urlString);

  Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
  ProtocolOutput output = protocol.getProtocolOutput(new Text(urlString), datum);
  Assert.assertNotNull(output);

  // the fetch itself must have succeeded
  ProtocolStatus status = output.getStatus();
  String failureMessage = "Status code: [" + status.getCode() + "], not equal to: [" + ProtocolStatus.SUCCESS + "]: args: [" + status.getArgs() + "]";
  Assert.assertEquals(failureMessage, ProtocolStatus.SUCCESS, status.getCode());

  // content type reported by the content object
  Assert.assertNotNull(output.getContent());
  Assert.assertNotNull(output.getContent().getContentType());
  Assert.assertEquals(expectedMimeType, output.getContent().getContentType());

  // content type recorded in the metadata
  Assert.assertNotNull(output.getContent().getMetadata());
  Assert.assertEquals(expectedMimeType, output.getContent().getMetadata().get(Response.CONTENT_TYPE));
}
use of org.apache.nutch.protocol.ProtocolOutput in project nutch by apache.
the class Ftp method getProtocolOutput.
/**
 * Builds an {@link FtpResponse} for the url and converts it into a
 * {@link ProtocolOutput}.
 *
 * <p>The response code of every request is stored in the datum's metadata
 * under {@code Nutch.PROTOCOL_STATUS_CODE_KEY}. A 200 response returns the
 * content; a 3xx response is followed via its {@code Location} header, at
 * most {@code MAX_REDIRECTS} times; any other code raises an
 * {@link FtpError}. Exceptions are logged and returned as a
 * {@link ProtocolStatus} wrapping them, with {@code null} content.
 *
 * @param url
 *          Text containing the ftp url
 * @param datum
 *          The CrawlDatum object corresponding to the url
 *
 * @return {@link ProtocolOutput} object for the url
 */
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
  String urlString = url.toString();
  try {
    URL u = new URL(urlString);
    // the counter only advances when a redirect is followed: every other
    // path through the loop body returns or throws
    for (int redirects = 0;; redirects++) {
      FtpResponse response = new FtpResponse(u, datum, this, getConf());
      int code = response.getCode();
      datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text(Integer.toString(code)));

      if (code == 200) {
        // got the content, return it
        return new ProtocolOutput(response.toContent());
      }
      if (code < 300 || code >= 400) {
        // neither success nor redirect: convert to exception
        throw new FtpError(code);
      }

      // redirect handling
      if (redirects == MAX_REDIRECTS) {
        throw new FtpException("Too many redirects: " + url);
      }
      String loc = response.getHeader("Location");
      try {
        u = new URL(u, loc);
      } catch (MalformedURLException mue) {
        LOG.error("Could not create redirectURL for {} with {}", url, loc);
        return new ProtocolOutput(null, new ProtocolStatus(mue));
      }
      if (LOG.isTraceEnabled()) {
        LOG.trace("redirect to " + u);
      }
    }
  } catch (Exception e) {
    LOG.error("Could not get protocol output for {}: {}", url, e.getMessage());
    return new ProtocolOutput(null, new ProtocolStatus(e));
  }
}
Aggregations