use of org.apache.nutch.net.protocols.Response in project nutch by apache.
the class TestProtocolHttp method fetchPage.
/**
 * Requests <code>page</code> from the HTTP server listening on 127.0.0.1 at
 * the test's port and verifies the HTTP status code of the response. The page
 * is fetched a second time through <code>getProtocolOutput</code>; unless the
 * page is one of <code>/nonexists.html</code>, <code>/brokenpage.jsp</code> or
 * <code>/redirection</code>, the content type of that second fetch must be
 * <code>text/html</code>.
 *
 * @param page
 *          Path of the page to be fetched.
 * @param expectedCode
 *          HTTP response status code expected while fetching the page.
 * @throws Exception
 *           If the URL cannot be built or the fetch fails.
 */
private void fetchPage(String page, int expectedCode) throws Exception {
  final URL url = new URL("http", "127.0.0.1", port, page);
  final CrawlDatum crawlDatum = new CrawlDatum();

  // low-level response (status code) and high-level protocol output (content)
  final Response response = http.getResponse(url, crawlDatum, true);
  final ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()), crawlDatum);
  final Content content = out.getContent();

  assertEquals("HTTP Status Code for " + url, expectedCode, response.getCode());

  // the content type is not checked for these three pages
  final boolean skipContentTypeCheck = page.equals("/nonexists.html")
      || page.equals("/brokenpage.jsp")
      || page.equals("/redirection");
  if (!skipContentTypeCheck) {
    assertEquals("ContentType " + url, "text/html", content.getContentType());
  }
}
use of org.apache.nutch.net.protocols.Response in project nutch by apache.
the class TestProtocolHttpClient method fetchPage.
/**
 * Requests <code>page</code> from the HTTP server listening on 127.0.0.1 at
 * the test's port and asserts that the HTTP status code of the response
 * equals <code>expectedCode</code>.
 *
 * @param page
 *          Path of the page to be fetched.
 * @param expectedCode
 *          HTTP response status code expected while fetching the page.
 * @throws Exception
 *           When an error occurs or test case fails.
 */
private void fetchPage(String page, int expectedCode) throws Exception {
  final URL url = new URL("http", "127.0.0.1", port, page);
  final Response response = http.getResponse(url, new CrawlDatum(), true);
  Assert.assertEquals("HTTP Status Code for " + url, expectedCode, response.getCode());
}
use of org.apache.nutch.net.protocols.Response in project nutch by apache.
the class HttpRobotRulesParser method getRobotRulesSet.
/**
 * Get the rules from robots.txt which apply to the given {@code url}.
 * Robot rules are cached per cache key (see {@code getCacheKey(URL)}). If no
 * rules are found in the cache, an HTTP request is sent to fetch
 * {@code /robots.txt} from the host of {@code url}. The robots.txt is then
 * parsed and the rules are cached to avoid re-fetching and re-parsing it.
 *
 * <p>
 * One level of redirection (HTTP 301 or 302) is followed. The redirect target
 * is resolved against the robots.txt URL that was requested, so both absolute
 * and relative {@code Location} values are handled.
 * </p>
 *
 * <p>
 * Mapping of the final response: 200 &rarr; parsed rules; 403 &rarr; forbid
 * all (unless {@code allowForbidden} is set); 5xx &rarr; not cached, and
 * either defer-visit rules (if {@code deferVisits503} is set) or empty rules;
 * anything else &rarr; empty rules. If fetching or parsing throws, empty
 * rules are returned and nothing is cached.
 * </p>
 *
 * @param http
 *          The {@link Protocol} object; it is cast to {@link HttpBase} to
 *          issue the request
 * @param url
 *          URL for which the robots.txt rules are requested
 * @param robotsTxtContent
 *          container to store responses when fetching the robots.txt file for
 *          debugging or archival purposes. Instead of a robots.txt file, it
 *          may include redirects or an error page (404, etc.). Response
 *          {@link Content} is appended to the passed list. If null is passed
 *          nothing is stored.
 *
 * @return robotRules A {@link BaseRobotRules} object for the rules
 */
@Override
public BaseRobotRules getRobotRulesSet(Protocol http, URL url, List<Content> robotsTxtContent) {
  if (LOG.isTraceEnabled() && isAllowListed(url)) {
    LOG.trace("Ignoring robots.txt (host is allowlisted) for URL: {}", url);
  }

  String cacheKey = getCacheKey(url);
  BaseRobotRules robotRules = CACHE.get(cacheKey);
  if (robotRules != null) {
    // cached rule
    return robotRules;
  } else if (LOG.isTraceEnabled()) {
    LOG.trace("cache miss {}", url);
  }

  boolean cacheRule = true;
  URL redir = null;

  if (isAllowListed(url)) {
    // check in advance whether a host is allowlisted
    // (we do not need to fetch robots.txt)
    robotRules = EMPTY_RULES;
    LOG.info("Allowlisted host found for: {}", url);
    LOG.info("Ignoring robots.txt for all URLs from allowlisted host: {}", url.getHost());
  } else {
    try {
      URL robotsUrl = new URL(url, "/robots.txt");
      Response response = ((HttpBase) http).getResponse(robotsUrl, new CrawlDatum(), true);
      if (robotsTxtContent != null) {
        addRobotsContent(robotsTxtContent, robotsUrl, response);
      }

      // try one level of redirection
      int code = response.getCode();
      if (code == 301 || code == 302) {
        String redirection = response.getHeader("Location");
        if (redirection == null) {
          // some versions of MS IIS are known to mangle this header
          redirection = response.getHeader("location");
        }
        if (redirection != null) {
          // The RFC says the location should be absolute, but in practice it
          // often is not. Resolve it against the robots.txt URL which was
          // actually requested (not against the page URL): the two-argument
          // URL constructor returns an absolute location unchanged and
          // resolves a relative one against the context URL.
          redir = new URL(robotsUrl, redirection);
          response = ((HttpBase) http).getResponse(redir, new CrawlDatum(), true);
          if (robotsTxtContent != null) {
            addRobotsContent(robotsTxtContent, redir, response);
          }
          code = response.getCode();
        }
      }

      if (code == 200) {
        // found rules: parse them
        robotRules = parseRules(url.toString(), response.getContent(), response.getHeader("Content-Type"), agentNames);
      } else if ((code == 403) && (!allowForbidden)) {
        // use forbid all
        robotRules = FORBID_ALL_RULES;
      } else if (code >= 500) {
        // try again later to fetch robots.txt
        cacheRule = false;
        if (deferVisits503) {
          // signal fetcher to suspend crawling for this host
          robotRules = DEFER_VISIT_RULES;
        } else {
          robotRules = EMPTY_RULES;
        }
      } else {
        // use default rules
        robotRules = EMPTY_RULES;
      }
    } catch (Throwable t) {
      LOG.info("Couldn't get robots.txt for {}: {}", url, t.toString());
      // try again later to fetch robots.txt
      cacheRule = false;
      robotRules = EMPTY_RULES;
    }
  }

  if (cacheRule) {
    // cache rules for host
    CACHE.put(cacheKey, robotRules);
    if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost()) && "/robots.txt".equals(redir.getFile())) {
      // cache also for the redirected host
      // if the URL path is /robots.txt
      CACHE.put(getCacheKey(redir), robotRules);
    }
  }
  return robotRules;
}
use of org.apache.nutch.net.protocols.Response in project nutch by apache.
the class OkHttp method setConf.
/**
 * Applies the configuration via the superclass and builds the OkHttp
 * {@code client}.
 *
 * <p>
 * The client is set up with: the preferred protocols (HTTP/2 first if
 * {@code useHttp2} is set, then HTTP/1.1), retry on connection failure, no
 * automatic following of redirects, and {@code timeout} (milliseconds) as
 * connect, write and read timeout. If {@code tlsCheckCertificate} is not set,
 * the trust-all socket factory is installed and every host name is accepted.
 * Non-empty Accept, Accept-Language and Accept-Charset values are appended to
 * the custom request headers. If a proxy is used, it is installed either via
 * a {@link ProxySelector} (no proxy user name configured) or directly with a
 * proxy authenticator (proxy user name configured).
 * </p>
 *
 * @param conf
 *          configuration; the properties {@code http.proxy.username} and
 *          {@code http.proxy.password} are read here directly
 */
@Override
public void setConf(Configuration conf) {
  super.setConf(conf);

  // protocols in order of preference
  List<okhttp3.Protocol> preferredProtocols = new ArrayList<>();
  if (useHttp2) {
    preferredProtocols.add(okhttp3.Protocol.HTTP_2);
  }
  preferredProtocols.add(okhttp3.Protocol.HTTP_1_1);

  okhttp3.OkHttpClient.Builder builder = new OkHttpClient.Builder();
  builder.protocols(preferredProtocols);
  builder.retryOnConnectionFailure(true);
  builder.followRedirects(false);
  builder.connectTimeout(timeout, TimeUnit.MILLISECONDS);
  builder.writeTimeout(timeout, TimeUnit.MILLISECONDS);
  builder.readTimeout(timeout, TimeUnit.MILLISECONDS);

  if (!tlsCheckCertificate) {
    // certificates are not checked: trust everything, accept any host name
    builder.sslSocketFactory(trustAllSslSocketFactory, (X509TrustManager) trustAllCerts[0]);
    builder.hostnameVerifier((hostname, session) -> true);
  }

  // optional content negotiation headers
  if (!accept.isEmpty()) {
    getCustomRequestHeaders().add(new String[] { "Accept", accept });
  }
  if (!acceptLanguage.isEmpty()) {
    getCustomRequestHeaders().add(new String[] { "Accept-Language", acceptLanguage });
  }
  if (!acceptCharset.isEmpty()) {
    getCustomRequestHeaders().add(new String[] { "Accept-Charset", acceptCharset });
  }

  if (useProxy) {
    Proxy proxy = new Proxy(proxyType, new InetSocketAddress(proxyHost, proxyPort));
    String proxyUsername = conf.get("http.proxy.username");
    if (proxyUsername != null) {
      /*
       * NOTE: the proxy exceptions list does NOT work with proxy
       * username/password because an okhttp3 bug
       * (https://github.com/square/okhttp/issues/3995) when using the
       * ProxySelector class with proxy auth. If a proxy username is present,
       * the configured proxy will be used for ALL requests.
       */
      if (proxyException.size() > 0) {
        LOG.warn("protocol-okhttp does not respect 'http.proxy.exception.list' setting when "
            + "'http.proxy.username' is set. This is a limitation of the current okhttp3 "
            + "implementation, see NUTCH-2636");
      }
      builder.proxy(proxy);
      String proxyPassword = conf.get("http.proxy.password");
      // answer a proxy authentication challenge with basic credentials
      Authenticator proxyAuthenticator = (route, response) -> {
        String credential = okhttp3.Credentials.basic(proxyUsername, proxyPassword);
        return response.request().newBuilder().header("Proxy-Authorization", credential).build();
      };
      builder.proxyAuthenticator(proxyAuthenticator);
    } else {
      // without proxy authentication the choice "proxy or direct" is made per URI
      final List<Proxy> directConnection = new ArrayList<>();
      directConnection.add(Proxy.NO_PROXY);
      final List<Proxy> viaProxy = new ArrayList<>();
      viaProxy.add(proxy);
      ProxySelector selector = new ProxySelector() {
        @Override
        public List<Proxy> select(URI uri) {
          return useProxy(uri) ? viaProxy : directConnection;
        }

        @Override
        public void connectFailed(URI uri, SocketAddress sa, IOException ioe) {
          LOG.error("Connection to proxy failed for {}: {}", uri, ioe);
        }
      };
      builder.proxySelector(selector);
    }
  }

  if (storeIPAddress || storeHttpHeaders || storeHttpRequest) {
    builder.addNetworkInterceptor(new HTTPHeadersInterceptor());
  }

  // enable support for Brotli compression (Content-Encoding)
  builder.addInterceptor(BrotliInterceptor.INSTANCE);

  client = builder.build();
}
use of org.apache.nutch.net.protocols.Response in project nutch by apache.
the class HttpBase method getProtocolOutput.
/**
 * Fetches {@code url} and wraps the response into a {@link ProtocolOutput}
 * whose status is derived from the HTTP status code:
 * <ul>
 * <li>200: content only</li>
 * <li>302, 303, 307: {@code TEMP_MOVED} with the redirect target</li>
 * <li>304: {@code NOTMODIFIED}</li>
 * <li>any other 3xx (300, 301, 305, ...): {@code MOVED} with the redirect
 * target</li>
 * <li>400 and 410: {@code GONE}</li>
 * <li>401: {@code ACCESS_DENIED}</li>
 * <li>404: {@code NOTFOUND}</li>
 * <li>everything else: {@code EXCEPTION}</li>
 * </ul>
 * The HTTP status code is stored in the metadata of {@code datum}, and so is
 * the elapsed request time if {@code responseTime} is set.
 *
 * @param url
 *          URL to fetch
 * @param datum
 *          crawl datum passed to the request; its metadata is updated
 * @return the protocol output; if anything is thrown while fetching, an
 *         output with {@code null} content and a status built from the
 *         throwable
 */
@Override
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
  final String urlString = url.toString();
  try {
    final URL u = new URL(urlString);

    // make a request and (optionally) record how long it took
    final long requestStart = System.currentTimeMillis();
    final Response response = getResponse(u, datum, false);
    if (this.responseTime) {
      int elapsedMillis = (int) (System.currentTimeMillis() - requestStart);
      datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedMillis));
    }

    final int code = response.getCode();
    datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text(Integer.toString(code)));

    byte[] body = response.getContent();
    if (body == null) {
      body = EMPTY_CONTENT;
    }
    final String fetchedUrl = u.toString();
    final Content c = new Content(fetchedUrl, fetchedUrl, body, response.getHeader("Content-Type"),
        response.getHeaders(), mimeTypes);

    if (code == 200) {
      // success: return the content as is
      return new ProtocolOutput(c);
    }

    if (code >= 300 && code < 400) {
      // redirect: only resolve the target, following it is left to the
      // higher layer
      String location = response.getHeader("Location");
      if (location == null) {
        // some broken servers, such as MS IIS, use lowercase header name...
        location = response.getHeader("location");
      }
      if (location == null) {
        location = "";
      }
      final URL redirectTarget = new URL(u, location);

      int protocolStatusCode;
      switch (code) {
      case 302: // found (temporarily moved)
      case 303: // see other (redirect after POST)
      case 307: // temporary redirect
        protocolStatusCode = ProtocolStatus.TEMP_MOVED;
        break;
      case 304: // not modified
        protocolStatusCode = ProtocolStatus.NOTMODIFIED;
        break;
      default:
        // 300 (multiple choices, preferred value in Location),
        // 301 (moved permanently), 305 (use proxy, Location is URL of
        // proxy) and every other 3xx code
        protocolStatusCode = ProtocolStatus.MOVED;
        break;
      }
      return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, redirectTarget));
    }

    switch (code) {
    case 400:
      // bad request, mark as GONE
      if (logger.isTraceEnabled()) {
        logger.trace("400 Bad request: " + u);
      }
      return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
    case 401:
      // the server requires authentication
      if (logger.isTraceEnabled()) {
        logger.trace("401 Authentication Required");
      }
      return new ProtocolOutput(c,
          new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: " + urlString));
    case 404:
      return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
    case 410:
      // permanently GONE
      return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + u));
    default:
      return new ProtocolOutput(c,
          new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u));
    }
  } catch (Throwable e) {
    // log the full stack trace at debug level or if logUtil does not
    // select the short form for this throwable
    final boolean withStackTrace = logger.isDebugEnabled() || !logUtil.logShort(e);
    if (withStackTrace) {
      logger.error("Failed to get protocol output", e);
    } else {
      logger.error("Failed to get protocol output: {}", e.getClass().getName());
    }
    return new ProtocolOutput(null, new ProtocolStatus(e));
  }
}
Aggregations