Search in sources :

Example 1 with Response

use of org.apache.nutch.net.protocols.Response in project nutch by apache.

the class TestProtocolHttp method fetchPage.

/**
 * Fetches the specified <code>page</code> from the local Jetty server and
 * checks whether the HTTP response status code matches the expected
 * code. Also uses JSP pages for redirection.
 *
 * @param page
 *          Page to be fetched.
 * @param expectedCode
 *          HTTP response status code expected while fetching the page.
 */
private void fetchPage(String page, int expectedCode) throws Exception {
    URL url = new URL("http", "", port, page);
    CrawlDatum crawlDatum = new CrawlDatum();
    Response response = http.getResponse(url, crawlDatum, true);
    ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()), crawlDatum);
    Content content = out.getContent();
    assertEquals("HTTP Status Code for " + url, expectedCode, response.getCode());
    if (page.compareTo("/nonexists.html") != 0 && page.compareTo("/brokenpage.jsp") != 0 && page.compareTo("/redirection") != 0) {
        assertEquals("ContentType " + url, "text/html", content.getContentType());
Also used : Response(org.apache.nutch.net.protocols.Response) ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) URL(java.net.URL)

Example 2 with Response

use of org.apache.nutch.net.protocols.Response in project nutch by apache.

the class TestProtocolHttpClient method fetchPage.

/**
 * Fetches the specified <code>page</code> from the local Jetty server and
 * checks whether the HTTP response status code matches the expected
 * code.
 *
 * @param page
 *          Page to be fetched.
 * @param expectedCode
 *          HTTP response status code expected while fetching the page.
 * @throws Exception
 *           When an error occurs or test case fails.
 */
private void fetchPage(String page, int expectedCode) throws Exception {
    URL url = new URL("http", "", port, page);
    Response response = null;
    response = http.getResponse(url, new CrawlDatum(), true);
    int code = response.getCode();
    Assert.assertEquals("HTTP Status Code for " + url, expectedCode, code);
Also used : Response(org.apache.nutch.net.protocols.Response) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) URL(java.net.URL)

Example 3 with Response

use of org.apache.nutch.net.protocols.Response in project nutch by apache.

the class HttpRobotRulesParser method getRobotRulesSet.

/**
 * Get the rules from robots.txt which apply for the given {@code url}.
 * Robot rules are cached for a unique combination of host, protocol, and
 * port. If no rules are found in the cache, an HTTP request is sent to fetch
 * {@code protocol://host:port/robots.txt}. The robots.txt is then parsed and
 * the rules are cached to avoid re-fetching and re-parsing it again.
 *
 * @param http
 *          The {@link Protocol} object
 * @param url
 *          URL
 * @param robotsTxtContent
 *          container to store responses when fetching the robots.txt file for
 *          debugging or archival purposes. Instead of a robots.txt file, it
 *          may include redirects or an error page (404, etc.). Response
 *          {@link Content} is appended to the passed list. If null is passed
 *          nothing is stored.
 * @return robotRules A {@link BaseRobotRules} object for the rules
 */
public BaseRobotRules getRobotRulesSet(Protocol http, URL url, List<Content> robotsTxtContent) {
    if (LOG.isTraceEnabled() && isAllowListed(url)) {
        LOG.trace("Ignoring robots.txt (host is allowlisted) for URL: {}", url);
    String cacheKey = getCacheKey(url);
    BaseRobotRules robotRules = CACHE.get(cacheKey);
    if (robotRules != null) {
        // cached rule
        return robotRules;
    } else if (LOG.isTraceEnabled()) {
        LOG.trace("cache miss {}", url);
    boolean cacheRule = true;
    URL redir = null;
    if (isAllowListed(url)) {
        // check in advance whether a host is allowlisted
        // (we do not need to fetch robots.txt)
        robotRules = EMPTY_RULES;"Allowlisted host found for: {}", url);"Ignoring robots.txt for all URLs from allowlisted host: {}", url.getHost());
    } else {
        try {
            URL robotsUrl = new URL(url, "/robots.txt");
            Response response = ((HttpBase) http).getResponse(robotsUrl, new CrawlDatum(), true);
            if (robotsTxtContent != null) {
                addRobotsContent(robotsTxtContent, robotsUrl, response);
            // try one level of redirection ?
            if (response.getCode() == 301 || response.getCode() == 302) {
                String redirection = response.getHeader("Location");
                if (redirection == null) {
                    // some versions of MS IIS are known to mangle this header
                    redirection = response.getHeader("location");
                if (redirection != null) {
                    if (!redirection.startsWith("http")) {
                        // RFC says it should be absolute, but apparently it isn't
                        redir = new URL(url, redirection);
                    } else {
                        redir = new URL(redirection);
                    response = ((HttpBase) http).getResponse(redir, new CrawlDatum(), true);
                    if (robotsTxtContent != null) {
                        addRobotsContent(robotsTxtContent, redir, response);
            if (// found rules: parse them
            response.getCode() == 200)
                robotRules = parseRules(url.toString(), response.getContent(), response.getHeader("Content-Type"), agentNames);
            else if ((response.getCode() == 403) && (!allowForbidden))
                // use forbid all
                robotRules = FORBID_ALL_RULES;
            else if (response.getCode() >= 500) {
                // try again later to fetch robots.txt
                cacheRule = false;
                if (deferVisits503) {
                    // signal fetcher to suspend crawling for this host
                    robotRules = DEFER_VISIT_RULES;
                } else {
                    robotRules = EMPTY_RULES;
            } else {
                // use default rules
                robotRules = EMPTY_RULES;
        } catch (Throwable t) {
            if (LOG.isInfoEnabled()) {
      "Couldn't get robots.txt for " + url + ": " + t.toString());
            // try again later to fetch robots.txt
            cacheRule = false;
            robotRules = EMPTY_RULES;
    if (cacheRule) {
        // cache rules for host
        CACHE.put(cacheKey, robotRules);
        if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost()) && "/robots.txt".equals(redir.getFile())) {
            // cache also for the redirected host
            // if the URL path is /robots.txt
            CACHE.put(getCacheKey(redir), robotRules);
    return robotRules;
Also used : Response(org.apache.nutch.net.protocols.Response) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) BaseRobotRules(crawlercommons.robots.BaseRobotRules) URL(java.net.URL)

Example 4 with Response

use of org.apache.nutch.net.protocols.Response in project nutch by apache.

the class OkHttp method setConf.

public void setConf(Configuration conf) {
    // protocols in order of preference
    List<okhttp3.Protocol> protocols = new ArrayList<>();
    if (useHttp2) {
    okhttp3.OkHttpClient.Builder builder = new OkHttpClient.Builder().protocols(// 
    false).connectTimeout(timeout, TimeUnit.MILLISECONDS).writeTimeout(timeout, TimeUnit.MILLISECONDS).readTimeout(timeout, TimeUnit.MILLISECONDS);
    if (!tlsCheckCertificate) {
        builder.sslSocketFactory(trustAllSslSocketFactory, (X509TrustManager) trustAllCerts[0]);
        builder.hostnameVerifier(new HostnameVerifier() {

            public boolean verify(String hostname, SSLSession session) {
                return true;
    if (!accept.isEmpty()) {
        getCustomRequestHeaders().add(new String[] { "Accept", accept });
    if (!acceptLanguage.isEmpty()) {
        getCustomRequestHeaders().add(new String[] { "Accept-Language", acceptLanguage });
    if (!acceptCharset.isEmpty()) {
        getCustomRequestHeaders().add(new String[] { "Accept-Charset", acceptCharset });
    if (useProxy) {
        Proxy proxy = new Proxy(proxyType, new InetSocketAddress(proxyHost, proxyPort));
        String proxyUsername = conf.get("http.proxy.username");
        if (proxyUsername == null) {
            ProxySelector selector = new ProxySelector() {

                private final List<Proxy> noProxyList = new ArrayList<Proxy>() {


                private final List<Proxy> proxyList = new ArrayList<Proxy>() {


                public List<Proxy> select(URI uri) {
                    if (useProxy(uri)) {
                        return proxyList;
                    return noProxyList;

                public void connectFailed(URI uri, SocketAddress sa, IOException ioe) {
                    LOG.error("Connection to proxy failed for {}: {}", uri, ioe);
        } else {
         * NOTE: the proxy exceptions list does NOT work with proxy
         * username/password because an okhttp3 bug
         * ( when using the
         * ProxySelector class with proxy auth. If a proxy username is present,
         * the configured proxy will be used for ALL requests.
            if (proxyException.size() > 0) {
                LOG.warn("protocol-okhttp does not respect 'http.proxy.exception.list' setting when " + "'http.proxy.username' is set. This is a limitation of the current okhttp3 " + "implementation, see NUTCH-2636");
            String proxyPassword = conf.get("http.proxy.password");
            Authenticator proxyAuthenticator = new Authenticator() {

                public Request authenticate(okhttp3.Route route, okhttp3.Response response) throws IOException {
                    String credential = okhttp3.Credentials.basic(proxyUsername, proxyPassword);
                    return response.request().newBuilder().header("Proxy-Authorization", credential).build();
    if (storeIPAddress || storeHttpHeaders || storeHttpRequest) {
        builder.addNetworkInterceptor(new HTTPHeadersInterceptor());
    // enable support for Brotli compression (Content-Encoding)
    client =;
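Also used : OkHttpClient(okhttp3.OkHttpClient) InetSocketAddress(java.net.InetSocketAddress) ArrayList(java.util.ArrayList) SSLSession(javax.net.ssl.SSLSession) IOException(java.io.IOException) URI(java.net.URI) HostnameVerifier(javax.net.ssl.HostnameVerifier) ProxySelector(java.net.ProxySelector) Response(org.apache.nutch.net.protocols.Response) Proxy(java.net.Proxy) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) Protocol(okhttp3.Protocol) SocketAddress(java.net.SocketAddress) InetSocketAddress(java.net.InetSocketAddress) Authenticator(okhttp3.Authenticator)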

Example 5 with Response

use of org.apache.nutch.net.protocols.Response in project nutch by apache.

the class HttpBase method getProtocolOutput.

public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
    String urlString = url.toString();
    try {
        URL u = new URL(urlString);
        long startTime = System.currentTimeMillis();
        // make a request
        Response response = getResponse(u, datum, false);
        if (this.responseTime) {
            int elapsedTime = (int) (System.currentTimeMillis() - startTime);
            datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedTime));
        int code = response.getCode();
        datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text(Integer.toString(code)));
        byte[] content = response.getContent();
        Content c = new Content(u.toString(), u.toString(), (content == null ? EMPTY_CONTENT : content), response.getHeader("Content-Type"), response.getHeaders(), mimeTypes);
        if (code == 200) {
            // return it
            return new ProtocolOutput(c);
        } else if (code >= 300 && code < 400) {
            // handle redirect
            String location = response.getHeader("Location");
            // some broken servers, such as MS IIS, use lowercase header name...
            if (location == null)
                location = response.getHeader("location");
            if (location == null)
                location = "";
            u = new URL(u, location);
            int protocolStatusCode;
            switch(code) {
                case // multiple choices, preferred value in Location
                    protocolStatusCode = ProtocolStatus.MOVED;
                // moved permanently
                case 301:
                case // use proxy (Location is URL of proxy)
                    protocolStatusCode = ProtocolStatus.MOVED;
                // found (temporarily moved)
                case 302:
                // see other (redirect after POST)
                case 303:
                case // temporary redirect
                    protocolStatusCode = ProtocolStatus.TEMP_MOVED;
                case // not modified
                    protocolStatusCode = ProtocolStatus.NOTMODIFIED;
                    protocolStatusCode = ProtocolStatus.MOVED;
            // handle this in the higher layer.
            return new ProtocolOutput(c, new ProtocolStatus(protocolStatusCode, u));
        } else if (code == 400) {
            // bad request, mark as GONE
            if (logger.isTraceEnabled()) {
                logger.trace("400 Bad request: " + u);
            return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, u));
        } else if (code == 401) {
            // provided.
            if (logger.isTraceEnabled()) {
                logger.trace("401 Authentication Required");
            return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: " + urlString));
        } else if (code == 404) {
            return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.NOTFOUND, u));
        } else if (code == 410) {
            // permanently GONE
            return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + u));
        } else {
            return new ProtocolOutput(c, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + u));
    } catch (Throwable e) {
        if (logger.isDebugEnabled() || !logUtil.logShort(e)) {
            logger.error("Failed to get protocol output", e);
        } else {
            logger.error("Failed to get protocol output: {}", e.getClass().getName());
        return new ProtocolOutput(null, new ProtocolStatus(e));
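Also used : Response(org.apache.nutch.net.protocols.Response) ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Content(org.apache.nutch.protocol.Content) Text(org.apache.hadoop.io.Text) URL(java.net.URL) IntWritable(org.apache.hadoop.io.IntWritable) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)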


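Response (org.apache.nutch.net.protocols.Response)5 URL (java.net.URL)4 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)3 Text (org.apache.hadoop.io.Text)2 Content (org.apache.nutch.protocol.Content)2 ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput)2 BaseRobotRules (crawlercommons.robots.BaseRobotRules)1 IOException (java.io.IOException)1 InetSocketAddress (java.net.InetSocketAddress)1 Proxy (java.net.Proxy)1 ProxySelector (java.net.ProxySelector)1 SocketAddress (java.net.SocketAddress)1 URI (java.net.URI)1 ArrayList (java.util.ArrayList)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 HostnameVerifier (javax.net.ssl.HostnameVerifier)1 SSLSession (javax.net.ssl.SSLSession)1 Authenticator (okhttp3.Authenticator)1 OkHttpClient (okhttp3.OkHttpClient)1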