Search in sources :

Example 1 with Response

use of org.apache.nutch.net.protocols.Response in project nutch by apache.

the class TestProtocolHttp method fetchPage.

/**
 * Requests <code>page</code> from the local Jetty server on 127.0.0.1 and
 * asserts that the HTTP status code of the response equals
 * <code>expectedCode</code>. The page is additionally fetched through
 * <code>getProtocolOutput</code>; unless the page is
 * <code>/nonexists.html</code>, <code>/brokenpage.jsp</code> or
 * <code>/redirection</code>, the content type of that output must be
 * <code>text/html</code>.
 *
 * @param page
 *          Path of the page to be fetched.
 * @param expectedCode
 *          HTTP response status code the fetch is expected to yield.
 * @throws Exception
 *           When the URL cannot be built or the fetch fails.
 */
private void fetchPage(String page, int expectedCode) throws Exception {
    final URL target = new URL("http", "127.0.0.1", port, page);
    final CrawlDatum datum = new CrawlDatum();

    // the same datum is used for the raw response and the protocol output
    final Response httpResponse = http.getResponse(target, datum, true);
    final ProtocolOutput protocolOutput = http.getProtocolOutput(new Text(target.toString()), datum);
    final Content fetched = protocolOutput.getContent();

    assertEquals("HTTP Status Code for " + target, expectedCode, httpResponse.getCode());

    // these three pages are exempt from the content type check
    final boolean skipContentTypeCheck = page.equals("/nonexists.html")
            || page.equals("/brokenpage.jsp")
            || page.equals("/redirection");
    if (!skipContentTypeCheck) {
        assertEquals("ContentType " + target, "text/html", fetched.getContentType());
    }
}
Also used : Response(org.apache.nutch.net.protocols.Response) ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) URL(java.net.URL)

Example 2 with Response

use of org.apache.nutch.net.protocols.Response in project nutch by apache.

the class TestProtocolHttpClient method fetchPage.

/**
 * Requests <code>page</code> from the local Jetty server on 127.0.0.1 and
 * asserts that the HTTP status code of the response equals
 * <code>expectedCode</code>.
 *
 * @param page
 *          Path of the page to be fetched.
 * @param expectedCode
 *          HTTP response status code the fetch is expected to yield.
 * @throws Exception
 *           When an error occurs or the test case fails.
 */
private void fetchPage(String page, int expectedCode) throws Exception {
    final URL target = new URL("http", "127.0.0.1", port, page);
    final Response fetched = http.getResponse(target, new CrawlDatum(), true);
    Assert.assertEquals("HTTP Status Code for " + target, expectedCode, fetched.getCode());
}
Also used : Response(org.apache.nutch.net.protocols.Response) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) URL(java.net.URL)

Example 3 with Response

use of org.apache.nutch.net.protocols.Response in project nutch by apache.

the class HttpRobotRulesParser method getRobotRulesSet.

/**
 * Returns the robots.txt rules that apply to the given {@code url}.
 * <p>
 * Rules are looked up in the cache under the key computed by
 * {@code getCacheKey(url)} (a unique combination of host, protocol, and
 * port). On a cache miss, and unless the host is allowlisted,
 * {@code /robots.txt} is requested relative to {@code url}; a single 301 or
 * 302 redirect is followed. The outcome is mapped as follows:
 * <ul>
 * <li>200: the response body is parsed into rules</li>
 * <li>403 while {@code allowForbidden} is false: forbid-all rules</li>
 * <li>500 and above: defer-visit rules if {@code deferVisits503} is set,
 * empty rules otherwise; the result is not cached</li>
 * <li>any other code: empty rules</li>
 * <li>any {@link Throwable} during the fetch: empty rules, not cached</li>
 * </ul>
 * Cacheable results are stored for the host of {@code url} and, when the
 * redirect led to {@code /robots.txt} on a different host, for that host
 * as well.
 *
 * @param http
 *          The {@link Protocol} object; it is cast to {@code HttpBase} to
 *          issue the requests
 * @param url
 *          URL for which the rules are requested
 * @param robotsTxtContent
 *          container to store responses when fetching the robots.txt file for
 *          debugging or archival purposes. Instead of a robots.txt file, it
 *          may include redirects or an error page (404, etc.). Response
 *          {@link Content} is appended to the passed list. If null is passed
 *          nothing is stored.
 *
 * @return A {@link BaseRobotRules} object holding the rules
 */
@Override
public BaseRobotRules getRobotRulesSet(Protocol http, URL url, List<Content> robotsTxtContent) {
    if (LOG.isTraceEnabled() && isAllowListed(url)) {
        LOG.trace("Ignoring robots.txt (host is allowlisted) for URL: {}", url);
    }

    final String key = getCacheKey(url);
    final BaseRobotRules cached = CACHE.get(key);
    if (cached != null) {
        // cache hit: nothing to fetch
        return cached;
    }
    if (LOG.isTraceEnabled()) {
        LOG.trace("cache miss {}", url);
    }

    BaseRobotRules rules;
    boolean storeInCache = true;
    URL redirectTarget = null;

    if (isAllowListed(url)) {
        // allowlisted hosts never need their robots.txt fetched
        rules = EMPTY_RULES;
        LOG.info("Allowlisted host found for: {}", url);
        LOG.info("Ignoring robots.txt for all URLs from allowlisted host: {}", url.getHost());
    } else {
        try {
            final URL robotsUrl = new URL(url, "/robots.txt");
            final HttpBase fetcher = (HttpBase) http;
            Response response = fetcher.getResponse(robotsUrl, new CrawlDatum(), true);
            if (robotsTxtContent != null) {
                addRobotsContent(robotsTxtContent, robotsUrl, response);
            }

            // follow at most one level of redirection
            final int firstStatus = response.getCode();
            if (firstStatus == 301 || firstStatus == 302) {
                String location = response.getHeader("Location");
                if (location == null) {
                    // some versions of MS IIS are known to mangle this header
                    location = response.getHeader("location");
                }
                if (location != null) {
                    // RFC says the location should be absolute, but apparently
                    // it isn't always: resolve relative ones against the URL
                    redirectTarget = location.startsWith("http")
                            ? new URL(location)
                            : new URL(url, location);
                    response = fetcher.getResponse(redirectTarget, new CrawlDatum(), true);
                    if (robotsTxtContent != null) {
                        addRobotsContent(robotsTxtContent, redirectTarget, response);
                    }
                }
            }

            final int status = response.getCode();
            if (status == 200) {
                // found rules: parse them
                rules = parseRules(url.toString(), response.getContent(), response.getHeader("Content-Type"), agentNames);
            } else if (status == 403 && !allowForbidden) {
                // use forbid all
                rules = FORBID_ALL_RULES;
            } else if (status >= 500) {
                // try again later to fetch robots.txt
                storeInCache = false;
                // optionally signal the fetcher to suspend crawling for this host
                rules = deferVisits503 ? DEFER_VISIT_RULES : EMPTY_RULES;
            } else {
                // use default rules
                rules = EMPTY_RULES;
            }
        } catch (Throwable t) {
            if (LOG.isInfoEnabled()) {
                LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
            }
            // try again later to fetch robots.txt
            storeInCache = false;
            rules = EMPTY_RULES;
        }
    }

    if (storeInCache) {
        // cache rules for host
        CACHE.put(key, rules);
        final boolean redirectedToOtherHostRobotsTxt = redirectTarget != null
                && !redirectTarget.getHost().equalsIgnoreCase(url.getHost())
                && "/robots.txt".equals(redirectTarget.getFile());
        if (redirectedToOtherHostRobotsTxt) {
            // cache also for the redirected host if the URL path is /robots.txt
            CACHE.put(getCacheKey(redirectTarget), rules);
        }
    }
    return rules;
}
Also used : Response(org.apache.nutch.net.protocols.Response) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) BaseRobotRules(crawlercommons.robots.BaseRobotRules) URL(java.net.URL)

Example 4 with Response

use of org.apache.nutch.net.protocols.Response in project nutch by apache.

the class OkHttp method setConf.

/**
 * Applies the configuration and (re)builds the OkHttp client used by this
 * protocol implementation.
 * <p>
 * The client is set up with the preferred protocol versions, the configured
 * timeout for connect/read/write, no automatic redirect following and,
 * depending on the configuration, relaxed TLS checks, custom
 * Accept/Accept-Language/Accept-Charset request headers, a proxy (with or
 * without authentication), a network interceptor recording HTTP headers
 * and the Brotli interceptor.
 *
 * @param conf
 *          the configuration; besides what the superclass reads, the
 *          properties {@code http.proxy.username} and
 *          {@code http.proxy.password} are read here
 */
@Override
public void setConf(Configuration conf) {
    super.setConf(conf);

    // protocols in order of preference
    List<okhttp3.Protocol> protocols = new ArrayList<>();
    if (useHttp2) {
        protocols.add(okhttp3.Protocol.HTTP_2);
    }
    protocols.add(okhttp3.Protocol.HTTP_1_1);

    okhttp3.OkHttpClient.Builder builder = new OkHttpClient.Builder()
            .protocols(protocols)
            .retryOnConnectionFailure(true)
            .followRedirects(false)
            .connectTimeout(timeout, TimeUnit.MILLISECONDS)
            .writeTimeout(timeout, TimeUnit.MILLISECONDS)
            .readTimeout(timeout, TimeUnit.MILLISECONDS);

    if (!tlsCheckCertificate) {
        // certificate checks are disabled by configuration: trust every
        // certificate and accept every host name
        builder.sslSocketFactory(trustAllSslSocketFactory, (X509TrustManager) trustAllCerts[0]);
        builder.hostnameVerifier(new HostnameVerifier() {

            @Override
            public boolean verify(String hostname, SSLSession session) {
                return true;
            }
        });
    }

    if (!accept.isEmpty()) {
        getCustomRequestHeaders().add(new String[] { "Accept", accept });
    }
    if (!acceptLanguage.isEmpty()) {
        getCustomRequestHeaders().add(new String[] { "Accept-Language", acceptLanguage });
    }
    if (!acceptCharset.isEmpty()) {
        getCustomRequestHeaders().add(new String[] { "Accept-Charset", acceptCharset });
    }

    if (useProxy) {
        Proxy proxy = new Proxy(proxyType, new InetSocketAddress(proxyHost, proxyPort));
        String proxyUsername = conf.get("http.proxy.username");
        if (proxyUsername == null) {
            // no proxy authentication: decide per URI whether to use the proxy
            ProxySelector selector = new ProxySelector() {

                private final List<Proxy> noProxyList = new ArrayList<>();

                private final List<Proxy> proxyList = new ArrayList<>();

                {
                    noProxyList.add(Proxy.NO_PROXY);
                    proxyList.add(proxy);
                }

                @Override
                public List<Proxy> select(URI uri) {
                    if (useProxy(uri)) {
                        return proxyList;
                    }
                    return noProxyList;
                }

                @Override
                public void connectFailed(URI uri, SocketAddress sa, IOException ioe) {
                    LOG.error("Connection to proxy failed for {}: {}", uri, ioe);
                }
            };
            builder.proxySelector(selector);
        } else {
            /*
             * NOTE: the proxy exceptions list does NOT work with proxy
             * username/password because an okhttp3 bug
             * (https://github.com/square/okhttp/issues/3995) when using the
             * ProxySelector class with proxy auth. If a proxy username is present,
             * the configured proxy will be used for ALL requests.
             */
            if (proxyException.size() > 0) {
                LOG.warn("protocol-okhttp does not respect 'http.proxy.exception.list' setting when " + "'http.proxy.username' is set. This is a limitation of the current okhttp3 " + "implementation, see NUTCH-2636");
            }
            builder.proxy(proxy);
            String proxyPassword = conf.get("http.proxy.password");
            Authenticator proxyAuthenticator = new Authenticator() {

                @Override
                public Request authenticate(okhttp3.Route route, okhttp3.Response response) throws IOException {
                    String credential = okhttp3.Credentials.basic(proxyUsername, proxyPassword);
                    if (credential.equals(response.request().header("Proxy-Authorization"))) {
                        // The proxy already rejected exactly these credentials:
                        // give up instead of resending them until the client's
                        // follow-up limit is hit.
                        return null;
                    }
                    return response.request().newBuilder().header("Proxy-Authorization", credential).build();
                }
            };
            builder.proxyAuthenticator(proxyAuthenticator);
        }
    }

    if (storeIPAddress || storeHttpHeaders || storeHttpRequest) {
        builder.addNetworkInterceptor(new HTTPHeadersInterceptor());
    }

    // enable support for Brotli compression (Content-Encoding)
    builder.addInterceptor(BrotliInterceptor.INSTANCE);
    client = builder.build();
}
Also used : OkHttpClient(okhttp3.OkHttpClient) InetSocketAddress(java.net.InetSocketAddress) ArrayList(java.util.ArrayList) SSLSession(javax.net.ssl.SSLSession) IOException(java.io.IOException) URI(java.net.URI) HostnameVerifier(javax.net.ssl.HostnameVerifier) ProxySelector(java.net.ProxySelector) Response(org.apache.nutch.net.protocols.Response) Proxy(java.net.Proxy) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList) List(java.util.List) Protocol(okhttp3.Protocol) SocketAddress(java.net.SocketAddress) InetSocketAddress(java.net.InetSocketAddress) Authenticator(okhttp3.Authenticator)

Example 5 with Response

use of org.apache.nutch.net.protocols.Response in project nutch by apache.

the class HttpBase method getProtocolOutput.

/**
 * Fetches {@code url} and converts the HTTP response into a
 * {@link ProtocolOutput}.
 * <p>
 * The HTTP status code is recorded in the metadata of {@code datum} (as is
 * the response time, if enabled). The response content and headers are
 * wrapped into a {@link Content} object which is returned together with a
 * {@link ProtocolStatus} derived from the status code: plain output for
 * 200, a redirect status pointing to the resolved {@code Location} for 3xx,
 * and GONE, ACCESS_DENIED, NOTFOUND or EXCEPTION for the remaining codes.
 * Any {@link Throwable} raised on the way is logged and reported as a
 * {@link ProtocolStatus} without content.
 *
 * @param url
 *          URL to fetch
 * @param datum
 *          crawl datum passed on to the request; its metadata is updated
 * @return the protocol output, never {@code null}
 */
@Override
public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
    final String urlString = url.toString();
    try {
        URL target = new URL(urlString);

        // make a request, measuring how long it takes
        final long requestStart = System.currentTimeMillis();
        final Response response = getResponse(target, datum, false);
        if (this.responseTime) {
            final int elapsedMillis = (int) (System.currentTimeMillis() - requestStart);
            datum.getMetaData().put(RESPONSE_TIME, new IntWritable(elapsedMillis));
        }

        final int code = response.getCode();
        datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY, new Text(Integer.toString(code)));

        // a missing body is represented by the shared empty content
        byte[] body = response.getContent();
        if (body == null) {
            body = EMPTY_CONTENT;
        }
        final Content fetched = new Content(target.toString(), target.toString(), body, response.getHeader("Content-Type"), response.getHeaders(), mimeTypes);

        if (code == 200) {
            // got a good response: return it
            return new ProtocolOutput(fetched);
        }

        if (code >= 300 && code < 400) {
            // redirect: resolve the target, the higher layer handles it
            String location = response.getHeader("Location");
            if (location == null) {
                // some broken servers, such as MS IIS, use lowercase header name...
                location = response.getHeader("location");
            }
            if (location == null) {
                location = "";
            }
            target = new URL(target, location);

            final int redirectStatus;
            switch (code) {
                case 302: // found (temporarily moved)
                case 303: // see other (redirect after POST)
                case 307: // temporary redirect
                    redirectStatus = ProtocolStatus.TEMP_MOVED;
                    break;
                case 304: // not modified
                    redirectStatus = ProtocolStatus.NOTMODIFIED;
                    break;
                default:
                    // 300 (multiple choices, preferred value in Location),
                    // 301 (moved permanently), 305 (use proxy, Location is
                    // URL of proxy) and every other 3xx code
                    redirectStatus = ProtocolStatus.MOVED;
                    break;
            }
            return new ProtocolOutput(fetched, new ProtocolStatus(redirectStatus, target));
        }

        switch (code) {
            case 400:
                // bad request, mark as GONE
                if (logger.isTraceEnabled()) {
                    logger.trace("400 Bad request: " + target);
                }
                return new ProtocolOutput(fetched, new ProtocolStatus(ProtocolStatus.GONE, target));
            case 401:
                // authentication required
                if (logger.isTraceEnabled()) {
                    logger.trace("401 Authentication Required");
                }
                return new ProtocolOutput(fetched, new ProtocolStatus(ProtocolStatus.ACCESS_DENIED, "Authentication required: " + urlString));
            case 404:
                return new ProtocolOutput(fetched, new ProtocolStatus(ProtocolStatus.NOTFOUND, target));
            case 410:
                // permanently GONE
                return new ProtocolOutput(fetched, new ProtocolStatus(ProtocolStatus.GONE, "Http: " + code + " url=" + target));
            default:
                return new ProtocolOutput(fetched, new ProtocolStatus(ProtocolStatus.EXCEPTION, "Http code=" + code + ", url=" + target));
        }
    } catch (Throwable failure) {
        // log the full stack trace unless a short message is sufficient
        if (logger.isDebugEnabled() || !logUtil.logShort(failure)) {
            logger.error("Failed to get protocol output", failure);
        } else {
            logger.error("Failed to get protocol output: {}", failure.getClass().getName());
        }
        return new ProtocolOutput(null, new ProtocolStatus(failure));
    }
}
Also used : Response(org.apache.nutch.net.protocols.Response) ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Content(org.apache.nutch.protocol.Content) Text(org.apache.hadoop.io.Text) URL(java.net.URL) IntWritable(org.apache.hadoop.io.IntWritable) ProtocolStatus(org.apache.nutch.protocol.ProtocolStatus)

Aggregations

Response (org.apache.nutch.net.protocols.Response)5 URL (java.net.URL)4 CrawlDatum (org.apache.nutch.crawl.CrawlDatum)3 Text (org.apache.hadoop.io.Text)2 Content (org.apache.nutch.protocol.Content)2 ProtocolOutput (org.apache.nutch.protocol.ProtocolOutput)2 BaseRobotRules (crawlercommons.robots.BaseRobotRules)1 IOException (java.io.IOException)1 InetSocketAddress (java.net.InetSocketAddress)1 Proxy (java.net.Proxy)1 ProxySelector (java.net.ProxySelector)1 SocketAddress (java.net.SocketAddress)1 URI (java.net.URI)1 ArrayList (java.util.ArrayList)1 LinkedList (java.util.LinkedList)1 List (java.util.List)1 HostnameVerifier (javax.net.ssl.HostnameVerifier)1 SSLSession (javax.net.ssl.SSLSession)1 Authenticator (okhttp3.Authenticator)1 OkHttpClient (okhttp3.OkHttpClient)1