Example 11 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

Class TestMoreIndexingFilter, method testContentDispositionTitle.

@Test
public void testContentDispositionTitle() throws IndexingException {
    Configuration conf = NutchConfiguration.create();
    Metadata metadata = new Metadata();
    metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    Text url = new Text("http://www.example.com/");
    ParseImpl parseImpl = new ParseImpl("text", new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
    NutchDocument doc = new NutchDocument();
    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
    Assert.assertEquals("content-disposition not detected", "filename.ext", doc.getFieldValue("title"));
    /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
    doc = new NutchDocument();
    doc.add("title", "title");
    doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
    Assert.assertEquals("do not add second title by content-disposition", "title", doc.getFieldValue("title"));
}
Also used : Outlink(org.apache.nutch.parse.Outlink) ParseStatus(org.apache.nutch.parse.ParseStatus) NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) ParseData(org.apache.nutch.parse.ParseData) NutchDocument(org.apache.nutch.indexer.NutchDocument) Metadata(org.apache.nutch.metadata.Metadata) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) Test(org.junit.Test)
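
The expected title "filename.ext" is pulled out of the Content-Disposition header value. A minimal sketch of such an extraction, assuming a simple regex (the actual pattern used by MoreIndexingFilter may differ):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ContentDispositionSketch {

    // Matches filename=value or filename="value"; illustrative only.
    private static final Pattern FILENAME = Pattern.compile("filename=\"?([^\";]+)\"?");

    public static String extractFilename(String headerValue) {
        Matcher m = FILENAME.matcher(headerValue);
        return m.find() ? m.group(1) : null;
    }

    public static void main(String[] args) {
        // Prints "filename.ext", matching the title asserted in the test.
        System.out.println(extractFilename("filename=filename.ext"));
    }
}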

Example 12 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

Class TestMoreIndexingFilter, method testNoParts.

/**
 * @since NUTCH-901
 */
@Test
public void testNoParts() {
    Configuration conf = NutchConfiguration.create();
    conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
    MoreIndexingFilter filter = new MoreIndexingFilter();
    filter.setConf(conf);
    Assert.assertNotNull(filter);
    NutchDocument doc = new NutchDocument();
    ParseImpl parse = new ParseImpl("foo bar", new ParseData());
    try {
        filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"), new CrawlDatum(), new Inlinks());
    } catch (Exception e) {
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
    Assert.assertNotNull(doc);
    Assert.assertTrue(doc.getFieldNames().contains("type"));
    Assert.assertEquals(1, doc.getField("type").getValues().size());
    Assert.assertEquals("text/html", doc.getFieldValue("type"));
}
Also used : NutchConfiguration(org.apache.nutch.util.NutchConfiguration) Configuration(org.apache.hadoop.conf.Configuration) NutchDocument(org.apache.nutch.indexer.NutchDocument) ParseData(org.apache.nutch.parse.ParseData) ParseImpl(org.apache.nutch.parse.ParseImpl) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) Inlinks(org.apache.nutch.crawl.Inlinks) IndexingException(org.apache.nutch.indexer.IndexingException) Test(org.junit.Test)
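
The test disables moreIndexingFilter.indexMimeTypeParts and asserts that the "type" field carries exactly one value. When the option is enabled, the filter also indexes the parts of the MIME type alongside the full value. A minimal sketch of that kind of splitting, as an illustration rather than the filter's actual code:

import java.util.ArrayList;
import java.util.List;

public class MimeTypePartsSketch {

    // Returns the values a "type" field could carry; illustrative,
    // not the MoreIndexingFilter internals.
    public static List<String> typeValues(String mimeType, boolean indexParts) {
        List<String> values = new ArrayList<>();
        values.add(mimeType);
        if (indexParts) {
            for (String part : mimeType.split("/")) {
                values.add(part);
            }
        }
        return values;
    }

    public static void main(String[] args) {
        // [text/html], a single value, as the test asserts
        System.out.println(typeValues("text/html", false));
        // [text/html, text, html], with the parts indexed as well
        System.out.println(typeValues("text/html", true));
    }
}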

Example 13 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

Class HttpRobotRulesParser, method getRobotRulesSet.

/**
 * Get the rules from robots.txt which apply to the given {@code url}.
 * Robot rules are cached for a unique combination of host, protocol, and
 * port. If no rules are found in the cache, an HTTP request is sent to
 * fetch {@code protocol://host:port/robots.txt}. The robots.txt file is
 * then parsed and the rules are cached to avoid re-fetching and
 * re-parsing it.
 *
 * @param http
 *          The {@link Protocol} object
 * @param url
 *          URL
 * @param robotsTxtContent
 *          container to store responses when fetching the robots.txt file for
 *          debugging or archival purposes. Instead of a robots.txt file, it
 *          may include redirects or an error page (404, etc.). Response
 *          {@link Content} is appended to the passed list. If null is
 *          passed, nothing is stored.
 *
 * @return a {@link BaseRobotRules} object containing the rules
 */
@Override
public BaseRobotRules getRobotRulesSet(Protocol http, URL url, List<Content> robotsTxtContent) {
    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
        LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
    }
    String cacheKey = getCacheKey(url);
    BaseRobotRules robotRules = CACHE.get(cacheKey);
    if (robotRules != null) {
        // cached rule
        return robotRules;
    } else if (LOG.isTraceEnabled()) {
        LOG.trace("cache miss " + url);
    }
    boolean cacheRule = true;
    URL redir = null;
    if (isWhiteListed(url)) {
        // check in advance whether a host is whitelisted
        // (we do not need to fetch robots.txt)
        robotRules = EMPTY_RULES;
        LOG.info("Whitelisted host found for: {}", url);
        LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", url.getHost());
    } else {
        try {
            URL robotsUrl = new URL(url, "/robots.txt");
            Response response = ((HttpBase) http).getResponse(robotsUrl, new CrawlDatum(), true);
            if (robotsTxtContent != null) {
                addRobotsContent(robotsTxtContent, robotsUrl, response);
            }
            // follow at most one level of redirection
            if (response.getCode() == 301 || response.getCode() == 302) {
                String redirection = response.getHeader("Location");
                if (redirection == null) {
                    // some versions of MS IIS are known to mangle this header
                    redirection = response.getHeader("location");
                }
                if (redirection != null) {
                    if (!redirection.startsWith("http")) {
                        // RFC says it should be absolute, but apparently it isn't
                        redir = new URL(url, redirection);
                    } else {
                        redir = new URL(redirection);
                    }
                    response = ((HttpBase) http).getResponse(redir, new CrawlDatum(), true);
                    if (robotsTxtContent != null) {
                        addRobotsContent(robotsTxtContent, redir, response);
                    }
                }
            }
            if (response.getCode() == 200) {
                // found rules: parse them
                robotRules = parseRules(url.toString(), response.getContent(), response.getHeader("Content-Type"), agentNames);
            } else if ((response.getCode() == 403) && (!allowForbidden)) {
                // 403 Forbidden: use forbid-all rules
                robotRules = FORBID_ALL_RULES;
            } else if (response.getCode() >= 500) {
                // server error: try again later to fetch robots.txt
                cacheRule = false;
                robotRules = EMPTY_RULES;
            } else {
                // use default rules (allow all)
                robotRules = EMPTY_RULES;
            }
        } catch (Throwable t) {
            if (LOG.isInfoEnabled()) {
                LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
            }
            // try again later to fetch robots.txt
            cacheRule = false;
            robotRules = EMPTY_RULES;
        }
    }
    if (cacheRule) {
        // cache rules for host
        CACHE.put(cacheKey, robotRules);
        if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
            // cache also for the redirected host
            CACHE.put(getCacheKey(redir), robotRules);
        }
    }
    return robotRules;
}
Also used : Response(org.apache.nutch.net.protocols.Response) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) BaseRobotRules(crawlercommons.robots.BaseRobotRules) URL(java.net.URL)
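
The Javadoc above says rules are cached per unique combination of host, protocol, and port. A sketch of how such a cache key could be composed, assuming a simple "protocol:host:port" layout (the real getCacheKey in HttpRobotRulesParser may differ in detail):

import java.net.URL;
import java.util.Locale;

public class RobotsCacheKeySketch {

    // Builds a "protocol:host:port" key, falling back to the protocol's
    // default port when none is given explicitly. Illustrative only.
    public static String getCacheKey(URL url) {
        String protocol = url.getProtocol().toLowerCase(Locale.ROOT);
        String host = url.getHost().toLowerCase(Locale.ROOT);
        int port = (url.getPort() == -1) ? url.getDefaultPort() : url.getPort();
        return protocol + ":" + host + ":" + port;
    }

    public static void main(String[] args) throws Exception {
        // Both print "http:example.com:80", so both URLs share one cache entry.
        System.out.println(getCacheKey(new URL("http://example.com/robots.txt")));
        System.out.println(getCacheKey(new URL("http://EXAMPLE.com:80/page.html")));
    }
}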

Example 14 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

Class TestProtocolHttp, method fetchPage.

/**
 * Fetches the specified <code>page</code> from the local Jetty server and
 * checks whether the HTTP response status code matches the expected
 * code. JSP pages are also used to test redirection.
 *
 * @param page
 *          Page to be fetched.
 * @param expectedCode
 *          HTTP response status code expected while fetching the page.
 */
private void fetchPage(String page, int expectedCode) throws Exception {
    URL url = new URL("http", "127.0.0.1", port, page);
    CrawlDatum crawlDatum = new CrawlDatum();
    Response response = http.getResponse(url, crawlDatum, true);
    ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()), crawlDatum);
    Content content = out.getContent();
    assertEquals("HTTP Status Code for " + url, expectedCode, response.getCode());
    if (!page.equals("/nonexists.html") && !page.equals("/brokenpage.jsp") && !page.equals("/redirection")) {
        assertEquals("ContentType " + url, "text/html", content.getContentType());
    }
}
Also used : Response(org.apache.nutch.net.protocols.Response) ProtocolOutput(org.apache.nutch.protocol.ProtocolOutput) Content(org.apache.nutch.protocol.Content) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) Text(org.apache.hadoop.io.Text) URL(java.net.URL)
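
A hypothetical test method driving fetchPage; the page paths and expected codes below are illustrative, and the actual tests in TestProtocolHttp may differ:

@Test
public void testStatusCodes() throws Exception {
    // Assumes the local Jetty server serves /index.html and returns
    // 404 for the missing page, as fetchPage's contract implies.
    fetchPage("/index.html", 200);
    fetchPage("/nonexists.html", 404);
}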

Example 15 with CrawlDatum

Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.

Class TestProtocolHttpClient, method fetchPage.

/**
 * Fetches the specified <code>page</code> from the local Jetty server and
 * checks whether the HTTP response status code matches the expected
 * code.
 *
 * @param page
 *          Page to be fetched.
 * @param expectedCode
 *          HTTP response status code expected while fetching the page.
 * @throws Exception
 *           When an error occurs or test case fails.
 */
private void fetchPage(String page, int expectedCode) throws Exception {
    URL url = new URL("http", "127.0.0.1", port, page);
    Response response = http.getResponse(url, new CrawlDatum(), true);
    int code = response.getCode();
    Assert.assertEquals("HTTP Status Code for " + url, expectedCode, code);
}
Also used : Response(org.apache.nutch.net.protocols.Response) CrawlDatum(org.apache.nutch.crawl.CrawlDatum) URL(java.net.URL)

Aggregations

CrawlDatum (org.apache.nutch.crawl.CrawlDatum): 66 usages
Text (org.apache.hadoop.io.Text): 60 usages
Test (org.junit.Test): 31 usages
Inlinks (org.apache.nutch.crawl.Inlinks): 25 usages
Configuration (org.apache.hadoop.conf.Configuration): 24 usages
ParseData (org.apache.nutch.parse.ParseData): 22 usages
ParseImpl (org.apache.nutch.parse.ParseImpl): 21 usages
NutchDocument (org.apache.nutch.indexer.NutchDocument): 20 usages
NutchConfiguration (org.apache.nutch.util.NutchConfiguration): 20 usages
Content (org.apache.nutch.protocol.Content): 19 usages
Parse (org.apache.nutch.parse.Parse): 15 usages
Metadata (org.apache.nutch.metadata.Metadata): 14 usages
ParseStatus (org.apache.nutch.parse.ParseStatus): 14 usages
ParseUtil (org.apache.nutch.parse.ParseUtil): 13 usages
Protocol (org.apache.nutch.protocol.Protocol): 13 usages
ProtocolFactory (org.apache.nutch.protocol.ProtocolFactory): 13 usages
URL (java.net.URL): 11 usages
Outlink (org.apache.nutch.parse.Outlink): 11 usages
IOException (java.io.IOException): 7 usages
ArrayList (java.util.ArrayList): 5 usages