Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class TestMoreIndexingFilter, method testContentDispositionTitle.
@Test
public void testContentDispositionTitle() throws IndexingException {
  Configuration conf = NutchConfiguration.create();
  Metadata metadata = new Metadata();
  metadata.add(Response.CONTENT_DISPOSITION, "filename=filename.ext");
  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);
  Text url = new Text("http://www.example.com/");
  ParseImpl parseImpl = new ParseImpl("text",
      new ParseData(new ParseStatus(), "title", new Outlink[0], metadata));
  NutchDocument doc = new NutchDocument();
  doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
  Assert.assertEquals("content-disposition not detected", "filename.ext",
      doc.getFieldValue("title"));
  /* NUTCH-1140: do not add second title to avoid a multi-valued title field */
  doc = new NutchDocument();
  doc.add("title", "title");
  doc = filter.filter(doc, parseImpl, url, new CrawlDatum(), new Inlinks());
  Assert.assertEquals("do not add second title by content-disposition", "title",
      doc.getFieldValue("title"));
}
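The test verifies that MoreIndexingFilter derives the document title from the Content-Disposition response header when no title is set yet. The sketch below shows the kind of filename extraction being exercised; the regex and helper name are illustrative assumptions, not the actual MoreIndexingFilter code.

```java
import java.util.regex.Matcher;
import java.util.regex.Pattern;

class ContentDispositionSketch {
  // Illustrative pattern only; the real filter has its own parsing logic.
  private static final Pattern FILENAME =
      Pattern.compile("filename\\*?=\"?([^\";]+)\"?", Pattern.CASE_INSENSITIVE);

  /** Returns the filename from a Content-Disposition value, or null if absent. */
  static String filenameFrom(String contentDisposition) {
    if (contentDisposition == null) {
      return null;
    }
    Matcher m = FILENAME.matcher(contentDisposition);
    return m.find() ? m.group(1).trim() : null;
  }

  public static void main(String[] args) {
    // Mirrors the test fixture: "filename=filename.ext" yields "filename.ext".
    System.out.println(filenameFrom("filename=filename.ext"));
  }
}
```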
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class TestMoreIndexingFilter, method testNoParts.
/**
 * @since NUTCH-901
 */
@Test
public void testNoParts() {
  Configuration conf = NutchConfiguration.create();
  conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);
  MoreIndexingFilter filter = new MoreIndexingFilter();
  filter.setConf(conf);
  Assert.assertNotNull(filter);
  NutchDocument doc = new NutchDocument();
  ParseImpl parse = new ParseImpl("foo bar", new ParseData());
  try {
    filter.filter(doc, parse, new Text("http://nutch.apache.org/index.html"),
        new CrawlDatum(), new Inlinks());
  } catch (Exception e) {
    e.printStackTrace();
    Assert.fail(e.getMessage());
  }
  Assert.assertNotNull(doc);
  Assert.assertTrue(doc.getFieldNames().contains("type"));
  Assert.assertEquals(1, doc.getField("type").getValues().size());
  Assert.assertEquals("text/html", doc.getFieldValue("type"));
}
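With moreIndexingFilter.indexMimeTypeParts set to false, only the full MIME type is indexed, so the test expects exactly one value in the "type" field. The sketch below illustrates what indexing the MIME-type parts would add when the property is enabled; it is an assumption about the behavior under test, not the filter's actual implementation.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

class MimePartsSketch {
  /** Returns the values a "type" field might hold for a given MIME type. */
  static List<String> typeValues(String mimeType, boolean indexParts) {
    List<String> values = new ArrayList<>();
    values.add(mimeType);
    if (indexParts) {
      // Split "text/html" into its primary type and subtype.
      values.addAll(Arrays.asList(mimeType.split("/", 2)));
    }
    return values;
  }

  public static void main(String[] args) {
    // With indexMimeTypeParts=false only one value remains, matching the test.
    System.out.println(typeValues("text/html", false)); // [text/html]
    System.out.println(typeValues("text/html", true));  // [text/html, text, html]
  }
}
```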
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class HttpRobotRulesParser, method getRobotRulesSet.
/**
 * Get the rules from robots.txt that apply to the given {@code url}. Robot
 * rules are cached for a unique combination of host, protocol, and port. If
 * no rules are found in the cache, an HTTP request is sent to fetch
 * {@code protocol://host:port/robots.txt}. The robots.txt file is then parsed
 * and the rules are cached to avoid re-fetching and re-parsing.
 *
 * @param http
 *          The {@link Protocol} object
 * @param url
 *          URL to check
 * @param robotsTxtContent
 *          container to store responses when fetching the robots.txt file for
 *          debugging or archival purposes. Instead of a robots.txt file, it
 *          may include redirects or an error page (404, etc.). Response
 *          {@link Content} is appended to the passed list. If null is passed,
 *          nothing is stored.
 *
 * @return a {@link BaseRobotRules} object holding the rules
 */
@Override
public BaseRobotRules getRobotRulesSet(Protocol http, URL url,
    List<Content> robotsTxtContent) {
  if (LOG.isTraceEnabled() && isWhiteListed(url)) {
    LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
  }

  String cacheKey = getCacheKey(url);
  BaseRobotRules robotRules = CACHE.get(cacheKey);
  if (robotRules != null) {
    // cached rule
    return robotRules;
  } else if (LOG.isTraceEnabled()) {
    LOG.trace("cache miss " + url);
  }

  boolean cacheRule = true;
  URL redir = null;

  if (isWhiteListed(url)) {
    // check in advance whether a host is whitelisted
    // (we do not need to fetch robots.txt)
    robotRules = EMPTY_RULES;
    LOG.info("Whitelisted host found for: {}", url);
    LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}",
        url.getHost());
  } else {
    try {
      URL robotsUrl = new URL(url, "/robots.txt");
      Response response = ((HttpBase) http).getResponse(robotsUrl,
          new CrawlDatum(), true);
      if (robotsTxtContent != null) {
        addRobotsContent(robotsTxtContent, robotsUrl, response);
      }
      // try one level of redirection ?
      if (response.getCode() == 301 || response.getCode() == 302) {
        String redirection = response.getHeader("Location");
        if (redirection == null) {
          // some versions of MS IIS are known to mangle this header
          redirection = response.getHeader("location");
        }
        if (redirection != null) {
          if (!redirection.startsWith("http")) {
            // RFC says it should be absolute, but apparently it isn't
            redir = new URL(url, redirection);
          } else {
            redir = new URL(redirection);
          }
          response = ((HttpBase) http).getResponse(redir, new CrawlDatum(), true);
          if (robotsTxtContent != null) {
            addRobotsContent(robotsTxtContent, redir, response);
          }
        }
      }

      if (response.getCode() == 200) {
        // found rules: parse them
        robotRules = parseRules(url.toString(), response.getContent(),
            response.getHeader("Content-Type"), agentNames);
      } else if ((response.getCode() == 403) && (!allowForbidden)) {
        // use forbid all
        robotRules = FORBID_ALL_RULES;
      } else if (response.getCode() >= 500) {
        // try again later to fetch robots.txt
        cacheRule = false;
        robotRules = EMPTY_RULES;
      } else {
        // use default rules
        robotRules = EMPTY_RULES;
      }
    } catch (Throwable t) {
      if (LOG.isInfoEnabled()) {
        LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
      }
      // try again later to fetch robots.txt
      cacheRule = false;
      robotRules = EMPTY_RULES;
    }
  }

  if (cacheRule) {
    // cache rules for host
    CACHE.put(cacheKey, robotRules);
    if (redir != null && !redir.getHost().equalsIgnoreCase(url.getHost())) {
      // cache also for the redirected host
      CACHE.put(getCacheKey(redir), robotRules);
    }
  }
  return robotRules;
}
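The cache key combines protocol, host, and port so that rules fetched once are reused for every URL of the same authority. A minimal sketch of such a key is shown below; the exact format produced by getCacheKey is an assumption here, not the actual Nutch implementation.

```java
import java.net.URL;
import java.util.Locale;

class RobotsCacheKeySketch {
  /**
   * Builds a "protocol:host:port" key; the real getCacheKey may differ in
   * formatting, but the idea of one key per authority is the same.
   */
  static String cacheKey(URL url) {
    String protocol = url.getProtocol().toLowerCase(Locale.ROOT);
    String host = url.getHost().toLowerCase(Locale.ROOT);
    int port = url.getPort() == -1 ? url.getDefaultPort() : url.getPort();
    return protocol + ":" + host + ":" + port;
  }

  public static void main(String[] args) throws Exception {
    // Both URLs map to the same key, so robots.txt is fetched only once.
    System.out.println(cacheKey(new URL("http://nutch.apache.org/index.html")));
    System.out.println(cacheKey(new URL("http://nutch.apache.org/robots.txt")));
  }
}
```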
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class TestProtocolHttp, method fetchPage.
/**
 * Fetches the specified <code>page</code> from the local Jetty server and
 * checks whether the HTTP response status code matches the expected code.
 * JSP pages are also used to exercise redirection.
 *
 * @param page
 *          Page to be fetched.
 * @param expectedCode
 *          HTTP response status code expected while fetching the page.
 */
private void fetchPage(String page, int expectedCode) throws Exception {
  URL url = new URL("http", "127.0.0.1", port, page);
  CrawlDatum crawlDatum = new CrawlDatum();
  Response response = http.getResponse(url, crawlDatum, true);
  ProtocolOutput out = http.getProtocolOutput(new Text(url.toString()), crawlDatum);
  Content content = out.getContent();
  assertEquals("HTTP Status Code for " + url, expectedCode, response.getCode());
  if (page.compareTo("/nonexists.html") != 0
      && page.compareTo("/brokenpage.jsp") != 0
      && page.compareTo("/redirection") != 0) {
    assertEquals("ContentType " + url, "text/html", content.getContentType());
  }
}
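A hypothetical caller inside the same test class might look like the following; it assumes the embedded Jetty server has been started in the test setup, and the page paths and test name here are illustrative, not necessarily the actual test resources.

```java
// Hypothetical test method: page names depend on the resources deployed to
// the local Jetty server started before the test runs.
@Test
public void testStatusCodes() throws Exception {
  fetchPage("/index.html", 200);      // existing page returns 200
  fetchPage("/nonexists.html", 404);  // missing page returns 404
}
```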
Use of org.apache.nutch.crawl.CrawlDatum in project nutch by apache.
The class TestProtocolHttpClient, method fetchPage.
/**
 * Fetches the specified <code>page</code> from the local Jetty server and
 * checks whether the HTTP response status code matches the expected code.
 *
 * @param page
 *          Page to be fetched.
 * @param expectedCode
 *          HTTP response status code expected while fetching the page.
 * @throws Exception
 *           When an error occurs or the test case fails.
 */
private void fetchPage(String page, int expectedCode) throws Exception {
  URL url = new URL("http", "127.0.0.1", port, page);
  Response response = http.getResponse(url, new CrawlDatum(), true);
  int code = response.getCode();
  Assert.assertEquals("HTTP Status Code for " + url, expectedCode, code);
}
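Because a fresh CrawlDatum is passed per request, each call is an independent fetch against the local server. A hypothetical caller, with assumed page paths, might check that a protected page is rejected before credentials are configured and accepted afterwards; this is a sketch, not the actual test code.

```java
// Hypothetical usage: "/basic/protected.html" is an assumed path, and the
// credential setup step is elided because it depends on the client config.
@Test
public void testProtectedPage() throws Exception {
  fetchPage("/basic/protected.html", 401);  // no credentials configured yet
  // ... configure credentials on the http client here ...
  fetchPage("/basic/protected.html", 200);  // credentials accepted
}
```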