Use of org.codelibs.fess.crawler.entity.RobotsTxt in project fess-crawler by codelibs.
In class HcHttpClient, the method processRobotsTxt:
protected void processRobotsTxt(final String url) {
    if (StringUtil.isBlank(url)) {
        throw new CrawlerSystemException("url is null or empty.");
    }

    if (robotsTxtHelper == null || !robotsTxtHelper.isEnabled()) {
        // not support robots.txt
        return;
    }

    // crawler context
    final CrawlerContext crawlerContext = CrawlingParameterUtil.getCrawlerContext();
    if (crawlerContext == null) {
        // wrong state
        return;
    }

    final int idx = url.indexOf('/', url.indexOf("://") + 3);
    String hostUrl;
    if (idx >= 0) {
        hostUrl = url.substring(0, idx);
    } else {
        hostUrl = url;
    }
    final String robotTxtUrl = hostUrl + "/robots.txt";

    // check url
    if (crawlerContext.getRobotsTxtUrlSet().contains(robotTxtUrl)) {
        if (logger.isDebugEnabled()) {
            logger.debug(robotTxtUrl + " is already visited.");
        }
        return;
    }

    if (logger.isInfoEnabled()) {
        logger.info("Checking URL: " + robotTxtUrl);
    }
    // add url to a set
    crawlerContext.getRobotsTxtUrlSet().add(robotTxtUrl);

    final HttpGet httpGet = new HttpGet(robotTxtUrl);

    // request header
    for (final Header header : requestHeaderList) {
        httpGet.addHeader(header);
    }

    HttpEntity httpEntity = null;
    try {
        // get a content
        final HttpResponse response = executeHttpClient(httpGet);
        httpEntity = response.getEntity();

        final int httpStatusCode = response.getStatusLine().getStatusCode();
        if (httpStatusCode == 200) {
            // check file size
            final Header contentLengthHeader = response.getFirstHeader("Content-Length");
            if (contentLengthHeader != null) {
                final String value = contentLengthHeader.getValue();
                final long contentLength = Long.parseLong(value);
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength("text/plain");
                    if (contentLength > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + contentLength + " byte) is over " + maxLength
                                + " byte. The url is " + robotTxtUrl);
                    }
                }
            }

            if (httpEntity != null) {
                final RobotsTxt robotsTxt = robotsTxtHelper.parse(httpEntity.getContent());
                if (robotsTxt != null) {
                    final String[] sitemaps = robotsTxt.getSitemaps();
                    if (sitemaps.length > 0) {
                        crawlerContext.addSitemaps(sitemaps);
                    }

                    final RobotsTxt.Directive directive = robotsTxt.getMatchedDirective(userAgent);
                    if (directive != null) {
                        if (useRobotsTxtDisallows) {
                            for (String urlPattern : directive.getDisallows()) {
                                if (StringUtil.isNotBlank(urlPattern)) {
                                    urlPattern = convertRobotsTxtPathPattern(urlPattern);
                                    crawlerContext.getUrlFilter().addExclude(hostUrl + urlPattern);
                                }
                            }
                        }
                        if (useRobotsTxtAllows) {
                            for (String urlPattern : directive.getAllows()) {
                                if (StringUtil.isNotBlank(urlPattern)) {
                                    urlPattern = convertRobotsTxtPathPattern(urlPattern);
                                    crawlerContext.getUrlFilter().addInclude(hostUrl + urlPattern);
                                }
                            }
                        }
                    }
                }
            }
        }
    } catch (final CrawlerSystemException e) {
        httpGet.abort();
        throw e;
    } catch (final Exception e) {
        httpGet.abort();
        throw new CrawlingAccessException("Could not process " + robotTxtUrl + ". ", e);
    } finally {
        EntityUtils.consumeQuietly(httpEntity);
    }
}
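The method above delegates to convertRobotsTxtPathPattern(...) to turn a robots.txt path rule (which may use '*' wildcards and a trailing '$' anchor) into a pattern accepted by the URL filter. That helper is not shown on this page; the following is only a minimal sketch of such a conversion under standard robots.txt wildcard semantics, not the actual HcHttpClient implementation, which may differ.

// Hypothetical sketch only - not the actual HcHttpClient#convertRobotsTxtPathPattern.
protected String convertRobotsTxtPathPattern(final String path) {
    // Treat '.' literally and expand the robots.txt wildcard '*' to the regex '.*'.
    String pattern = path.replace(".", "\\.").replace("*", ".*");
    // A trailing '$' already anchors the rule at the end of the URL; otherwise
    // the rule applies to every URL that merely starts with the given path.
    if (!pattern.endsWith("$") && !pattern.endsWith(".*")) {
        pattern = pattern + ".*";
    }
    return pattern;
}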
Use of org.codelibs.fess.crawler.entity.RobotsTxt in project fess-crawler by codelibs.
In class RobotsTxtHelper, the method parse:
public RobotsTxt parse(final InputStream stream, final String charsetName) {
    if (!enabled) {
        return null;
    }
    try {
        @SuppressWarnings("resource")
        final BufferedReader reader = new BufferedReader(new InputStreamReader(new BOMInputStream(stream), charsetName));

        String line;
        final RobotsTxt robotsTxt = new RobotsTxt();
        final List<Directive> currentDirectiveList = new ArrayList<>();
        boolean isGroupRecodeStarted = false;

        while ((line = reader.readLine()) != null) {
            line = stripComment(line).trim();
            if (StringUtil.isEmpty(line)) {
                continue;
            }

            String value;
            if ((value = getValue(USER_AGENT_RECORD, line)) != null) {
                if (isGroupRecodeStarted) {
                    currentDirectiveList.clear();
                    isGroupRecodeStarted = false;
                }
                final String userAgent = value.toLowerCase(Locale.ENGLISH);
                Directive currentDirective = robotsTxt.getDirective(userAgent);
                if (currentDirective == null) {
                    currentDirective = new Directive(userAgent);
                    robotsTxt.addDirective(currentDirective);
                    currentDirectiveList.add(currentDirective);
                }
            } else {
                isGroupRecodeStarted = true;
                if ((value = getValue(DISALLOW_RECORD, line)) != null) {
                    if (!currentDirectiveList.isEmpty() && value.length() > 0) {
                        for (final Directive directive : currentDirectiveList) {
                            directive.addDisallow(value);
                        }
                    }
                } else if ((value = getValue(ALLOW_RECORD, line)) != null) {
                    if (!currentDirectiveList.isEmpty() && value.length() > 0) {
                        for (final Directive directive : currentDirectiveList) {
                            directive.addAllow(value);
                        }
                    }
                } else if ((value = getValue(CRAWL_DELAY_RECORD, line)) != null) {
                    if (!currentDirectiveList.isEmpty()) {
                        try {
                            final int crawlDelay = Integer.parseInt(value);
                            for (final Directive directive : currentDirectiveList) {
                                directive.setCrawlDelay(Math.max(0, crawlDelay));
                            }
                        } catch (final NumberFormatException e) {
                            // ignore
                        }
                    }
                } else if ((value = getValue(SITEMAP_RECORD, line)) != null) {
                    if (value.length() > 0) {
                        robotsTxt.addSitemap(value);
                    }
                }
            }
        }

        return robotsTxt;
    } catch (final Exception e) {
        throw new RobotsTxtException("Failed to parse robots.txt.", e);
    }
}
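A minimal usage sketch of this parser follows. It assumes RobotsTxtHelper can be instantiated directly and is enabled by default (in fess-crawler the helper is normally obtained from the DI container), and that it lives in the org.codelibs.fess.crawler.helper package; it only calls RobotsTxt methods that appear elsewhere on this page (allows, getCrawlDelay, getSitemaps).

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.codelibs.fess.crawler.entity.RobotsTxt;
import org.codelibs.fess.crawler.helper.RobotsTxtHelper;

public class RobotsTxtParseExample {
    public static void main(final String[] args) throws Exception {
        // Assumption: a directly instantiated RobotsTxtHelper is enabled by default.
        final RobotsTxtHelper robotsTxtHelper = new RobotsTxtHelper();
        final String content = "User-agent: *\n"
                + "Disallow: /private/\n"
                + "Crawl-delay: 5\n"
                + "Sitemap: http://www.example.com/sitemap.xml\n";
        try (InputStream in = new ByteArrayInputStream(content.getBytes(StandardCharsets.UTF_8))) {
            final RobotsTxt robotsTxt = robotsTxtHelper.parse(in, "UTF-8");
            // The '*' group applies to every user agent, so /private/ is disallowed.
            System.out.println(robotsTxt.allows("/private/index.html", "FessCrawler")); // false
            System.out.println(robotsTxt.getCrawlDelay("FessCrawler"));                 // 5
            System.out.println(robotsTxt.getSitemaps()[0]);
        }
    }
}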
Use of org.codelibs.fess.crawler.entity.RobotsTxt in project fess-crawler by codelibs.
In class RobotsTxtHelperTest, the method testParse:
public void testParse() {
    RobotsTxt robotsTxt;
    final InputStream in = RobotsTxtHelperTest.class.getResourceAsStream("robots.txt");
    try {
        robotsTxt = robotsTxtHelper.parse(in);
    } finally {
        CloseableUtil.closeQuietly(in);
    }

    for (String userAgent : new String[] { "FessCrawler", "FessCrawler/1.0", "Mozilla FessCrawler" }) {
        assertTrue(robotsTxt.allows("/aaa", userAgent));
        assertTrue(robotsTxt.allows("/private/", userAgent));
        assertTrue(robotsTxt.allows("/private/index.html", userAgent));
        assertTrue(robotsTxt.allows("/help/", userAgent));
        assertTrue(robotsTxt.allows("/help.html", userAgent));
        assertTrue(robotsTxt.allows("/help/faq.html", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/index.html", userAgent));
        assertEquals(0, robotsTxt.getCrawlDelay(userAgent));
    }

    for (String userAgent : new String[] { "BruteBot", "FOO BruteBot/1.0" }) {
        assertFalse(robotsTxt.allows("/aaa", userAgent));
        assertFalse(robotsTxt.allows("/private/", userAgent));
        assertFalse(robotsTxt.allows("/private/index.html", userAgent));
        assertFalse(robotsTxt.allows("/help/", userAgent));
        assertFalse(robotsTxt.allows("/help.html", userAgent));
        assertFalse(robotsTxt.allows("/help/faq.html", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/index.html", userAgent));
        assertEquals(1314000, robotsTxt.getCrawlDelay(userAgent));
    }

    for (String userAgent : new String[] { "GOOGLEBOT", "GoogleBot", "googlebot" }) {
        assertTrue(robotsTxt.allows("/aaa", userAgent));
        assertTrue(robotsTxt.allows("/private/", userAgent));
        assertTrue(robotsTxt.allows("/private/index.html", userAgent));
        assertTrue(robotsTxt.allows("/help/", userAgent));
        assertTrue(robotsTxt.allows("/help.html", userAgent));
        assertTrue(robotsTxt.allows("/help/faq.html", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/index.html", userAgent));
        assertEquals(1, robotsTxt.getCrawlDelay(userAgent));
    }

    for (String userAgent : new String[] { "UnknownBot", "", " ", null }) {
        assertTrue(robotsTxt.allows("/aaa", userAgent));
        assertFalse(robotsTxt.allows("/private/", userAgent));
        assertFalse(robotsTxt.allows("/private/index.html", userAgent));
        assertFalse(robotsTxt.allows("/help/", userAgent));
        assertFalse(robotsTxt.allows("/help.html", userAgent));
        assertTrue(robotsTxt.allows("/help/faq.html", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/", userAgent));
        assertTrue(robotsTxt.allows("/foo/bar/index.html", userAgent));
        assertEquals(3, robotsTxt.getCrawlDelay(userAgent));
    }

    assertFalse(robotsTxt.allows("/aaa", "Crawler"));
    assertTrue(robotsTxt.allows("/bbb", "Crawler"));
    assertTrue(robotsTxt.allows("/ccc", "Crawler"));
    assertTrue(robotsTxt.allows("/ddd", "Crawler"));

    assertTrue(robotsTxt.allows("/aaa", "Crawler/1.0"));
    assertFalse(robotsTxt.allows("/bbb", "Crawler/1.0"));
    assertTrue(robotsTxt.allows("/ccc", "Crawler/1.0"));
    assertTrue(robotsTxt.allows("/ddd", "Crawler/1.0"));

    assertTrue(robotsTxt.allows("/aaa", "Crawler/2.0"));
    assertTrue(robotsTxt.allows("/bbb", "Crawler/2.0"));
    assertFalse(robotsTxt.allows("/ccc", "Crawler/2.0"));
    assertTrue(robotsTxt.allows("/ddd", "Crawler/2.0"));

    assertTrue(robotsTxt.allows("/aaa", "Hoge Crawler"));
    assertTrue(robotsTxt.allows("/bbb", "Hoge Crawler"));
    assertTrue(robotsTxt.allows("/ccc", "Hoge Crawler"));
    assertFalse(robotsTxt.allows("/ddd", "Hoge Crawler"));

    String[] sitemaps = robotsTxt.getSitemaps();
    assertEquals(2, sitemaps.length);
    assertEquals("http://www.example.com/sitmap.xml", sitemaps[0]);
    assertEquals("http://www.example.net/sitmap.xml", sitemaps[1]);
}
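The robots.txt fixture that the test loads from the classpath is not reproduced on this page. The fragment below is purely illustrative (it is not the actual test resource) and shows the kind of grouping implied by the "Crawler" assertions: the group whose User-agent token gives the longest match against the crawler's user agent appears to take precedence, so "Crawler/1.0" is governed by its own group rather than by the shorter "Crawler" group.

# Illustrative fragment only - not the actual robots.txt used by the test.
User-agent: Crawler
Disallow: /aaa

User-agent: Crawler/1.0
Disallow: /bbb

User-agent: Crawler/2.0
Disallow: /ccc

User-agent: Hoge Crawler
Disallow: /ddd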