Use of org.codelibs.fess.crawler.exception.RobotsTxtException in the project fess-crawler by codelibs.
The parse method of the class RobotsTxtHelper.
/**
 * Parses robots.txt content into a {@link RobotsTxt} model.
 *
 * <p>The input is read line by line; comments are stripped and empty lines are skipped.
 * Consecutive user-agent records form one group, and the allow, disallow and crawl-delay
 * records that follow are applied to every directive in that group. When a later group names
 * a user-agent that already has a directive, its rules are merged into that existing directive
 * instead of being dropped. Sitemap records are added to the result independently of the
 * current group.</p>
 *
 * <p>This method does not close {@code stream}.</p>
 *
 * @param stream the robots.txt content
 * @param charsetName the name of the charset used to decode {@code stream}
 * @return the parsed robots.txt, or {@code null} when this helper is disabled
 * @throws RobotsTxtException if any exception occurs while reading or parsing
 */
public RobotsTxt parse(final InputStream stream, final String charsetName) {
    if (!enabled) {
        return null;
    }
    try {
        // The reader is intentionally left open: closing it would close the caller's stream.
        @SuppressWarnings("resource")
        final BufferedReader reader = new BufferedReader(new InputStreamReader(new BOMInputStream(stream), charsetName));
        String line;
        final RobotsTxt robotsTxt = new RobotsTxt();
        // Directives of the group currently being read; rule records apply to all of them.
        final List<Directive> currentDirectiveList = new ArrayList<>();
        // True once a non-user-agent record has been seen after the group's user-agent lines.
        boolean isGroupRecordStarted = false;
        while ((line = reader.readLine()) != null) {
            line = stripComment(line).trim();
            if (StringUtil.isEmpty(line)) {
                continue;
            }
            String value;
            if ((value = getValue(USER_AGENT_RECORD, line)) != null) {
                if (isGroupRecordStarted) {
                    // A user-agent line after rule lines starts a new group.
                    currentDirectiveList.clear();
                    isGroupRecordStarted = false;
                }
                final String userAgent = value.toLowerCase(Locale.ENGLISH);
                Directive currentDirective = robotsTxt.getDirective(userAgent);
                if (currentDirective == null) {
                    currentDirective = new Directive(userAgent);
                    robotsTxt.addDirective(currentDirective);
                }
                // Always attach the directive to the current group, even when it already
                // existed from an earlier group; otherwise the rules of a repeated
                // user-agent group would be silently discarded. The contains() check avoids
                // applying each rule twice when the same agent is listed twice in one group.
                if (!currentDirectiveList.contains(currentDirective)) {
                    currentDirectiveList.add(currentDirective);
                }
            } else {
                isGroupRecordStarted = true;
                if ((value = getValue(DISALLOW_RECORD, line)) != null) {
                    // Empty values are skipped, so a bare "Disallow:" adds no rule.
                    if (!currentDirectiveList.isEmpty() && value.length() > 0) {
                        for (final Directive directive : currentDirectiveList) {
                            directive.addDisallow(value);
                        }
                    }
                } else if ((value = getValue(ALLOW_RECORD, line)) != null) {
                    if (!currentDirectiveList.isEmpty() && value.length() > 0) {
                        for (final Directive directive : currentDirectiveList) {
                            directive.addAllow(value);
                        }
                    }
                } else if ((value = getValue(CRAWL_DELAY_RECORD, line)) != null) {
                    if (!currentDirectiveList.isEmpty()) {
                        try {
                            // Negative delays are clamped to zero.
                            final int crawlDelay = Math.max(0, Integer.parseInt(value));
                            for (final Directive directive : currentDirectiveList) {
                                directive.setCrawlDelay(crawlDelay);
                            }
                        } catch (final NumberFormatException ignored) {
                            // A non-integer crawl delay is skipped rather than failing the whole parse.
                        }
                    }
                } else if ((value = getValue(SITEMAP_RECORD, line)) != null) {
                    if (value.length() > 0) {
                        robotsTxt.addSitemap(value);
                    }
                }
            }
        }
        return robotsTxt;
    } catch (final Exception e) {
        throw new RobotsTxtException("Failed to parse robots.txt.", e);
    }
}
Aggregations