Use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.
The class AbstractCrawlerClientTest, method test_checkMaxContentLength_1m.
public void test_checkMaxContentLength_1m() {
    AbstractCrawlerClient client = new AbstractCrawlerClient() {
    };
    ResponseData responseData = new ResponseData();
    responseData.setUrl("http://test.com/");
    client.setMaxContentLength(1000000L);
    // no content length set yet: the check passes
    client.checkMaxContentLength(responseData);
    // an unknown length (-1) also passes
    responseData.setContentLength(-1);
    client.checkMaxContentLength(responseData);
    // lengths up to and including the 1,000,000-byte limit pass
    responseData.setContentLength(1000L);
    client.checkMaxContentLength(responseData);
    responseData.setContentLength(1000000L);
    client.checkMaxContentLength(responseData);
    // anything over the limit must throw
    responseData.setContentLength(1000001L);
    try {
        client.checkMaxContentLength(responseData);
        fail();
    } catch (MaxLengthExceededException e) {
        // ok
    }
    responseData.setContentLength(1000000000L);
    try {
        client.checkMaxContentLength(responseData);
        fail();
    } catch (MaxLengthExceededException e) {
        // ok
    }
    responseData.setContentLength(1000000000000L);
    try {
        client.checkMaxContentLength(responseData);
        fail();
    } catch (MaxLengthExceededException e) {
        // ok
    }
}
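For reference, the check this test exercises can be summarized in a minimal sketch inferred from the assertions above (illustrative only, not the project's actual implementation): a negative or unset content length passes, values up to the configured limit pass, and anything larger throws MaxLengthExceededException.

// Hypothetical sketch of checkMaxContentLength, inferred from the test above;
// maxContentLength stands for the Long set via setMaxContentLength(...).
public void checkMaxContentLength(final ResponseData responseData) {
    final long contentLength = responseData.getContentLength();
    if (maxContentLength != null && contentLength > maxContentLength.longValue()) {
        throw new MaxLengthExceededException("The content length (" + contentLength
                + " byte) is over " + maxContentLength + " byte. The url is "
                + responseData.getUrl());
    }
}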
Use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.
The class TarExtractorTest, method test_getText_maxSize.
public void test_getText_maxSize() throws IOException {
    try (final InputStream in = ResourceUtil.getResourceAsStream("extractor/tar/test.tar")) {
        tarExtractor.setMaxContentSize(100);
        tarExtractor.getText(in, null);
        fail();
    } catch (MaxLengthExceededException e) {
        // pass
    }
    // reset the limit so other tests are unaffected (-1 disables the check)
    tarExtractor.setMaxContentSize(-1);
}
Use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.
The class ZipExtractorTest, method test_getText_maxSize.
public void test_getText_maxSize() throws IOException {
    try (final InputStream in = ResourceUtil.getResourceAsStream("extractor/zip/test.zip")) {
        zipExtractor.setMaxContentSize(100);
        zipExtractor.getText(in, null);
        fail();
    } catch (MaxLengthExceededException e) {
        // pass
    }
    // reset the limit so other tests are unaffected (-1 disables the check)
    zipExtractor.setMaxContentSize(-1);
}
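The tar and zip tests above share one pattern: cap the extractable size, expect MaxLengthExceededException, then reset the cap (a negative value disables it). How such a cap might be enforced while draining archive entries, as a hedged sketch (the field and method names here are assumptions, not the project's code):

// Illustrative only: enforce a byte budget while copying an archive entry.
private long extractedSize = 0;

void copyWithLimit(final InputStream in, final OutputStream out,
        final long maxContentSize) throws IOException {
    final byte[] buf = new byte[4096];
    int n;
    while ((n = in.read(buf)) != -1) {
        extractedSize += n;
        // a negative maxContentSize disables the check, matching
        // setMaxContentSize(-1) in the tests above
        if (maxContentSize >= 0 && extractedSize > maxContentSize) {
            throw new MaxLengthExceededException(
                    "Extracted size " + extractedSize + " is over " + maxContentSize + " byte.");
        }
        out.write(buf, 0, n);
    }
}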
Use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.
The class FileSystemClient, method getResponseData.
protected ResponseData getResponseData(final String uri, final boolean includeContent) {
    final ResponseData responseData = new ResponseData();
    try {
        responseData.setMethod(Constants.GET_METHOD);
        final String filePath = preprocessUri(uri);
        responseData.setUrl(filePath);
        File file = null;
        try {
            file = new File(new URI(filePath));
        } catch (final URISyntaxException e) {
            logger.warn("Could not parse url: " + filePath, e);
        }
        if (file == null) {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        } else if (file.isFile()) {
            // check the file size against the configured maximum
            responseData.setContentLength(file.length());
            checkMaxContentLength(responseData);
            try {
                final FileOwnerAttributeView ownerAttrView =
                        Files.getFileAttributeView(file.toPath(), FileOwnerAttributeView.class);
                if (ownerAttrView != null) {
                    UserPrincipal owner = ownerAttrView.getOwner();
                    if (owner != null) {
                        responseData.addMetaData(FS_FILE_USER, owner.getName());
                    }
                }
            } catch (Exception e) {
                logger.warn("Failed to parse FileOwnerAttributeView.", e);
            }
            try {
                final AclFileAttributeView aclView =
                        Files.getFileAttributeView(file.toPath(), AclFileAttributeView.class);
                if (aclView != null) {
                    responseData.addMetaData(FILE_ATTRIBUTE_VIEW, aclView);
                    responseData.addMetaData(FS_FILE_GROUPS,
                            aclView.getAcl().stream().map(acl -> acl.principal().getName()).toArray(n -> new String[n]));
                }
            } catch (Exception e) {
                logger.warn("Failed to parse AclFileAttributeView.", e);
            }
            try {
                final PosixFileAttributeView posixView =
                        Files.getFileAttributeView(file.toPath(), PosixFileAttributeView.class);
                if (posixView != null) {
                    responseData.addMetaData(FILE_ATTRIBUTE_VIEW, posixView);
                    responseData.addMetaData(FS_FILE_GROUPS,
                            new String[] { posixView.readAttributes().group().getName() });
                }
            } catch (Exception e) {
                logger.warn("Failed to parse PosixFileAttributeView.", e);
            }
            responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
            responseData.setCharSet(geCharSet(file));
            responseData.setLastModified(new Date(file.lastModified()));
            if (file.canRead()) {
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                try (final InputStream is = new BufferedInputStream(new FileInputStream(file))) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                } catch (final Exception e) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                }
                if (contentLengthHelper != null) {
                    // a per-MIME-type limit may be stricter than the global one
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length ("
                                + responseData.getContentLength() + " byte) is over " + maxLength
                                + " byte. The url is " + filePath);
                    }
                }
                if (includeContent) {
                    if (file.length() < maxCachedContentSize) {
                        try (InputStream contentStream = new BufferedInputStream(new FileInputStream(file))) {
                            responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                        } catch (final Exception e) {
                            logger.warn("I/O Exception.", e);
                            responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
                        }
                    } else {
                        // too large to cache in memory: hand over the file itself
                        responseData.setResponseBody(file, false);
                    }
                }
            } else {
                // unreadable file: forbidden
                responseData.setHttpStatusCode(Constants.FORBIDDEN_STATUS_CODE);
                responseData.setMimeType(APPLICATION_OCTET_STREAM);
            }
        } else if (file.isDirectory()) {
            final Set<RequestData> requestDataSet = new HashSet<>();
            if (includeContent) {
                final File[] files = file.listFiles();
                if (files != null) {
                    for (final File f : files) {
                        final String childUri = f.toURI().toASCIIString();
                        requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                    }
                }
            }
            // a directory is not crawled itself; its children are queued instead
            throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
        } else {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        }
    } catch (final CrawlerSystemException e) {
        CloseableUtil.closeQuietly(responseData);
        throw e;
    } catch (final Exception e) {
        CloseableUtil.closeQuietly(responseData);
        throw new CrawlingAccessException("Could not access " + uri, e);
    }
    return responseData;
}
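From the caller's side, the size checks above surface as an exception from the client rather than a partial response. A hypothetical usage sketch (the limit, URL, and handling policy are assumptions; execute(RequestData) is the CrawlerClient entry point and ResponseData is Closeable):

// Hypothetical caller: skip oversized files and keep crawling.
fsClient.setMaxContentLength(1000000L);
try (final ResponseData data = fsClient.execute(
        RequestDataBuilder.newRequestData().get().url("file:///tmp/big.bin").build())) {
    // process the response normally
} catch (final MaxLengthExceededException e) {
    logger.warn("Skipped: " + e.getMessage());
}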
Use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.
The class HcHttpClient, method processRobotsTxt.
protected void processRobotsTxt(final String url) {
    if (StringUtil.isBlank(url)) {
        throw new CrawlerSystemException("url is null or empty.");
    }
    if (robotsTxtHelper == null || !robotsTxtHelper.isEnabled()) {
        // robots.txt is not supported
        return;
    }
    // crawler context
    final CrawlerContext crawlerContext = CrawlingParameterUtil.getCrawlerContext();
    if (crawlerContext == null) {
        // wrong state
        return;
    }
    final int idx = url.indexOf('/', url.indexOf("://") + 3);
    String hostUrl;
    if (idx >= 0) {
        hostUrl = url.substring(0, idx);
    } else {
        hostUrl = url;
    }
    final String robotTxtUrl = hostUrl + "/robots.txt";
    // skip if this robots.txt has already been processed
    if (crawlerContext.getRobotsTxtUrlSet().contains(robotTxtUrl)) {
        if (logger.isDebugEnabled()) {
            logger.debug(robotTxtUrl + " is already visited.");
        }
        return;
    }
    if (logger.isInfoEnabled()) {
        logger.info("Checking URL: " + robotTxtUrl);
    }
    // mark this robots.txt as visited
    crawlerContext.getRobotsTxtUrlSet().add(robotTxtUrl);
    final HttpGet httpGet = new HttpGet(robotTxtUrl);
    // request header
    for (final Header header : requestHeaderList) {
        httpGet.addHeader(header);
    }
    HttpEntity httpEntity = null;
    try {
        // fetch the content
        final HttpResponse response = executeHttpClient(httpGet);
        httpEntity = response.getEntity();
        final int httpStatusCode = response.getStatusLine().getStatusCode();
        if (httpStatusCode == 200) {
            // check the declared file size before parsing
            final Header contentLengthHeader = response.getFirstHeader("Content-Length");
            if (contentLengthHeader != null) {
                final String value = contentLengthHeader.getValue();
                final long contentLength = Long.parseLong(value);
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength("text/plain");
                    if (contentLength > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + contentLength
                                + " byte) is over " + maxLength + " byte. The url is " + robotTxtUrl);
                    }
                }
            }
            if (httpEntity != null) {
                final RobotsTxt robotsTxt = robotsTxtHelper.parse(httpEntity.getContent());
                if (robotsTxt != null) {
                    final String[] sitemaps = robotsTxt.getSitemaps();
                    if (sitemaps.length > 0) {
                        crawlerContext.addSitemaps(sitemaps);
                    }
                    final RobotsTxt.Directive directive = robotsTxt.getMatchedDirective(userAgent);
                    if (directive != null) {
                        if (useRobotsTxtDisallows) {
                            for (String urlPattern : directive.getDisallows()) {
                                if (StringUtil.isNotBlank(urlPattern)) {
                                    urlPattern = convertRobotsTxtPathPattern(urlPattern);
                                    crawlerContext.getUrlFilter().addExclude(hostUrl + urlPattern);
                                }
                            }
                        }
                        if (useRobotsTxtAllows) {
                            for (String urlPattern : directive.getAllows()) {
                                if (StringUtil.isNotBlank(urlPattern)) {
                                    urlPattern = convertRobotsTxtPathPattern(urlPattern);
                                    crawlerContext.getUrlFilter().addInclude(hostUrl + urlPattern);
                                }
                            }
                        }
                    }
                }
            }
        }
    } catch (final CrawlerSystemException e) {
        httpGet.abort();
        throw e;
    } catch (final Exception e) {
        httpGet.abort();
        throw new CrawlingAccessException("Could not process " + robotTxtUrl + ". ", e);
    } finally {
        EntityUtils.consumeQuietly(httpEntity);
    }
}
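convertRobotsTxtPathPattern turns a robots.txt path into a pattern the URL filter can use. A plausible sketch of that conversion (the project's actual rules may differ): escape literal dots, expand '*' wildcards, and treat a trailing '$' as an end anchor.

// Illustrative conversion of a robots.txt path to a regex fragment.
protected String convertRobotsTxtPathPattern(final String path) {
    // '.' is literal in robots.txt but a metacharacter in regex
    String pattern = path.replace(".", "\\.").replace("*", ".*");
    // without an explicit '$' anchor, a robots.txt path matches any suffix
    if (!pattern.endsWith("$") && !pattern.endsWith(".*")) {
        pattern = pattern + ".*";
    }
    return pattern;
}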