Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.
In class FtpClient, method getResponseData:
protected ResponseData getResponseData(final String uri, final boolean includeContent) {
    final ResponseData responseData = new ResponseData();
    FTPClient client = null;
    try {
        responseData.setMethod(Constants.GET_METHOD);
        final FtpInfo ftpInfo = new FtpInfo(uri);
        responseData.setUrl(ftpInfo.toUrl());
        client = getClient(ftpInfo);
        FTPFile file = null;
        client.changeWorkingDirectory(ftpInfo.getParent());
        validateRequest(client);
        if (ftpInfo.getName() == null) {
            // root directory: collect child URLs and report them via ChildUrlsException
            final Set<RequestData> requestDataSet = new HashSet<>();
            if (includeContent) {
                try {
                    final FTPFile[] files = client.listFiles(ftpInfo.getParent(), FTPFileFilters.NON_NULL);
                    validateRequest(client);
                    for (final FTPFile f : files) {
                        final String childUrl = ftpInfo.toChildUrl(f.getName());
                        requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUrl).build());
                    }
                } catch (final IOException e) {
                    disconnectInternalClient(client);
                    throw new CrawlingAccessException("Could not access " + uri, e);
                }
            }
            ftpClientQueue.offer(client);
            throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
        }
        final FTPFile[] files = client.listFiles(null, FTPFileFilters.NON_NULL);
        validateRequest(client);
        for (final FTPFile f : files) {
            if (ftpInfo.getName().equals(f.getName())) {
                file = f;
                break;
            }
        }
        updateResponseData(uri, includeContent, responseData, client, ftpInfo, file);
    } catch (final CrawlerSystemException e) {
        CloseableUtil.closeQuietly(responseData);
        throw e;
    } catch (final Exception e) {
        CloseableUtil.closeQuietly(responseData);
        throw new CrawlingAccessException("Could not access " + uri, e);
    }
    return responseData;
}
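
A caller of this method sees two failure channels: a ChildUrlsException when the URI turns out to be a directory (the discovered children ride along on the exception), and a CrawlingAccessException for I/O-level failures. A minimal caller-side sketch, assuming a CrawlerClient already obtained from a configured CrawlerClientFactory (FtpCrawlSketch and crawlOne are illustrative names, not part of fess-crawler):

import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.client.CrawlerClient;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;
import org.codelibs.fess.crawler.exception.CrawlingAccessException;

public final class FtpCrawlSketch {

    // client is assumed to be an FtpClient obtained from a configured CrawlerClientFactory
    static void crawlOne(final CrawlerClient client, final String url) {
        try (ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
            System.out.println(url + " -> " + responseData.getMimeType());
        } catch (final ChildUrlsException e) {
            // the URL resolved to a directory: its entries arrive on the exception
            for (final RequestData child : e.getChildUrlList()) {
                System.out.println("queue child: " + child.getUrl());
            }
        } catch (final CrawlingAccessException e) {
            // I/O-level failure: typically logged, then the URL is skipped or retried
            System.err.println("could not access " + url + ": " + e.getMessage());
        } catch (final Exception e) {
            // defensive catch for close() or runtime failures
            System.err.println("unexpected: " + e);
        }
    }
}
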
Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.
In class HcHttpClient, method processRobotsTxt:
protected void processRobotsTxt(final String url) {
    if (StringUtil.isBlank(url)) {
        throw new CrawlerSystemException("url is null or empty.");
    }
    if (robotsTxtHelper == null || !robotsTxtHelper.isEnabled()) {
        // robots.txt processing is disabled
        return;
    }
    // crawler context
    final CrawlerContext crawlerContext = CrawlingParameterUtil.getCrawlerContext();
    if (crawlerContext == null) {
        // wrong state
        return;
    }
    final int idx = url.indexOf('/', url.indexOf("://") + 3);
    String hostUrl;
    if (idx >= 0) {
        hostUrl = url.substring(0, idx);
    } else {
        hostUrl = url;
    }
    final String robotTxtUrl = hostUrl + "/robots.txt";
    // skip if this robots.txt has already been visited
    if (crawlerContext.getRobotsTxtUrlSet().contains(robotTxtUrl)) {
        if (logger.isDebugEnabled()) {
            logger.debug(robotTxtUrl + " is already visited.");
        }
        return;
    }
    if (logger.isInfoEnabled()) {
        logger.info("Checking URL: " + robotTxtUrl);
    }
    // add the url to the visited set
    crawlerContext.getRobotsTxtUrlSet().add(robotTxtUrl);
    final HttpGet httpGet = new HttpGet(robotTxtUrl);
    // request header
    for (final Header header : requestHeaderList) {
        httpGet.addHeader(header);
    }
    HttpEntity httpEntity = null;
    try {
        // get the content
        final HttpResponse response = executeHttpClient(httpGet);
        httpEntity = response.getEntity();
        final int httpStatusCode = response.getStatusLine().getStatusCode();
        if (httpStatusCode == 200) {
            // check the file size
            final Header contentLengthHeader = response.getFirstHeader("Content-Length");
            if (contentLengthHeader != null) {
                final String value = contentLengthHeader.getValue();
                final long contentLength = Long.parseLong(value);
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength("text/plain");
                    if (contentLength > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + contentLength + " byte) is over " + maxLength
                                + " byte. The url is " + robotTxtUrl);
                    }
                }
            }
            if (httpEntity != null) {
                final RobotsTxt robotsTxt = robotsTxtHelper.parse(httpEntity.getContent());
                if (robotsTxt != null) {
                    final String[] sitemaps = robotsTxt.getSitemaps();
                    if (sitemaps.length > 0) {
                        crawlerContext.addSitemaps(sitemaps);
                    }
                    final RobotsTxt.Directive directive = robotsTxt.getMatchedDirective(userAgent);
                    if (directive != null) {
                        if (useRobotsTxtDisallows) {
                            for (String urlPattern : directive.getDisallows()) {
                                if (StringUtil.isNotBlank(urlPattern)) {
                                    urlPattern = convertRobotsTxtPathPattern(urlPattern);
                                    crawlerContext.getUrlFilter().addExclude(hostUrl + urlPattern);
                                }
                            }
                        }
                        if (useRobotsTxtAllows) {
                            for (String urlPattern : directive.getAllows()) {
                                if (StringUtil.isNotBlank(urlPattern)) {
                                    urlPattern = convertRobotsTxtPathPattern(urlPattern);
                                    crawlerContext.getUrlFilter().addInclude(hostUrl + urlPattern);
                                }
                            }
                        }
                    }
                }
            }
        }
    } catch (final CrawlerSystemException e) {
        httpGet.abort();
        throw e;
    } catch (final Exception e) {
        httpGet.abort();
        throw new CrawlingAccessException("Could not process " + robotTxtUrl + ". ", e);
    } finally {
        EntityUtils.consumeQuietly(httpEntity);
    }
}
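
The Disallow and Allow values are glob-like robots.txt path patterns ('*' matches any run of characters, a trailing '$' anchors the end of the URL), while UrlFilter.addExclude/addInclude expect regular expressions, which is why each pattern passes through convertRobotsTxtPathPattern first. A standalone sketch of such a conversion (toRegexPattern is an illustrative name; the actual fess-crawler implementation may differ in detail):

// Illustrative conversion from a robots.txt path pattern to a regex fragment.
static String toRegexPattern(final String robotsPath) {
    // escape characters that are literal in robots.txt but special in regex
    String pattern = robotsPath.replace(".", "\\.").replace("?", "\\?");
    // '*' is the robots.txt wildcard; '$' already means end-of-URL in both notations
    pattern = pattern.replace("*", ".*");
    if (!pattern.endsWith("$") && !pattern.endsWith(".*")) {
        pattern = pattern + ".*"; // an unanchored rule matches any suffix
    }
    return pattern;
}

// toRegexPattern("/private/*.html$") -> "/private/.*\.html$"
// so "Disallow: /private/*.html$" on https://example.com becomes the
// exclude pattern "https://example.com/private/.*\.html$"
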
Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.
In class HcHttpClient, method processHttpMethod:
protected ResponseData processHttpMethod(final String url, final HttpUriRequest httpRequest) {
    try {
        processRobotsTxt(url);
    } catch (final CrawlingAccessException e) {
        if (logger.isInfoEnabled()) {
            final StringBuilder buf = new StringBuilder(100);
            buf.append(e.getMessage());
            if (e.getCause() != null) {
                buf.append(e.getCause().getMessage());
            }
            logger.info(buf.toString());
        } else if (logger.isDebugEnabled()) {
            logger.debug("Crawling Access Exception at " + url, e);
        }
    }
    // request header
    for (final Header header : requestHeaderList) {
        httpRequest.addHeader(header);
    }
    ResponseData responseData = new ResponseData();
    HttpEntity httpEntity = null;
    try {
        // get the content
        final HttpResponse response = executeHttpClient(httpRequest);
        httpEntity = response.getEntity();
        final int httpStatusCode = response.getStatusLine().getStatusCode();
        // redirect
        if (isRedirectHttpStatus(httpStatusCode)) {
            final Header locationHeader = response.getFirstHeader("location");
            if (locationHeader == null) {
                logger.warn("Invalid redirect location at " + url);
            } else {
                final String redirectLocation;
                if (locationHeader.getValue().startsWith("/")) {
                    redirectLocation = buildRedirectLocation(url, locationHeader.getValue());
                } else {
                    redirectLocation = locationHeader.getValue();
                }
                responseData = new ResponseData();
                responseData.setRedirectLocation(redirectLocation);
                return responseData;
            }
        }
        String contentType = null;
        final Header contentTypeHeader = response.getFirstHeader("Content-Type");
        if (contentTypeHeader != null) {
            contentType = contentTypeHeader.getValue();
            final int idx = contentType.indexOf(';');
            if (idx > 0) {
                contentType = contentType.substring(0, idx);
                if (APPLICATION_OCTET_STREAM.equals(contentType)) {
                    contentType = null;
                }
            }
        }
        long contentLength = 0;
        String contentEncoding = Constants.UTF_8;
        if (httpEntity == null) {
            responseData.setResponseBody(new byte[0]);
            if (contentType == null) {
                contentType = defaultMimeType;
            }
        } else {
            final InputStream responseBodyStream = httpEntity.getContent();
            final File outputFile = File.createTempFile("crawler-HcHttpClient-", ".out");
            DeferredFileOutputStream dfos = null;
            try {
                try {
                    dfos = new DeferredFileOutputStream((int) maxCachedContentSize, outputFile);
                    CopyUtil.copy(responseBodyStream, dfos);
                    dfos.flush();
                } finally {
                    CloseableUtil.closeQuietly(dfos);
                }
            } catch (final Exception e) {
                if (!outputFile.delete()) {
                    logger.warn("Could not delete " + outputFile.getAbsolutePath());
                }
                throw e;
            }
            if (dfos.isInMemory()) {
                responseData.setResponseBody(dfos.getData());
                contentLength = dfos.getData().length;
                if (!outputFile.delete()) {
                    logger.warn("Could not delete " + outputFile.getAbsolutePath());
                }
                if (contentType == null) {
                    try (InputStream is = new ByteArrayInputStream(dfos.getData())) {
                        contentType = mimeTypeHelper.getContentType(is, url);
                    } catch (final Exception e) {
                        logger.debug("Failed to detect mime-type.", e);
                        contentType = defaultMimeType;
                    }
                }
            } else {
                responseData.setResponseBody(outputFile, true);
                contentLength = outputFile.length();
                if (contentType == null) {
                    try (InputStream is = new FileInputStream(outputFile)) {
                        contentType = mimeTypeHelper.getContentType(is, url);
                    } catch (final Exception e) {
                        logger.debug("Failed to detect mime-type.", e);
                        contentType = defaultMimeType;
                    }
                }
            }
            final Header contentEncodingHeader = httpEntity.getContentEncoding();
            if (contentEncodingHeader != null) {
                contentEncoding = contentEncodingHeader.getValue();
            }
        }
        // check the file size
        if (contentLengthHelper != null) {
            final long maxLength = contentLengthHelper.getMaxLength(contentType);
            if (contentLength > maxLength) {
                throw new MaxLengthExceededException(
                        "The content length (" + contentLength + " byte) is over " + maxLength + " byte. The url is " + url);
            }
        }
        responseData.setUrl(url);
        responseData.setCharSet(contentEncoding);
        if (httpRequest instanceof HttpHead) {
            responseData.setMethod(Constants.HEAD_METHOD);
        } else {
            responseData.setMethod(Constants.GET_METHOD);
        }
        responseData.setHttpStatusCode(httpStatusCode);
        for (final Header header : response.getAllHeaders()) {
            responseData.addMetaData(header.getName(), header.getValue());
        }
        responseData.setMimeType(contentType);
        final Header contentLengthHeader = response.getFirstHeader("Content-Length");
        if (contentLengthHeader == null) {
            responseData.setContentLength(contentLength);
        } else {
            final String value = contentLengthHeader.getValue();
            try {
                responseData.setContentLength(Long.parseLong(value));
            } catch (final Exception e) {
                responseData.setContentLength(contentLength);
            }
        }
        checkMaxContentLength(responseData);
        final Header lastModifiedHeader = response.getFirstHeader("Last-Modified");
        if (lastModifiedHeader != null) {
            final String value = lastModifiedHeader.getValue();
            if (StringUtil.isNotBlank(value)) {
                final Date d = parseLastModified(value);
                if (d != null) {
                    responseData.setLastModified(d);
                }
            }
        }
        return responseData;
    } catch (final UnknownHostException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("Unknown host(" + e.getMessage() + "): " + url, e);
    } catch (final NoRouteToHostException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("No route to host(" + e.getMessage() + "): " + url, e);
    } catch (final ConnectException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("Connection time out(" + e.getMessage() + "): " + url, e);
    } catch (final SocketException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("Socket exception(" + e.getMessage() + "): " + url, e);
    } catch (final IOException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("I/O exception(" + e.getMessage() + "): " + url, e);
    } catch (final CrawlerSystemException e) {
        closeResources(httpRequest, responseData);
        throw e;
    } catch (final Exception e) {
        closeResources(httpRequest, responseData);
        throw new CrawlerSystemException("Failed to access " + url, e);
    } finally {
        EntityUtils.consumeQuietly(httpEntity);
    }
}
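
The body handling above is a spill-to-disk buffer: commons-io's DeferredFileOutputStream keeps data in memory up to a threshold (maxCachedContentSize here) and switches to the given file once the threshold is crossed, which is why the code branches on isInMemory() afterwards. A minimal standalone demonstration of that class (the 1 KB threshold is arbitrary):

import java.io.File;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.output.DeferredFileOutputStream;

public class DeferredBufferDemo {
    public static void main(final String[] args) throws Exception {
        final File spillFile = File.createTempFile("demo-", ".out");
        final DeferredFileOutputStream dfos = new DeferredFileOutputStream(1024, spillFile);
        try {
            // write less than the 1 KB threshold, so the data stays in memory
            dfos.write("small response body".getBytes(StandardCharsets.UTF_8));
        } finally {
            dfos.close();
        }
        if (dfos.isInMemory()) {
            System.out.println("kept in memory: " + dfos.getData().length + " bytes");
            spillFile.delete(); // the spill file was never written
        } else {
            System.out.println("spilled to " + spillFile.getAbsolutePath() + " (" + spillFile.length() + " bytes)");
        }
    }
}
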
Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.
In class SitemapsHelper, method parse:
protected SitemapSet parse(final InputStream in, final boolean recursive) {
    final BufferedInputStream bis = new BufferedInputStream(in);
    bis.mark(preloadSize);
    String preloadData = StringUtil.EMPTY;
    final byte[] bytes = new byte[preloadSize];
    try {
        if (bis.read(bytes) == -1) {
            throw new CrawlingAccessException("No sitemaps data.");
        }
        preloadData = new String(bytes, Constants.UTF_8);
        if (preloadData.indexOf("<urlset") >= 0) {
            // XML Sitemaps
            bis.reset();
            return parseXmlSitemaps(bis);
        } else if (preloadData.indexOf("<sitemapindex") >= 0) {
            // XML Sitemaps Index
            bis.reset();
            return parseXmlSitemapsIndex(bis);
        } else if (preloadData.startsWith("http://") || preloadData.startsWith("https://")) {
            // Text Sitemaps (a plain list of URLs)
            bis.reset();
            return parseTextSitemaps(bis);
        } else {
            // otherwise assume a gzip-compressed sitemap
            bis.reset();
            return parse(new GZIPInputStream(bis), false);
        }
    } catch (final CrawlingAccessException e) {
        throw e;
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not parse Sitemaps: " + preloadData, e);
    }
}
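
The format detection relies on BufferedInputStream's mark/reset: mark(preloadSize) bounds how much the stream must buffer, the pre-read bytes are inspected as text, and reset() rewinds so the chosen parser (or a GZIPInputStream wrapper, with recursive set to false to stop at one level of decompression) sees the stream from the beginning. The sniffing mechanics in isolation (the 512-byte preload size is arbitrary here):

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

public class SitemapSniffDemo {
    public static void main(final String[] args) throws Exception {
        // gzip a tiny XML payload to stand in for sitemap.xml.gz
        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (GZIPOutputStream gz = new GZIPOutputStream(baos)) {
            gz.write("<urlset>...</urlset>".getBytes(StandardCharsets.UTF_8));
        }
        final BufferedInputStream bis = new BufferedInputStream(new ByteArrayInputStream(baos.toByteArray()));
        final int preloadSize = 512;
        bis.mark(preloadSize); // remember the current position
        final byte[] head = new byte[preloadSize];
        final int n = bis.read(head);
        final String preload = new String(head, 0, Math.max(n, 0), StandardCharsets.UTF_8);
        bis.reset(); // rewind so the real parser reads from the start
        final InputStream in = preload.contains("<urlset") ? bis : new GZIPInputStream(bis);
        System.out.println(in instanceof GZIPInputStream ? "treated as gzip" : "treated as plain XML");
    }
}
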
Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess by codelibs.
In class BaseThumbnailGenerator, method process:
protected boolean process(final String id, final Predicate<ResponseData> consumer) {
    return process(id, (configId, url) -> {
        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
        final CrawlingConfig config = crawlingConfigHelper.getCrawlingConfig(configId);
        if (config == null) {
            throw new ThumbnailGenerationException("No CrawlingConfig: " + configId);
        }
        if (logger.isInfoEnabled()) {
            logger.info("Generating Thumbnail: {}", url);
        }
        final CrawlerClientFactory crawlerClientFactory =
                config.initializeClientFactory(() -> ComponentUtil.getComponent(CrawlerClientFactory.class));
        final CrawlerClient client = crawlerClientFactory.getClient(url);
        if (client == null) {
            throw new ThumbnailGenerationException("No CrawlerClient: " + configId + ", url: " + url);
        }
        String u = url;
        for (int i = 0; i < maxRedirectCount; i++) {
            try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(u).build())) {
                if (StringUtil.isNotBlank(responseData.getRedirectLocation())) {
                    u = responseData.getRedirectLocation();
                    continue;
                }
                if (StringUtil.isBlank(responseData.getUrl())) {
                    throw new ThumbnailGenerationException(
                            "Failed to process a thumbnail content: " + url + " (Response URL is empty)");
                }
                return consumer.test(responseData);
            } catch (final CrawlingAccessException e) {
                if (logger.isDebugEnabled()) {
                    throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url, e);
                }
                throw new ThumbnailGenerationException(e.getMessage());
            } catch (final Exception e) {
                throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url, e);
            }
        }
        throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url + " (Redirect Loop)");
    });
}
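
The Predicate<ResponseData> consumer receives the final, post-redirect ResponseData, and its boolean result becomes the return value of process, i.e. whether a thumbnail was actually produced. A hedged sketch of such a consumer, written as a method of a hypothetical BaseThumbnailGenerator subclass (generateFor, outputFile, and the byte-copy "renderer" are illustrative, not fess API):

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;

// inside a hypothetical BaseThumbnailGenerator subclass
protected boolean generateFor(final String thumbnailId, final Path outputFile) {
    return process(thumbnailId, responseData -> {
        try (InputStream in = responseData.getResponseBody()) {
            // toy "renderer": persist the crawled bytes; a real generator would rasterize them
            Files.copy(in, outputFile, StandardCopyOption.REPLACE_EXISTING);
            return Files.size(outputFile) > 0; // true signals a usable thumbnail
        } catch (final IOException e) {
            return false; // no thumbnail produced
        }
    });
}
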