Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
Example: the FileSystemClient class, getResponseData method. For regular files it fills in metadata and content; for directories it throws ChildUrlsException; CrawlerSystemException is rethrown untouched, while any other failure is wrapped in CrawlingAccessException.
protected ResponseData getResponseData(final String uri, final boolean includeContent) {
    final ResponseData responseData = new ResponseData();
    try {
        responseData.setMethod(Constants.GET_METHOD);
        final String filePath = preprocessUri(uri);
        responseData.setUrl(filePath);
        File file = null;
        try {
            file = new File(new URI(filePath));
        } catch (final URISyntaxException e) {
            logger.warn("Could not parse url: " + filePath, e);
        }
        if (file == null) {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        } else if (file.isFile()) {
            // check file size
            responseData.setContentLength(file.length());
            checkMaxContentLength(responseData);
            // capture owner, ACL, and POSIX attributes as response metadata
            try {
                final FileOwnerAttributeView ownerAttrView = Files.getFileAttributeView(file.toPath(), FileOwnerAttributeView.class);
                if (ownerAttrView != null) {
                    final UserPrincipal owner = ownerAttrView.getOwner();
                    if (owner != null) {
                        responseData.addMetaData(FS_FILE_USER, owner.getName());
                    }
                }
            } catch (final Exception e) {
                logger.warn("Failed to parse FileOwnerAttributeView.", e);
            }
            try {
                final AclFileAttributeView aclView = Files.getFileAttributeView(file.toPath(), AclFileAttributeView.class);
                if (aclView != null) {
                    responseData.addMetaData(FILE_ATTRIBUTE_VIEW, aclView);
                    responseData.addMetaData(FS_FILE_GROUPS,
                            aclView.getAcl().stream().map(acl -> acl.principal().getName()).toArray(n -> new String[n]));
                }
            } catch (final Exception e) {
                logger.warn("Failed to parse AclFileAttributeView.", e);
            }
            try {
                final PosixFileAttributeView posixView = Files.getFileAttributeView(file.toPath(), PosixFileAttributeView.class);
                if (posixView != null) {
                    responseData.addMetaData(FILE_ATTRIBUTE_VIEW, posixView);
                    responseData.addMetaData(FS_FILE_GROUPS, new String[] { posixView.readAttributes().group().getName() });
                }
            } catch (final Exception e) {
                logger.warn("Failed to parse PosixFileAttributeView.", e);
            }
            responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
            responseData.setCharSet(geCharSet(file));
            responseData.setLastModified(new Date(file.lastModified()));
            if (file.canRead()) {
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                try (final InputStream is = new BufferedInputStream(new FileInputStream(file))) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                } catch (final Exception e) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                }
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength()
                                + " byte) is over " + maxLength + " byte. The url is " + filePath);
                    }
                }
                if (includeContent) {
                    // small files are cached in memory; larger ones are referenced on disk
                    if (file.length() < maxCachedContentSize) {
                        try (InputStream contentStream = new BufferedInputStream(new FileInputStream(file))) {
                            responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                        } catch (final Exception e) {
                            logger.warn("I/O Exception.", e);
                            responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
                        }
                    } else {
                        responseData.setResponseBody(file, false);
                    }
                }
            } else {
                // Forbidden
                responseData.setHttpStatusCode(Constants.FORBIDDEN_STATUS_CODE);
                responseData.setMimeType(APPLICATION_OCTET_STREAM);
            }
        } else if (file.isDirectory()) {
            // a directory yields no content of its own: report children via ChildUrlsException
            final Set<RequestData> requestDataSet = new HashSet<>();
            if (includeContent) {
                final File[] files = file.listFiles();
                if (files != null) {
                    for (final File f : files) {
                        final String chileUri = f.toURI().toASCIIString();
                        requestDataSet.add(RequestDataBuilder.newRequestData().get().url(chileUri).build());
                    }
                }
            }
            throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
        } else {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        }
    } catch (final CrawlerSystemException e) {
        CloseableUtil.closeQuietly(responseData);
        throw e;
    } catch (final Exception e) {
        CloseableUtil.closeQuietly(responseData);
        throw new CrawlingAccessException("Could not access " + uri, e);
    }
    return responseData;
}
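
Note the directory branch above: instead of returning a ResponseData, the method aborts with a ChildUrlsException carrying the child URLs, and the caller is expected to catch it and enqueue the children. A minimal sketch of that caller-side pattern follows; fsClient, process, and urlQueue are illustrative placeholders, and it assumes the standard CrawlerClient#execute entry point and that ChildUrlsException exposes its URLs via getChildUrlList().

// Hypothetical caller-side handling of the ChildUrlsException contract.
try {
    final ResponseData responseData = fsClient.execute(
            RequestDataBuilder.newRequestData().get().url(fileUri).build());
    process(responseData); // a regular file: consume the content
} catch (final ChildUrlsException e) {
    // a directory: no content, only child URLs to crawl next
    for (final RequestData child : e.getChildUrlList()) {
        urlQueue.offer(child.getUrl());
    }
}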
Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
Example: the FtpClient class, getResponseData method. It lists the parent directory over FTP, matches the requested entry, and follows the same exception-translation pattern: CrawlerSystemException propagates as-is, everything else becomes CrawlingAccessException.
protected ResponseData getResponseData(final String uri, final boolean includeContent) {
    final ResponseData responseData = new ResponseData();
    FTPClient client = null;
    try {
        responseData.setMethod(Constants.GET_METHOD);
        final FtpInfo ftpInfo = new FtpInfo(uri);
        responseData.setUrl(ftpInfo.toUrl());
        client = getClient(ftpInfo);
        FTPFile file = null;
        client.changeWorkingDirectory(ftpInfo.getParent());
        validateRequest(client);
        if (ftpInfo.getName() == null) {
            // root directory: list its entries and report them as child URLs
            final Set<RequestData> requestDataSet = new HashSet<>();
            if (includeContent) {
                try {
                    final FTPFile[] files = client.listFiles(ftpInfo.getParent(), FTPFileFilters.NON_NULL);
                    validateRequest(client);
                    for (final FTPFile f : files) {
                        final String chileUri = ftpInfo.toChildUrl(f.getName());
                        requestDataSet.add(RequestDataBuilder.newRequestData().get().url(chileUri).build());
                    }
                } catch (final IOException e) {
                    disconnectInternalClient(client);
                    throw new CrawlingAccessException("Could not access " + uri, e);
                }
            }
            // return the client to the pool before aborting with the child URLs
            ftpClientQueue.offer(client);
            throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
        }
        // locate the requested entry in the parent directory listing
        final FTPFile[] files = client.listFiles(null, FTPFileFilters.NON_NULL);
        validateRequest(client);
        for (final FTPFile f : files) {
            if (ftpInfo.getName().equals(f.getName())) {
                file = f;
                break;
            }
        }
        updateResponseData(uri, includeContent, responseData, client, ftpInfo, file);
    } catch (final CrawlerSystemException e) {
        CloseableUtil.closeQuietly(responseData);
        throw e;
    } catch (final Exception e) {
        CloseableUtil.closeQuietly(responseData);
        throw new CrawlingAccessException("Could not access " + uri, e);
    }
    return responseData;
}
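
FtpInfo, used above, parses the ftp URL into a parent directory and an entry name, with getName() returning null to signal the root directory. Its expected behavior, inferred purely from how getResponseData uses it, looks roughly like this:

// Assumed behavior of FtpInfo, inferred from its usage in getResponseData above.
final FtpInfo info = new FtpInfo("ftp://example.com/pub/docs/readme.txt");
// info.getParent() -> "/pub/docs"   (directory passed to changeWorkingDirectory)
// info.getName()   -> "readme.txt"  (entry matched against the directory listing)
// info.toUrl()     -> the normalized URL stored on the ResponseData
// For "ftp://example.com/", getName() would return null, taking the root-directory branch.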
Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
Example: the HcHttpClient class, processRobotsTxt method. A blank URL is rejected up front with a CrawlerSystemException; the method then fetches and parses robots.txt once per host and registers its allow/disallow rules with the URL filter.
protected void processRobotsTxt(final String url) {
    if (StringUtil.isBlank(url)) {
        throw new CrawlerSystemException("url is null or empty.");
    }
    if (robotsTxtHelper == null || !robotsTxtHelper.isEnabled()) {
        // robots.txt handling is not supported
        return;
    }
    // crawler context
    final CrawlerContext crawlerContext = CrawlingParameterUtil.getCrawlerContext();
    if (crawlerContext == null) {
        // wrong state
        return;
    }
    // derive the host part of the URL (scheme://host[:port])
    final int idx = url.indexOf('/', url.indexOf("://") + 3);
    String hostUrl;
    if (idx >= 0) {
        hostUrl = url.substring(0, idx);
    } else {
        hostUrl = url;
    }
    final String robotTxtUrl = hostUrl + "/robots.txt";
    // skip if this host's robots.txt was already processed
    if (crawlerContext.getRobotsTxtUrlSet().contains(robotTxtUrl)) {
        if (logger.isDebugEnabled()) {
            logger.debug(robotTxtUrl + " is already visited.");
        }
        return;
    }
    if (logger.isInfoEnabled()) {
        logger.info("Checking URL: " + robotTxtUrl);
    }
    // add url to a set
    crawlerContext.getRobotsTxtUrlSet().add(robotTxtUrl);
    final HttpGet httpGet = new HttpGet(robotTxtUrl);
    // request header
    for (final Header header : requestHeaderList) {
        httpGet.addHeader(header);
    }
    HttpEntity httpEntity = null;
    try {
        // get a content
        final HttpResponse response = executeHttpClient(httpGet);
        httpEntity = response.getEntity();
        final int httpStatusCode = response.getStatusLine().getStatusCode();
        if (httpStatusCode == 200) {
            // check file size
            final Header contentLengthHeader = response.getFirstHeader("Content-Length");
            if (contentLengthHeader != null) {
                final String value = contentLengthHeader.getValue();
                final long contentLength = Long.parseLong(value);
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength("text/plain");
                    if (contentLength > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + contentLength
                                + " byte) is over " + maxLength + " byte. The url is " + robotTxtUrl);
                    }
                }
            }
            if (httpEntity != null) {
                final RobotsTxt robotsTxt = robotsTxtHelper.parse(httpEntity.getContent());
                if (robotsTxt != null) {
                    final String[] sitemaps = robotsTxt.getSitemaps();
                    if (sitemaps.length > 0) {
                        crawlerContext.addSitemaps(sitemaps);
                    }
                    // register the directives matching this crawler's user agent
                    final RobotsTxt.Directive directive = robotsTxt.getMatchedDirective(userAgent);
                    if (directive != null) {
                        if (useRobotsTxtDisallows) {
                            for (String urlPattern : directive.getDisallows()) {
                                if (StringUtil.isNotBlank(urlPattern)) {
                                    urlPattern = convertRobotsTxtPathPattern(urlPattern);
                                    crawlerContext.getUrlFilter().addExclude(hostUrl + urlPattern);
                                }
                            }
                        }
                        if (useRobotsTxtAllows) {
                            for (String urlPattern : directive.getAllows()) {
                                if (StringUtil.isNotBlank(urlPattern)) {
                                    urlPattern = convertRobotsTxtPathPattern(urlPattern);
                                    crawlerContext.getUrlFilter().addInclude(hostUrl + urlPattern);
                                }
                            }
                        }
                    }
                }
            }
        }
    } catch (final CrawlerSystemException e) {
        httpGet.abort();
        throw e;
    } catch (final Exception e) {
        httpGet.abort();
        throw new CrawlingAccessException("Could not process " + robotTxtUrl + ". ", e);
    } finally {
        EntityUtils.consumeQuietly(httpEntity);
    }
}
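
convertRobotsTxtPathPattern is called above but not shown: it must turn a robots.txt path pattern into a regular expression before the pattern is registered with the UrlFilter. The following is a plausible sketch under the common robots.txt wildcard rules (* matches any character sequence, $ anchors the end of the path); the actual fess-crawler implementation may differ in detail.

// Sketch only: escape regex metacharacters, then map robots.txt wildcards to regex.
protected String convertRobotsTxtPathPattern(final String path) {
    String pattern = path.replace(".", "\\.").replace("?", "\\?").replace("*", ".*");
    if (!pattern.startsWith("/")) {
        pattern = ".*" + pattern; // not anchored at the path root: match anywhere
    }
    if (!pattern.endsWith("$") && !pattern.endsWith(".*")) {
        pattern = pattern + ".*"; // no $ anchor: any suffix may follow
    }
    return pattern;
}

For example, "Disallow: /tmp/*" would yield the exclude pattern hostUrl + "/tmp/.*".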
Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
Example: the HcHttpClient class, processHttpMethod method. Network-level failures are translated into CrawlingAccessException with specific messages, CrawlerSystemException is rethrown as-is, and any remaining exception is wrapped in a new CrawlerSystemException.
protected ResponseData processHttpMethod(final String url, final HttpUriRequest httpRequest) {
    try {
        processRobotsTxt(url);
    } catch (final CrawlingAccessException e) {
        if (logger.isInfoEnabled()) {
            final StringBuilder buf = new StringBuilder(100);
            buf.append(e.getMessage());
            if (e.getCause() != null) {
                buf.append(e.getCause().getMessage());
            }
            logger.info(buf.toString());
        } else if (logger.isDebugEnabled()) {
            logger.debug("Crawling Access Exception at " + url, e);
        }
    }
    // request header
    for (final Header header : requestHeaderList) {
        httpRequest.addHeader(header);
    }
    ResponseData responseData = new ResponseData();
    HttpEntity httpEntity = null;
    try {
        // get a content
        final HttpResponse response = executeHttpClient(httpRequest);
        httpEntity = response.getEntity();
        final int httpStatusCode = response.getStatusLine().getStatusCode();
        // redirect
        if (isRedirectHttpStatus(httpStatusCode)) {
            final Header locationHeader = response.getFirstHeader("location");
            if (locationHeader == null) {
                logger.warn("Invalid redirect location at " + url);
            } else {
                final String redirectLocation;
                if (locationHeader.getValue().startsWith("/")) {
                    redirectLocation = buildRedirectLocation(url, locationHeader.getValue());
                } else {
                    redirectLocation = locationHeader.getValue();
                }
                responseData = new ResponseData();
                responseData.setRedirectLocation(redirectLocation);
                return responseData;
            }
        }
        String contentType = null;
        final Header contentTypeHeader = response.getFirstHeader("Content-Type");
        if (contentTypeHeader != null) {
            contentType = contentTypeHeader.getValue();
            final int idx = contentType.indexOf(';');
            if (idx > 0) {
                contentType = contentType.substring(0, idx);
                if (APPLICATION_OCTET_STREAM.equals(contentType)) {
                    contentType = null;
                }
            }
        }
        long contentLength = 0;
        String contentEncoding = Constants.UTF_8;
        if (httpEntity == null) {
            responseData.setResponseBody(new byte[0]);
            if (contentType == null) {
                contentType = defaultMimeType;
            }
        } else {
            // spool the body: small responses stay in memory, large ones go to a temp file
            final InputStream responseBodyStream = httpEntity.getContent();
            final File outputFile = File.createTempFile("crawler-HcHttpClient-", ".out");
            DeferredFileOutputStream dfos = null;
            try {
                try {
                    dfos = new DeferredFileOutputStream((int) maxCachedContentSize, outputFile);
                    CopyUtil.copy(responseBodyStream, dfos);
                    dfos.flush();
                } finally {
                    CloseableUtil.closeQuietly(dfos);
                }
            } catch (final Exception e) {
                if (!outputFile.delete()) {
                    logger.warn("Could not delete " + outputFile.getAbsolutePath());
                }
                throw e;
            }
            if (dfos.isInMemory()) {
                responseData.setResponseBody(dfos.getData());
                contentLength = dfos.getData().length;
                if (!outputFile.delete()) {
                    logger.warn("Could not delete " + outputFile.getAbsolutePath());
                }
                if (contentType == null) {
                    try (InputStream is = new ByteArrayInputStream(dfos.getData())) {
                        contentType = mimeTypeHelper.getContentType(is, url);
                    } catch (final Exception e) {
                        logger.debug("Failed to detect mime-type.", e);
                        contentType = defaultMimeType;
                    }
                }
            } else {
                responseData.setResponseBody(outputFile, true);
                contentLength = outputFile.length();
                if (contentType == null) {
                    try (InputStream is = new FileInputStream(outputFile)) {
                        contentType = mimeTypeHelper.getContentType(is, url);
                    } catch (final Exception e) {
                        logger.debug("Failed to detect mime-type.", e);
                        contentType = defaultMimeType;
                    }
                }
            }
            final Header contentEncodingHeader = httpEntity.getContentEncoding();
            if (contentEncodingHeader != null) {
                contentEncoding = contentEncodingHeader.getValue();
            }
        }
        // check file size
        if (contentLengthHelper != null) {
            final long maxLength = contentLengthHelper.getMaxLength(contentType);
            if (contentLength > maxLength) {
                throw new MaxLengthExceededException("The content length (" + contentLength
                        + " byte) is over " + maxLength + " byte. The url is " + url);
            }
        }
        responseData.setUrl(url);
        responseData.setCharSet(contentEncoding);
        if (httpRequest instanceof HttpHead) {
            responseData.setMethod(Constants.HEAD_METHOD);
        } else {
            responseData.setMethod(Constants.GET_METHOD);
        }
        responseData.setHttpStatusCode(httpStatusCode);
        for (final Header header : response.getAllHeaders()) {
            responseData.addMetaData(header.getName(), header.getValue());
        }
        responseData.setMimeType(contentType);
        // prefer the declared Content-Length header, falling back to the spooled size
        final Header contentLengthHeader = response.getFirstHeader("Content-Length");
        if (contentLengthHeader == null) {
            responseData.setContentLength(contentLength);
        } else {
            final String value = contentLengthHeader.getValue();
            try {
                responseData.setContentLength(Long.parseLong(value));
            } catch (final Exception e) {
                responseData.setContentLength(contentLength);
            }
        }
        checkMaxContentLength(responseData);
        final Header lastModifiedHeader = response.getFirstHeader("Last-Modified");
        if (lastModifiedHeader != null) {
            final String value = lastModifiedHeader.getValue();
            if (StringUtil.isNotBlank(value)) {
                final Date d = parseLastModified(value);
                if (d != null) {
                    responseData.setLastModified(d);
                }
            }
        }
        return responseData;
    } catch (final UnknownHostException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("Unknown host(" + e.getMessage() + "): " + url, e);
    } catch (final NoRouteToHostException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("No route to host(" + e.getMessage() + "): " + url, e);
    } catch (final ConnectException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("Connection time out(" + e.getMessage() + "): " + url, e);
    } catch (final SocketException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("Socket exception(" + e.getMessage() + "): " + url, e);
    } catch (final IOException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("I/O exception(" + e.getMessage() + "): " + url, e);
    } catch (final CrawlerSystemException e) {
        closeResources(httpRequest, responseData);
        throw e;
    } catch (final Exception e) {
        closeResources(httpRequest, responseData);
        throw new CrawlerSystemException("Failed to access " + url, e);
    } finally {
        EntityUtils.consumeQuietly(httpEntity);
    }
}
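
buildRedirectLocation, invoked above for Location headers that start with "/", is not shown. It has to resolve a host-relative redirect against the original request URL; a minimal sketch using java.net.URI follows (an assumption about the helper, not necessarily how fess-crawler implements it):

// Sketch: resolve a host-relative Location header against the request URL.
protected String buildRedirectLocation(final String url, final String location) {
    try {
        return new java.net.URI(url).resolve(location).toASCIIString();
    } catch (final java.net.URISyntaxException e) {
        throw new CrawlingAccessException("Invalid redirect location: " + location, e);
    }
}

Resolving "/next" against "http://example.com/a/b" yields "http://example.com/next".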
Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
Example: the HtmlXpathExtractor class, getText method. A null input stream is rejected with a CrawlerSystemException; parse failures surface as ExtractException.
/*
 * (non-Javadoc)
 *
 * @see org.codelibs.fess.crawler.extractor.Extractor#getText(java.io.InputStream,
 * java.util.Map)
 */
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    try {
        final BufferedInputStream bis = new BufferedInputStream(in);
        final String enc = getEncoding(bis);
        final DOMParser parser = getDomParser();
        final InputSource inputSource = new InputSource(bis);
        inputSource.setEncoding(enc);
        parser.parse(inputSource);
        final Document document = parser.getDocument();
        final StringBuilder buf = new StringBuilder(255);
        // collect the text content of every node matching targetNodePath
        final NodeList nodeList = getXPathAPI().selectNodeList(document, targetNodePath);
        for (int i = 0; i < nodeList.getLength(); i++) {
            final Node node = nodeList.item(i);
            buf.append(node.getTextContent()).append(' ');
        }
        // collapse whitespace runs into single spaces
        return new ExtractData(buf.toString().replaceAll("\\s+", " ").trim());
    } catch (final Exception e) {
        throw new ExtractException(e);
    }
}
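
A short usage sketch for this extractor. It assumes an instance whose targetNodePath selects the document body (for example "//BODY") and that ExtractData exposes the extracted text via getContent(); both are assumptions about configuration, not shown in the source above.

// Illustrative usage; imports (java.io.*, java.nio.charset.StandardCharsets) omitted.
public static void main(final String[] args) throws Exception {
    final HtmlXpathExtractor extractor = new HtmlXpathExtractor(); // assumed usable default setup
    final byte[] html = "<html><body><h1>Title</h1><p>Some   text.</p></body></html>"
            .getBytes(StandardCharsets.UTF_8);
    try (final InputStream in = new ByteArrayInputStream(html)) {
        final ExtractData data = extractor.getText(in, null);
        // whitespace runs are collapsed: prints "Title Some text."
        System.out.println(data.getContent());
    }
}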