Use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.
Class HcHttpClient, method processHttpMethod.
protected ResponseData processHttpMethod(final String url, final HttpUriRequest httpRequest) {
    try {
        processRobotsTxt(url);
    } catch (final CrawlingAccessException e) {
        if (logger.isInfoEnabled()) {
            final StringBuilder buf = new StringBuilder(100);
            buf.append(e.getMessage());
            if (e.getCause() != null) {
                buf.append(e.getCause().getMessage());
            }
            logger.info(buf.toString());
        } else if (logger.isDebugEnabled()) {
            logger.debug("Crawling Access Exception at " + url, e);
        }
    }
    // request header
    for (final Header header : requestHeaderList) {
        httpRequest.addHeader(header);
    }
    ResponseData responseData = new ResponseData();
    HttpEntity httpEntity = null;
    try {
        // get a content
        final HttpResponse response = executeHttpClient(httpRequest);
        httpEntity = response.getEntity();
        final int httpStatusCode = response.getStatusLine().getStatusCode();
        // redirect
        if (isRedirectHttpStatus(httpStatusCode)) {
            final Header locationHeader = response.getFirstHeader("location");
            if (locationHeader == null) {
                logger.warn("Invalid redirect location at " + url);
            } else {
                final String redirectLocation;
                if (locationHeader.getValue().startsWith("/")) {
                    redirectLocation = buildRedirectLocation(url, locationHeader.getValue());
                } else {
                    redirectLocation = locationHeader.getValue();
                }
                responseData = new ResponseData();
                responseData.setRedirectLocation(redirectLocation);
                return responseData;
            }
        }
        String contentType = null;
        final Header contentTypeHeader = response.getFirstHeader("Content-Type");
        if (contentTypeHeader != null) {
            contentType = contentTypeHeader.getValue();
            final int idx = contentType.indexOf(';');
            if (idx > 0) {
                contentType = contentType.substring(0, idx);
                if (APPLICATION_OCTET_STREAM.equals(contentType)) {
                    contentType = null;
                }
            }
        }
        long contentLength = 0;
        String contentEncoding = Constants.UTF_8;
        if (httpEntity == null) {
            responseData.setResponseBody(new byte[0]);
            if (contentType == null) {
                contentType = defaultMimeType;
            }
        } else {
            final InputStream responseBodyStream = httpEntity.getContent();
            final File outputFile = File.createTempFile("crawler-HcHttpClient-", ".out");
            DeferredFileOutputStream dfos = null;
            try {
                try {
                    dfos = new DeferredFileOutputStream((int) maxCachedContentSize, outputFile);
                    CopyUtil.copy(responseBodyStream, dfos);
                    dfos.flush();
                } finally {
                    CloseableUtil.closeQuietly(dfos);
                }
            } catch (final Exception e) {
                if (!outputFile.delete()) {
                    logger.warn("Could not delete " + outputFile.getAbsolutePath());
                }
                throw e;
            }
            if (dfos.isInMemory()) {
                responseData.setResponseBody(dfos.getData());
                contentLength = dfos.getData().length;
                if (!outputFile.delete()) {
                    logger.warn("Could not delete " + outputFile.getAbsolutePath());
                }
                if (contentType == null) {
                    try (InputStream is = new ByteArrayInputStream(dfos.getData())) {
                        contentType = mimeTypeHelper.getContentType(is, url);
                    } catch (final Exception e) {
                        logger.debug("Failed to detect mime-type.", e);
                        contentType = defaultMimeType;
                    }
                }
            } else {
                responseData.setResponseBody(outputFile, true);
                contentLength = outputFile.length();
                if (contentType == null) {
                    try (InputStream is = new FileInputStream(outputFile)) {
                        contentType = mimeTypeHelper.getContentType(is, url);
                    } catch (final Exception e) {
                        logger.debug("Failed to detect mime-type.", e);
                        contentType = defaultMimeType;
                    }
                }
            }
            final Header contentEncodingHeader = httpEntity.getContentEncoding();
            if (contentEncodingHeader != null) {
                contentEncoding = contentEncodingHeader.getValue();
            }
        }
        // check file size
        if (contentLengthHelper != null) {
            final long maxLength = contentLengthHelper.getMaxLength(contentType);
            if (contentLength > maxLength) {
                throw new MaxLengthExceededException(
                        "The content length (" + contentLength + " byte) is over " + maxLength + " byte. The url is " + url);
            }
        }
        responseData.setUrl(url);
        responseData.setCharSet(contentEncoding);
        if (httpRequest instanceof HttpHead) {
            responseData.setMethod(Constants.HEAD_METHOD);
        } else {
            responseData.setMethod(Constants.GET_METHOD);
        }
        responseData.setHttpStatusCode(httpStatusCode);
        for (final Header header : response.getAllHeaders()) {
            responseData.addMetaData(header.getName(), header.getValue());
        }
        responseData.setMimeType(contentType);
        final Header contentLengthHeader = response.getFirstHeader("Content-Length");
        if (contentLengthHeader == null) {
            responseData.setContentLength(contentLength);
        } else {
            final String value = contentLengthHeader.getValue();
            try {
                responseData.setContentLength(Long.parseLong(value));
            } catch (final Exception e) {
                responseData.setContentLength(contentLength);
            }
        }
        checkMaxContentLength(responseData);
        final Header lastModifiedHeader = response.getFirstHeader("Last-Modified");
        if (lastModifiedHeader != null) {
            final String value = lastModifiedHeader.getValue();
            if (StringUtil.isNotBlank(value)) {
                final Date d = parseLastModified(value);
                if (d != null) {
                    responseData.setLastModified(d);
                }
            }
        }
        return responseData;
    } catch (final UnknownHostException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("Unknown host(" + e.getMessage() + "): " + url, e);
    } catch (final NoRouteToHostException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("No route to host(" + e.getMessage() + "): " + url, e);
    } catch (final ConnectException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("Connection time out(" + e.getMessage() + "): " + url, e);
    } catch (final SocketException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("Socket exception(" + e.getMessage() + "): " + url, e);
    } catch (final IOException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("I/O exception(" + e.getMessage() + "): " + url, e);
    } catch (final CrawlerSystemException e) {
        closeResources(httpRequest, responseData);
        throw e;
    } catch (final Exception e) {
        closeResources(httpRequest, responseData);
        throw new CrawlerSystemException("Failed to access " + url, e);
    } finally {
        EntityUtils.consumeQuietly(httpEntity);
    }
}
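The buffering above relies on Commons IO's DeferredFileOutputStream: a body no larger than maxCachedContentSize stays on the heap (isInMemory() returns true and getData() yields the bytes), while anything bigger is spilled to the temp file. A minimal standalone sketch of that behavior; the threshold and payload size are illustrative, not values taken from fess-crawler:

    import java.io.ByteArrayInputStream;
    import java.io.File;
    import java.io.InputStream;

    import org.apache.commons.io.output.DeferredFileOutputStream;

    public class SpillBufferSketch {
        public static void main(final String[] args) throws Exception {
            final int threshold = 1024; // bytes kept in memory before spilling to disk
            final File outputFile = File.createTempFile("sketch-", ".out");
            final byte[] payload = new byte[2048]; // deliberately larger than the threshold

            final DeferredFileOutputStream dfos = new DeferredFileOutputStream(threshold, outputFile);
            try (InputStream in = new ByteArrayInputStream(payload)) {
                in.transferTo(dfos);
            } finally {
                dfos.close();
            }

            // 2048 > 1024, so the bytes went to outputFile rather than the heap.
            System.out.println("in memory: " + dfos.isInMemory());   // false
            System.out.println("file size: " + outputFile.length()); // 2048
            if (!outputFile.delete()) {
                System.err.println("Could not delete " + outputFile.getAbsolutePath());
            }
        }
    }

Spilling keeps per-request memory bounded, which matters when a crawler fetches many large responses concurrently.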
Use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.
Class TarExtractor, method getTextInternal.
protected String getTextInternal(final InputStream in, final MimeTypeHelper mimeTypeHelper, final ExtractorFactory extractorFactory) {
    final StringBuilder buf = new StringBuilder(1000);
    ArchiveInputStream ais = null;
    try {
        ais = archiveStreamFactory.createArchiveInputStream("tar", in);
        TarArchiveEntry entry = null;
        long contentSize = 0;
        while ((entry = (TarArchiveEntry) ais.getNextEntry()) != null) {
            contentSize += entry.getSize();
            if (maxContentSize != -1 && contentSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
            }
            final String filename = entry.getName();
            final String mimeType = mimeTypeHelper.getContentType(null, filename);
            if (mimeType != null) {
                final Extractor extractor = extractorFactory.getExtractor(mimeType);
                if (extractor != null) {
                    try {
                        final Map<String, String> map = new HashMap<>();
                        map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                        buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent());
                        buf.append('\n');
                    } catch (final Exception e) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("Exception in an internal extractor.", e);
                        }
                    }
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        if (buf.length() == 0) {
            throw new ExtractException("Could not extract a content.", e);
        }
    } finally {
        CloseableUtil.closeQuietly(ais);
    }
    return buf.toString().trim();
}
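Note that the guard sums each entry's declared size before any decompression, so a tar claiming more than maxContentSize is rejected cheaply. A standalone sketch of the same loop directly against Commons Compress, with a plain RuntimeException standing in for MaxLengthExceededException and an illustrative 10 MiB limit:

    import java.io.BufferedInputStream;
    import java.io.FileInputStream;
    import java.io.InputStream;

    import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
    import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;

    public class TarSizeGuardSketch {
        public static void main(final String[] args) throws Exception {
            final long maxContentSize = 10L * 1024 * 1024; // illustrative limit
            long contentSize = 0;
            // args[0] is the path to a .tar file
            try (InputStream in = new BufferedInputStream(new FileInputStream(args[0]));
                    TarArchiveInputStream tis = new TarArchiveInputStream(in)) {
                TarArchiveEntry entry;
                while ((entry = tis.getNextTarEntry()) != null) {
                    contentSize += entry.getSize(); // declared size from the tar header
                    if (contentSize > maxContentSize) {
                        throw new RuntimeException("Extracted size is " + contentSize + " > " + maxContentSize);
                    }
                    // a per-entry extractor would read from tis here
                }
            }
        }
    }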
Use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.
Class ZipExtractor, method getText.
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    final StringBuilder buf = new StringBuilder(1000);
    try (final ArchiveInputStream ais = archiveStreamFactory
            .createArchiveInputStream(in.markSupported() ? in : new BufferedInputStream(in))) {
        ZipArchiveEntry entry = null;
        long contentSize = 0;
        while ((entry = (ZipArchiveEntry) ais.getNextEntry()) != null) {
            contentSize += entry.getSize();
            if (maxContentSize != -1 && contentSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
            }
            final String filename = entry.getName();
            final String mimeType = mimeTypeHelper.getContentType(null, filename);
            if (mimeType != null) {
                final Extractor extractor = extractorFactory.getExtractor(mimeType);
                if (extractor != null) {
                    try {
                        final Map<String, String> map = new HashMap<>();
                        map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                        buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent());
                        buf.append('\n');
                    } catch (final Exception e) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("Exception in an internal extractor.", e);
                        }
                    }
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        if (buf.length() == 0) {
            throw new ExtractException("Could not extract a content.", e);
        }
    }
    return new ExtractData(buf.toString().trim());
}
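One caveat with the zip variant: when reading from a stream, ZipArchiveEntry.getSize() can return -1 for entries whose size is only recorded in a trailing data descriptor, in which case the running total undercounts. On the caller side, MaxLengthExceededException is the signal for an oversized archive. A sketch of handling it; obtaining a fully wired extractor (normally supplied by the crawler container together with its MimeTypeHelper and ExtractorFactory) is abbreviated to a method parameter here:

    import java.io.BufferedInputStream;
    import java.io.FileInputStream;
    import java.io.InputStream;
    import java.util.HashMap;

    import org.codelibs.fess.crawler.entity.ExtractData;
    import org.codelibs.fess.crawler.exception.MaxLengthExceededException;
    import org.codelibs.fess.crawler.extractor.impl.ZipExtractor;

    public class ZipLimitCallerSketch {
        // Returns the extracted text, or null when the archive exceeds the
        // extractor's configured size limit.
        static String extractOrSkip(final ZipExtractor extractor, final String path) throws Exception {
            try (InputStream in = new BufferedInputStream(new FileInputStream(path))) {
                final ExtractData data = extractor.getText(in, new HashMap<>());
                return data.getContent();
            } catch (final MaxLengthExceededException e) {
                // The archive declared more bytes than the configured limit; skip it.
                return null;
            }
        }
    }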
Use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.
Class FtpClient, method updateResponseData.
protected void updateResponseData(final String uri, final boolean includeContent, final ResponseData responseData,
        final FTPClient client, final FtpInfo ftpInfo, final FTPFile file) {
    if (file == null) {
        responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
        responseData.setCharSet(charset);
        responseData.setContentLength(0);
        ftpClientQueue.offer(client);
        return;
    }
    if (file.isSymbolicLink()) {
        final String link = file.getLink();
        String redirect = null;
        if (link == null) {
            responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
            ftpClientQueue.offer(client);
            return;
        } else if (link.startsWith("/")) {
            redirect = ftpInfo.toUrl(file.getLink());
        } else if (link.startsWith("../")) {
            redirect = ftpInfo.toChildUrl(file.getLink());
        } else {
            redirect = ftpInfo.toChildUrl("../" + file.getLink());
        }
        if (!uri.equals(redirect)) {
            responseData.setHttpStatusCode(Constants.OK_STATUS);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
            responseData.setRedirectLocation(redirect);
            ftpClientQueue.offer(client);
            return;
        }
    }
    if (file.isFile()) {
        responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
        responseData.setCharSet(Constants.UTF_8);
        responseData.setLastModified(file.getTimestamp().getTime());
        // check file size
        responseData.setContentLength(file.getSize());
        checkMaxContentLength(responseData);
        if (file.getUser() != null) {
            responseData.addMetaData(FTP_FILE_USER, file.getUser());
        }
        if (file.getGroup() != null) {
            responseData.addMetaData(FTP_FILE_GROUP, file.getGroup());
        }
        if (includeContent) {
            File tempFile = null;
            File outputFile = null;
            try {
                tempFile = File.createTempFile("ftp-", ".tmp");
                try (OutputStream out = new BufferedOutputStream(new FileOutputStream(tempFile))) {
                    if (!client.retrieveFile(ftpInfo.getName(), out)) {
                        throw new CrawlingAccessException("Failed to retrieve: " + ftpInfo.toUrl());
                    }
                }
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                try (InputStream is = new FileInputStream(tempFile)) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                } catch (final Exception e) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                }
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength()
                                + " byte) is over " + maxLength + " byte. The url is " + uri);
                    }
                }
                responseData.setCharSet(geCharSet(tempFile));
                if (tempFile.length() < maxCachedContentSize) {
                    try (InputStream contentStream = new BufferedInputStream(new FileInputStream(tempFile))) {
                        responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                    }
                } else {
                    outputFile = File.createTempFile("crawler-FtpClient-", ".out");
                    CopyUtil.copy(tempFile, outputFile);
                    responseData.setResponseBody(outputFile, true);
                }
                ftpClientQueue.offer(client);
            } catch (final CrawlingAccessException e) {
                ftpClientQueue.offer(client);
                throw e;
            } catch (final Exception e) {
                logger.warn("I/O Exception.", e);
                disconnectInternalClient(client);
                responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
            } finally {
                if (tempFile != null && !tempFile.delete()) {
                    logger.warn("Could not delete " + tempFile.getAbsolutePath());
                }
            }
        }
    } else if (file.isDirectory() || file.isSymbolicLink()) {
        final Set<RequestData> requestDataSet = new HashSet<>();
        if (includeContent) {
            try {
                final FTPFile[] ftpFiles = client.listFiles(ftpInfo.getName(), FTPFileFilters.NON_NULL);
                validateRequest(client);
                for (final FTPFile f : ftpFiles) {
                    final String childUri = ftpInfo.toChildUrl(f.getName());
                    requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                }
            } catch (final IOException e) {
                disconnectInternalClient(client);
                throw new CrawlingAccessException("Could not access " + uri, e);
            }
        }
        ftpClientQueue.offer(client);
        throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
    } else {
        responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE);
        responseData.setCharSet(charset);
        responseData.setContentLength(0);
        ftpClientQueue.offer(client);
    }
}
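The length check here mirrors the one in HcHttpClient above: contentLengthHelper maps a mime type to a maximum byte count, and the client throws before buffering anything larger. A dependency-free sketch of that policy, where the map, the limit values, and the RuntimeException are stand-ins rather than the actual ContentLengthHelper API:

    import java.util.Map;

    public class LengthPolicySketch {
        // Stand-in limits; fess-crawler's ContentLengthHelper holds equivalent state.
        private static final Map<String, Long> MAX_BY_TYPE = Map.of(
                "text/html", 2_500_000L,
                "application/pdf", 25_000_000L);
        private static final long DEFAULT_MAX = 10_000_000L;

        static void checkLength(final String mimeType, final long contentLength, final String url) {
            final long maxLength = MAX_BY_TYPE.getOrDefault(mimeType, DEFAULT_MAX);
            if (contentLength > maxLength) {
                // fess-crawler throws MaxLengthExceededException at this point.
                throw new RuntimeException("The content length (" + contentLength + " byte) is over "
                        + maxLength + " byte. The url is " + url);
            }
        }

        public static void main(final String[] args) {
            checkLength("text/html", 1_000_000L, "ftp://example.com/index.html"); // passes
            checkLength("text/html", 3_000_000L, "ftp://example.com/big.html");   // throws
        }
    }

Checking the declared length before download, and again after the file is on disk, means an oversized document costs at most one temp file rather than a full parse.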
Use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.
Class LhaExtractor, method getText.
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The inputstream is null.");
    }
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    final StringBuilder buf = new StringBuilder(1000);
    File tempFile = null;
    LhaFile lhaFile = null;
    try {
        tempFile = File.createTempFile("crawler-", ".lzh");
        try (FileOutputStream fos = new FileOutputStream(tempFile)) {
            CopyUtil.copy(in, fos);
        }
        lhaFile = new LhaFile(tempFile);
        @SuppressWarnings("unchecked")
        final Enumeration<LhaHeader> entries = lhaFile.entries();
        long contentSize = 0;
        while (entries.hasMoreElements()) {
            final LhaHeader head = entries.nextElement();
            contentSize += head.getOriginalSize();
            if (maxContentSize != -1 && contentSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
            }
            final String filename = head.getPath();
            final String mimeType = mimeTypeHelper.getContentType(null, filename);
            if (mimeType != null) {
                final Extractor extractor = extractorFactory.getExtractor(mimeType);
                if (extractor != null) {
                    InputStream is = null;
                    try {
                        is = lhaFile.getInputStream(head);
                        final Map<String, String> map = new HashMap<>();
                        map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                        buf.append(extractor.getText(new IgnoreCloseInputStream(is), map).getContent());
                        buf.append('\n');
                    } catch (final Exception e) {
                        if (logger.isDebugEnabled()) {
                            logger.debug("Exception in an internal extractor.", e);
                        }
                    } finally {
                        CloseableUtil.closeQuietly(is);
                    }
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        throw new ExtractException("Could not extract a content.", e);
    } finally {
        if (lhaFile != null) {
            try {
                lhaFile.close();
            } catch (final IOException e) {
                // ignore
            }
        }
        if (tempFile != null && !tempFile.delete()) {
            logger.warn("Failed to delete " + tempFile.getAbsolutePath());
        }
    }
    return new ExtractData(buf.toString().trim());
}
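A detail shared by these extractors: in TarExtractor and ZipExtractor the shared ArchiveInputStream is wrapped in IgnoreCloseInputStream before being handed to a nested extractor, so the inner extractor may call close() without consuming the stream that later entries still need; LhaExtractor wraps each per-entry stream the same way and closes it itself in the finally block. A sketch of the wrapper idea, as an illustration of the pattern rather than fess-crawler's own class:

    import java.io.FilterInputStream;
    import java.io.InputStream;

    // Illustrative equivalent of the ignore-close pattern used above.
    public class NonClosingInputStream extends FilterInputStream {
        public NonClosingInputStream(final InputStream in) {
            super(in);
        }

        @Override
        public void close() {
            // Intentionally a no-op: the owner closes the underlying archive
            // stream once every entry has been processed.
        }
    }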