Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.
The class CrawlerThread, method run:
/*
* (non-Javadoc)
*
* @see java.lang.Runnable#run()
*/
@Override
public void run() {
    log(logHelper, LogType.START_THREAD, crawlerContext);
    int threadCheckCount = 0;
    // bind the crawler context and services to this thread
    CrawlingParameterUtil.setCrawlerContext(crawlerContext);
    CrawlingParameterUtil.setUrlQueueService(urlQueueService);
    CrawlingParameterUtil.setDataService(dataService);
    try {
        while (crawlerContext.getStatus() != CrawlerStatus.DONE && isContinue(threadCheckCount)) {
            final UrlQueue<?> urlQueue = urlQueueService.poll(crawlerContext.sessionId);
            if (isValid(urlQueue)) {
                ResponseData responseData = null;
                log(logHelper, LogType.START_CRAWLING, crawlerContext, urlQueue);
                try {
                    final CrawlerClient client = getClient(urlQueue.getUrl());
                    if (client == null) {
                        log(logHelper, LogType.UNSUPPORTED_URL_AT_CRAWLING_STARTED, crawlerContext, urlQueue);
                        continue;
                    }
                    startCrawling();
                    // set urlQueue to thread
                    CrawlingParameterUtil.setUrlQueue(urlQueue);
                    if (crawlerContext.intervalController != null) {
                        crawlerContext.intervalController.delay(IntervalController.PRE_PROCESSING);
                    }
                    final boolean contentUpdated = isContentUpdated(client, urlQueue);
                    if (contentUpdated) {
                        log(logHelper, LogType.GET_CONTENT, crawlerContext, urlQueue);
                        // access the URL
                        final long startTime = SystemUtil.currentTimeMillis();
                        responseData = client.execute(
                                RequestDataBuilder.newRequestData().method(urlQueue.getMethod()).url(urlQueue.getUrl()).build());
                        responseData.setExecutionTime(SystemUtil.currentTimeMillis() - startTime);
                        responseData.setParentUrl(urlQueue.getParentUrl());
                        responseData.setSessionId(crawlerContext.sessionId);
                        if (responseData.getRedirectLocation() == null) {
                            log(logHelper, LogType.PROCESS_RESPONSE, crawlerContext, urlQueue, responseData);
                            processResponse(urlQueue, responseData);
                        } else {
                            log(logHelper, LogType.REDIRECT_LOCATION, crawlerContext, urlQueue, responseData);
                            // follow the redirect as a child URL
                            storeChildUrl(responseData.getRedirectLocation(), urlQueue.getUrl(), null,
                                    urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
                        }
                    }
                    log(logHelper, LogType.FINISHED_CRAWLING, crawlerContext, urlQueue);
                } catch (final ChildUrlsException e) {
                    try {
                        final Set<RequestData> childUrlSet = e.getChildUrlList();
                        log(logHelper, LogType.PROCESS_CHILD_URLS_BY_EXCEPTION, crawlerContext, urlQueue, childUrlSet);
                        // add child URLs to the queue
                        storeChildUrls(childUrlSet, urlQueue.getUrl(), urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
                    } catch (final Exception e1) {
                        log(logHelper, LogType.CRAWLING_EXCETPION, crawlerContext, urlQueue, e1);
                    }
                    if (noWaitOnFolder) {
                        // skip the WAIT_NEW_URL interval for folder-type URLs
                        continue;
                    }
                } catch (final CrawlingAccessException e) {
                    log(logHelper, LogType.CRAWLING_ACCESS_EXCEPTION, crawlerContext, urlQueue, e);
                } catch (final Throwable e) {
                    log(logHelper, LogType.CRAWLING_EXCETPION, crawlerContext, urlQueue, e);
                } finally {
                    addSitemapsFromRobotsTxt(urlQueue);
                    if (responseData != null) {
                        CloseableUtil.closeQuietly(responseData);
                    }
                    if (crawlerContext.intervalController != null) {
                        crawlerContext.intervalController.delay(IntervalController.POST_PROCESSING);
                    }
                    // clear
                    threadCheckCount = 0;
                    // remove urlQueue from thread
                    CrawlingParameterUtil.setUrlQueue(null);
                    finishCrawling();
                }
            } else {
                log(logHelper, LogType.NO_URL_IN_QUEUE, crawlerContext, urlQueue, Integer.valueOf(threadCheckCount));
                if (crawlerContext.intervalController != null) {
                    crawlerContext.intervalController.delay(IntervalController.NO_URL_IN_QUEUE);
                }
                threadCheckCount++;
            }
            // interval
            if (crawlerContext.intervalController != null) {
                crawlerContext.intervalController.delay(IntervalController.WAIT_NEW_URL);
            }
        }
    } catch (final Throwable t) {
        log(logHelper, LogType.SYSTEM_ERROR, t);
    } finally {
        // remove crawlerContext from thread
        CrawlingParameterUtil.setCrawlerContext(null);
        CrawlingParameterUtil.setUrlQueueService(null);
        CrawlingParameterUtil.setDataService(null);
    }
    log(logHelper, LogType.FINISHED_THREAD, crawlerContext);
}
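
The catch order above is the point of the example: ChildUrlsException expands a container into new queue entries, CrawlingAccessException records a per-URL access failure without stopping the thread, and any remaining Throwable is logged as an unexpected crawling error. A minimal sketch of the same classification at a single call site, assuming a configured CrawlerClient and logger are available (both names are placeholders here):

// Sketch only: client and logger are assumed to be set up elsewhere.
ResponseData fetchQuietly(final CrawlerClient client, final String url) {
    try {
        return client.execute(RequestDataBuilder.newRequestData().get().url(url).build());
    } catch (final CrawlingAccessException e) {
        // recoverable: this URL is unreachable, but the crawl can continue
        logger.warn("Could not access " + url, e);
        return null;
    }
}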
Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.
The class SmbClient, method getResponseData:
protected ResponseData getResponseData(final String uri, final boolean includeContent) {
    final ResponseData responseData = new ResponseData();
    responseData.setMethod(Constants.GET_METHOD);
    final String filePath = preprocessUri(uri);
    responseData.setUrl(filePath);
    SmbFile file = null;
    final SmbAuthentication smbAuthentication = smbAuthenticationHolder.get(filePath);
    if (logger.isDebugEnabled()) {
        logger.debug("Creating SmbFile: " + filePath);
    }
    try {
        if (smbAuthentication == null) {
            file = new SmbFile(filePath);
        } else {
            file = new SmbFile(filePath, smbAuthentication.getAuthentication());
        }
    } catch (final MalformedURLException e) {
        logger.warn("Could not parse url: " + filePath, e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Processing SmbFile: " + filePath);
    }
    try {
        if (file == null) {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        } else if (file.isFile()) {
            if (logger.isDebugEnabled()) {
                logger.debug("Checking SmbFile Size: " + filePath);
            }
            responseData.setContentLength(file.length());
            checkMaxContentLength(responseData);
            responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
            responseData.setCharSet(geCharSet(file));
            responseData.setLastModified(new Date(file.lastModified()));
            responseData.addMetaData(SMB_CREATE_TIME, new Date(file.createTime()));
            try {
                if (logger.isDebugEnabled()) {
                    logger.debug("Parsing SmbFile Owner: " + filePath);
                }
                final SID ownerUser = file.getOwnerUser();
                if (ownerUser != null) {
                    final String[] ownerAttributes = { ownerUser.getAccountName(), ownerUser.getDomainName() };
                    responseData.addMetaData(SMB_OWNER_ATTRIBUTES, ownerAttributes);
                }
            } catch (final IOException e) {
                logger.warn("Cannot get owner of the file: " + filePath);
            }
            if (logger.isDebugEnabled()) {
                logger.debug("Parsing SmbFile ACL: " + filePath);
            }
            processAccessControlEntries(responseData, file);
            final Map<String, List<String>> headerFieldMap = file.getHeaderFields();
            if (headerFieldMap != null) {
                for (final Map.Entry<String, List<String>> entry : headerFieldMap.entrySet()) {
                    responseData.addMetaData(entry.getKey(), entry.getValue());
                }
            }
            if (file.canRead()) {
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                if (includeContent) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Parsing SmbFile Content: " + filePath);
                    }
                    if (file.getContentLength() < maxCachedContentSize) {
                        // small file: buffer the whole body in memory
                        try (InputStream contentStream = new BufferedInputStream(new SmbFileInputStream(file))) {
                            responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                        } catch (final Exception e) {
                            logger.warn("I/O Exception.", e);
                            responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
                        }
                    } else {
                        // large file: spool the body to a temp file
                        File outputFile = null;
                        try {
                            outputFile = File.createTempFile("crawler-SmbClient-", ".out");
                            copy(file, outputFile);
                            responseData.setResponseBody(outputFile, true);
                        } catch (final Exception e) {
                            logger.warn("I/O Exception.", e);
                            responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
                            if (outputFile != null && !outputFile.delete()) {
                                logger.warn("Could not delete " + outputFile.getAbsolutePath());
                            }
                        }
                    }
                    if (logger.isDebugEnabled()) {
                        logger.debug("Parsing SmbFile MIME Type: " + filePath);
                    }
                    try (final InputStream is = responseData.getResponseBody()) {
                        responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                    } catch (final Exception e) {
                        responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                    }
                } else {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Parsing SmbFile MIME Type: " + filePath);
                    }
                    try (final InputStream is = new SmbFileInputStream(file)) {
                        responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                    } catch (final Exception e) {
                        responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                    }
                }
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength()
                                + " byte) is over " + maxLength + " byte. The url is " + filePath);
                    }
                }
            } else {
                // Forbidden
                responseData.setHttpStatusCode(Constants.FORBIDDEN_STATUS_CODE);
                responseData.setMimeType(APPLICATION_OCTET_STREAM);
            }
        } else if (file.isDirectory()) {
            if (logger.isDebugEnabled()) {
                logger.debug("Parsing SmbFile Directory: " + filePath);
            }
            final Set<RequestData> requestDataSet = new HashSet<>(100);
            if (includeContent) {
                final SmbFile[] files = file.listFiles();
                if (files != null) {
                    for (final SmbFile f : files) {
                        final String childUri = f.toString();
                        requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                    }
                }
            }
            throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
        } else {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        }
    } catch (final CrawlerSystemException e) {
        CloseableUtil.closeQuietly(responseData);
        throw e;
    } catch (final SmbException e) {
        CloseableUtil.closeQuietly(responseData);
        throw new CrawlingAccessException("Could not access " + uri, e);
    }
    return responseData;
}
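
Two failure paths leave this method by exception: a CrawlerSystemException (including the MaxLengthExceededException thrown above) propagates unchanged, while a low-level jcifs SmbException is wrapped into CrawlingAccessException together with the failing URI. A hedged sketch of a call site going through the public execute entry point (smbClient and process are placeholders, and the smb:// URL is illustrative):

// Sketch only: smbClient is assumed to be an SmbClient configured elsewhere.
try (final ResponseData data = smbClient.execute(
        RequestDataBuilder.newRequestData().get().url("smb://server/share/doc.txt").build())) {
    process(data); // hypothetical consumer
} catch (final ChildUrlsException e) {
    // the URL was a directory: e.getChildUrlList() holds its entries as RequestData
} catch (final CrawlingAccessException e) {
    // the share was unreachable, or a content length limit was exceeded
}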
Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.
The class ResponseDataUtil, method createResponseBodyFile:
public static File createResponseBodyFile(final ResponseData responseData) {
    File tempFile = null;
    FileOutputStream fos = null;
    try (final InputStream is = responseData.getResponseBody()) {
        tempFile = File.createTempFile("crawler-", ".tmp");
        fos = new FileOutputStream(tempFile);
        CopyUtil.copy(is, fos);
    } catch (final Exception e) {
        // close the stream so the temp file can be deleted
        CloseableUtil.closeQuietly(fos);
        // clean up the partially written file
        if (tempFile != null && !tempFile.delete()) {
            logger.warn("Could not delete a temp file: " + tempFile);
        }
        throw new CrawlingAccessException("Could not read a response body: " + responseData.getUrl(), e);
    } finally {
        CloseableUtil.closeQuietly(fos);
    }
    return tempFile;
}
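
The helper deletes its temp file only on failure; on success the caller owns the returned file and is responsible for removing it. A usage sketch, assuming responseData and logger exist in the surrounding scope:

// Sketch only: responseData comes from a CrawlerClient.execute() call.
File bodyFile = null;
try {
    bodyFile = ResponseDataUtil.createResponseBodyFile(responseData);
    // ... process the file contents ...
} catch (final CrawlingAccessException e) {
    // the body could not be copied; the helper already removed its temp file
} finally {
    if (bodyFile != null && !bodyFile.delete()) {
        logger.warn("Could not delete a temp file: " + bodyFile);
    }
}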
Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.
The class BinaryTransformer, method transform:
/*
 * (non-Javadoc)
 *
 * @see
 * org.codelibs.fess.crawler.transformer.Transformer#transform(org.codelibs.fess.crawler.entity
 * .ResponseData)
 */
@Override
public ResultData transform(final ResponseData responseData) {
    if (responseData == null || !responseData.hasResponseBody()) {
        throw new CrawlingAccessException("No response body.");
    }
    final ResultData resultData = new ResultData();
    resultData.setTransformerName(getName());
    try (BufferedInputStream bis = new BufferedInputStream(responseData.getResponseBody())) {
        resultData.setData(IOUtils.toByteArray(bis));
        resultData.setEncoding(responseData.getCharSet());
        return resultData;
    } catch (final IOException e) {
        throw new CrawlerSystemException("Could not convert the input stream.", e);
    }
}
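
A usage sketch: a missing body fails fast with CrawlingAccessException before any I/O, while a genuine read error surfaces as CrawlerSystemException. The transformer is constructed directly here for illustration; in fess-crawler it would normally be obtained from the container:

// Sketch only: responseData is assumed to come from a CrawlerClient.
final BinaryTransformer transformer = new BinaryTransformer();
try {
    final ResultData resultData = transformer.transform(responseData);
    final byte[] body = resultData.getData(); // the bytes copied from the response
    // ... persist or index the bytes ...
} catch (final CrawlingAccessException e) {
    // responseData was null or carried no response body
}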
Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.
The class HtmlTransformer, method loadCharset:
protected String loadCharset(final InputStream inputStream) {
    BufferedInputStream bis = null;
    String encoding = null;
    try {
        bis = new BufferedInputStream(inputStream);
        final byte[] buffer = new byte[preloadSizeForCharset];
        final int size = bis.read(buffer);
        if (size != -1) {
            final String content = new String(buffer, 0, size);
            encoding = parseCharset(content);
        }
    } catch (final IOException e) {
        throw new CrawlingAccessException("Could not load a content.", e);
    }
    return normalizeEncoding(encoding);
}
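
parseCharset is not shown above. As a rough illustration of what it might look for in the preloaded prefix, here is a hedged sketch using java.util.regex; the pattern and method body are assumptions, not fess-crawler's actual implementation:

// Illustrative only: scan an HTML prefix for a charset=... declaration.
protected String parseCharset(final String content) {
    final java.util.regex.Matcher matcher = java.util.regex.Pattern
            .compile("charset\\s*=\\s*[\"']?([\\w-]+)", java.util.regex.Pattern.CASE_INSENSITIVE)
            .matcher(content);
    return matcher.find() ? matcher.group(1) : null;
}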