Use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess by codelibs.
The class FileListIndexUpdateCallbackImpl, method addDocument:
protected void addDocument(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    synchronized (indexUpdateCallback) {
        // required check
        if (!dataMap.containsKey(fessConfig.getIndexFieldUrl()) || dataMap.get(fessConfig.getIndexFieldUrl()) == null) {
            logger.warn("Could not add a doc. Invalid data: {}", dataMap);
            return;
        }
        final String url = dataMap.get(fessConfig.getIndexFieldUrl()).toString();
        final CrawlerClient client = crawlerClientFactory.getClient(url);
        if (client == null) {
            logger.warn("CrawlerClient is null. Data: {}", dataMap);
            return;
        }
        final long maxAccessCount = getMaxAccessCount(paramMap, dataMap);
        long counter = 0;
        final Deque<String> urlQueue = new LinkedList<>();
        urlQueue.offer(url);
        while (!urlQueue.isEmpty() && (maxAccessCount < 0 || counter < maxAccessCount)) {
            final Map<String, Object> localDataMap =
                    dataMap.entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
            String processingUrl = urlQueue.poll();
            if (deleteUrlList.contains(processingUrl)) {
                // delete before indexing
                deleteDocuments();
            }
            try {
                for (int i = 0; i < maxRedirectCount; i++) {
                    processingUrl = processRequest(paramMap, localDataMap, processingUrl, client);
                    if (processingUrl == null) {
                        break;
                    }
                    counter++;
                    localDataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
                }
            } catch (final ChildUrlsException e) {
                e.getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
            } catch (final DataStoreCrawlingException e) {
                final Throwable cause = e.getCause();
                if (cause instanceof ChildUrlsException) {
                    ((ChildUrlsException) cause).getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
                } else if (maxAccessCount != 1L) {
                    throw e;
                } else {
                    logger.warn("Failed to access {}.", processingUrl, e);
                }
            }
        }
    }
}
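The two catch blocks above are the consuming half of a contract: a CrawlerClient that hits a container-like resource (a folder, a file list) throws ChildUrlsException carrying a set of RequestData entries, and the caller drains those into its own work queue. Below is a minimal, self-contained sketch of both halves of that contract; the expand method and the example URLs are hypothetical, and only the three fess-crawler classes named in the imports are assumed.

import java.util.Deque;
import java.util.LinkedList;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;

public class ChildUrlsContractSketch {

    // Producer side: signal that a URL expands into children instead of content.
    static void expand(final String... childUrls) {
        final Set<RequestData> children = Stream.of(childUrls)
                .map(u -> RequestDataBuilder.newRequestData().get().url(u).build())
                .collect(Collectors.toSet());
        throw new ChildUrlsException(children, ChildUrlsContractSketch.class.getName() + "#expand");
    }

    // Consumer side: drain the children into a local queue, as addDocument does above.
    public static void main(final String[] args) {
        final Deque<String> urlQueue = new LinkedList<>();
        try {
            expand("smb://host/share/a.txt", "smb://host/share/b.txt");
        } catch (final ChildUrlsException e) {
            e.getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
        }
        urlQueue.forEach(System.out::println);
    }
}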
Use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess by codelibs.
The class FileListIndexUpdateCallbackImpl, method processRequest:
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url,
        final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            return responseData.getRedirectLocation();
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        if (dataMap.containsKey(Constants.SESSION_ID)) {
            responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
        } else {
            responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
        }
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            logger.warn("No url rule. Data: {}", dataMap);
        } else {
            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (responseProcessor instanceof DefaultResponseProcessor) {
                final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                final ResultData resultData = transformer.transform(responseData);
                final byte[] data = resultData.getData();
                if (data != null) {
                    try {
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                        dataMap.putAll(responseDataMap);
                    } catch (final Exception e) {
                        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                    }
                }
                // remove
                String[] ignoreFields;
                if (paramMap.containsKey("ignore.field.names")) {
                    ignoreFields = paramMap.get("ignore.field.names").split(",");
                } else {
                    ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
                }
                stream(ignoreFields).of(stream -> stream.map(String::trim).forEach(s -> dataMap.remove(s)));
                indexUpdateCallback.store(paramMap, dataMap);
            } else {
                logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}",
                        responseProcessor, dataMap);
            }
        }
        return null;
    } catch (final ChildUrlsException e) {
        throw new DataStoreCrawlingException(url,
                "Redirected to " + e.getChildUrlList().stream().map(RequestData::getUrl).collect(Collectors.joining(", ")), e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
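One non-JDK idiom in this method deserves a note: stream(ignoreFields).of(...) is, by all appearances, the StreamUtil helper from codelibs core (org.codelibs.core.stream.StreamUtil), which wraps a possibly null array as a java.util.stream.Stream and passes it to a consumer. A small sketch under that assumption, with invented map contents:

import static org.codelibs.core.stream.StreamUtil.stream;

import java.util.HashMap;
import java.util.Map;

public class IgnoreFieldsSketch {
    public static void main(final String[] args) {
        final Map<String, Object> dataMap = new HashMap<>();
        dataMap.put("url", "ftp://example.com/a.txt");
        dataMap.put("sessionId", "20240101000000");
        // Mirrors the removal step in processRequest: split the configured
        // names, trim each one, and drop it from the document map.
        final String[] ignoreFields = " sessionId , indexingTarget".split(",");
        stream(ignoreFields).of(stream -> stream.map(String::trim).forEach(dataMap::remove));
        System.out.println(dataMap); // {url=ftp://example.com/a.txt}
    }
}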
Use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.
The class FtpClient, method doHead:
/*
* (non-Javadoc)
*
* @see org.codelibs.robot.client.S2RobotClient#doHead(java.lang.String)
*/
@Override
public ResponseData doHead(final String url) {
    try {
        final ResponseData responseData = processRequest(url, false);
        responseData.setMethod(Constants.HEAD_METHOD);
        return responseData;
    } catch (final ChildUrlsException e) {
        return null;
    }
}
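The null return on ChildUrlsException is worth noting: processRequest raises that exception when the FTP target turns out to be a folder (see updateResponseData below), and a folder has child URLs to enumerate rather than headers to report, so doHead swallows the exception and returns null instead of propagating it.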
Use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.
The class FtpClient, method updateResponseData:
protected void updateResponseData(final String uri, final boolean includeContent, final ResponseData responseData,
        final FTPClient client, final FtpInfo ftpInfo, final FTPFile file) {
    if (file == null) {
        responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
        responseData.setCharSet(charset);
        responseData.setContentLength(0);
        ftpClientQueue.offer(client);
        return;
    }
    if (file.isSymbolicLink()) {
        final String link = file.getLink();
        String redirect = null;
        if (link == null) {
            responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
            ftpClientQueue.offer(client);
            return;
        } else if (link.startsWith("/")) {
            redirect = ftpInfo.toUrl(file.getLink());
        } else if (link.startsWith("../")) {
            redirect = ftpInfo.toChildUrl(file.getLink());
        } else {
            redirect = ftpInfo.toChildUrl("../" + file.getLink());
        }
        if (!uri.equals(redirect)) {
            responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
            responseData.setRedirectLocation(redirect);
            ftpClientQueue.offer(client);
            return;
        }
    }
    if (file.isFile()) {
        responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
        responseData.setCharSet(Constants.UTF_8);
        responseData.setLastModified(file.getTimestamp().getTime());
        // check file size
        responseData.setContentLength(file.getSize());
        checkMaxContentLength(responseData);
        if (file.getUser() != null) {
            responseData.addMetaData(FTP_FILE_USER, file.getUser());
        }
        if (file.getGroup() != null) {
            responseData.addMetaData(FTP_FILE_GROUP, file.getGroup());
        }
        if (includeContent) {
            File tempFile = null;
            File outputFile = null;
            try {
                tempFile = File.createTempFile("ftp-", ".tmp");
                try (OutputStream out = new BufferedOutputStream(new FileOutputStream(tempFile))) {
                    if (!client.retrieveFile(ftpInfo.getName(), out)) {
                        throw new CrawlingAccessException("Failed to retrieve: " + ftpInfo.toUrl());
                    }
                }
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                try (InputStream is = new FileInputStream(tempFile)) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                } catch (final Exception e) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                }
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength()
                                + " byte) is over " + maxLength + " byte. The url is " + uri);
                    }
                }
                responseData.setCharSet(geCharSet(tempFile));
                if (tempFile.length() < maxCachedContentSize) {
                    try (InputStream contentStream = new BufferedInputStream(new FileInputStream(tempFile))) {
                        responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                    }
                } else {
                    outputFile = File.createTempFile("crawler-FtpClient-", ".out");
                    CopyUtil.copy(tempFile, outputFile);
                    responseData.setResponseBody(outputFile, true);
                }
                ftpClientQueue.offer(client);
            } catch (final CrawlingAccessException e) {
                ftpClientQueue.offer(client);
                throw e;
            } catch (final Exception e) {
                logger.warn("I/O Exception.", e);
                disconnectInternalClient(client);
                responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
            } finally {
                if (tempFile != null && !tempFile.delete()) {
                    logger.warn("Could not delete " + tempFile.getAbsolutePath());
                }
            }
        }
    } else if (file.isDirectory() || file.isSymbolicLink()) {
        final Set<RequestData> requestDataSet = new HashSet<>();
        if (includeContent) {
            try {
                final FTPFile[] ftpFiles = client.listFiles(ftpInfo.getName(), FTPFileFilters.NON_NULL);
                validateRequest(client);
                for (final FTPFile f : ftpFiles) {
                    final String childUri = ftpInfo.toChildUrl(f.getName());
                    requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                }
            } catch (final IOException e) {
                disconnectInternalClient(client);
                throw new CrawlingAccessException("Could not access " + uri, e);
            }
        }
        ftpClientQueue.offer(client);
        throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
    } else {
        responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE);
        responseData.setCharSet(charset);
        responseData.setContentLength(0);
        ftpClientQueue.offer(client);
    }
}
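Two details of the isFile branch above are easy to miss. Every exit path either returns the pooled FTPClient to ftpClientQueue or, on unexpected I/O failure, disconnects it so a broken connection is never reused. And the body is buffered adaptively: files smaller than maxCachedContentSize are loaded into memory, while larger ones are copied to a second temporary file handed to setResponseBody together with a flag that, by its usage here, marks the file as disposable. A dependency-free sketch of that spill decision, where MAX_CACHED stands in for maxCachedContentSize and the file contents are invented:

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.StandardCopyOption;

public class SpillSketch {

    static final long MAX_CACHED = 1_000_000L; // stand-in for maxCachedContentSize

    public static void main(final String[] args) throws IOException {
        final File tempFile = File.createTempFile("ftp-", ".tmp");
        Files.write(tempFile.toPath(), "hello".getBytes(StandardCharsets.UTF_8));
        try {
            if (tempFile.length() < MAX_CACHED) {
                // Small body: keep the bytes in memory.
                final byte[] body = Files.readAllBytes(tempFile.toPath());
                System.out.println("cached " + body.length + " bytes in memory");
            } else {
                // Large body: spill to a second temp file that outlives tempFile.
                final File outputFile = File.createTempFile("crawler-FtpClient-", ".out");
                Files.copy(tempFile.toPath(), outputFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
                System.out.println("spilled to " + outputFile.getAbsolutePath());
            }
        } finally {
            // Mirror the finally block above: the download buffer is always deleted.
            if (!tempFile.delete()) {
                System.err.println("Could not delete " + tempFile.getAbsolutePath());
            }
        }
    }
}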
Use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.
The class CrawlerThread, method run:
/*
* (non-Javadoc)
*
* @see java.lang.Runnable#run()
*/
@Override
public void run() {
    log(logHelper, LogType.START_THREAD, crawlerContext);
    int threadCheckCount = 0;
    // set urlQueue to thread
    CrawlingParameterUtil.setCrawlerContext(crawlerContext);
    CrawlingParameterUtil.setUrlQueueService(urlQueueService);
    CrawlingParameterUtil.setDataService(dataService);
    try {
        while (crawlerContext.getStatus() != CrawlerStatus.DONE && isContinue(threadCheckCount)) {
            final UrlQueue<?> urlQueue = urlQueueService.poll(crawlerContext.sessionId);
            if (isValid(urlQueue)) {
                ResponseData responseData = null;
                log(logHelper, LogType.START_CRAWLING, crawlerContext, urlQueue);
                try {
                    final CrawlerClient client = getClient(urlQueue.getUrl());
                    if (client == null) {
                        log(logHelper, LogType.UNSUPPORTED_URL_AT_CRAWLING_STARTED, crawlerContext, urlQueue);
                        continue;
                    }
                    startCrawling();
                    // set urlQueue to thread
                    CrawlingParameterUtil.setUrlQueue(urlQueue);
                    if (crawlerContext.intervalController != null) {
                        crawlerContext.intervalController.delay(IntervalController.PRE_PROCESSING);
                    }
                    final boolean contentUpdated = isContentUpdated(client, urlQueue);
                    if (contentUpdated) {
                        log(logHelper, LogType.GET_CONTENT, crawlerContext, urlQueue);
                        // access an url
                        final long startTime = SystemUtil.currentTimeMillis();
                        responseData = client.execute(
                                RequestDataBuilder.newRequestData().method(urlQueue.getMethod()).url(urlQueue.getUrl()).build());
                        responseData.setExecutionTime(SystemUtil.currentTimeMillis() - startTime);
                        responseData.setParentUrl(urlQueue.getParentUrl());
                        responseData.setSessionId(crawlerContext.sessionId);
                        if (responseData.getRedirectLocation() == null) {
                            log(logHelper, LogType.PROCESS_RESPONSE, crawlerContext, urlQueue, responseData);
                            processResponse(urlQueue, responseData);
                        } else {
                            log(logHelper, LogType.REDIRECT_LOCATION, crawlerContext, urlQueue, responseData);
                            // redirect
                            storeChildUrl(responseData.getRedirectLocation(), urlQueue.getUrl(), null,
                                    urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
                        }
                    }
                    log(logHelper, LogType.FINISHED_CRAWLING, crawlerContext, urlQueue);
                } catch (final ChildUrlsException e) {
                    try {
                        final Set<RequestData> childUrlSet = e.getChildUrlList();
                        log(logHelper, LogType.PROCESS_CHILD_URLS_BY_EXCEPTION, crawlerContext, urlQueue, childUrlSet);
                        // add an url
                        storeChildUrls(childUrlSet, urlQueue.getUrl(), urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
                    } catch (final Exception e1) {
                        log(logHelper, LogType.CRAWLING_EXCETPION, crawlerContext, urlQueue, e1);
                    }
                    if (noWaitOnFolder) {
                        continue;
                    }
                } catch (final CrawlingAccessException e) {
                    log(logHelper, LogType.CRAWLING_ACCESS_EXCEPTION, crawlerContext, urlQueue, e);
                } catch (final Throwable e) {
                    log(logHelper, LogType.CRAWLING_EXCETPION, crawlerContext, urlQueue, e);
                } finally {
                    addSitemapsFromRobotsTxt(urlQueue);
                    if (responseData != null) {
                        CloseableUtil.closeQuietly(responseData);
                    }
                    if (crawlerContext.intervalController != null) {
                        crawlerContext.intervalController.delay(IntervalController.POST_PROCESSING);
                    }
                    // clear
                    threadCheckCount = 0;
                    // remove urlQueue from thread
                    CrawlingParameterUtil.setUrlQueue(null);
                    finishCrawling();
                }
            } else {
                log(logHelper, LogType.NO_URL_IN_QUEUE, crawlerContext, urlQueue, Integer.valueOf(threadCheckCount));
                if (crawlerContext.intervalController != null) {
                    crawlerContext.intervalController.delay(IntervalController.NO_URL_IN_QUEUE);
                }
                threadCheckCount++;
            }
            // interval
            if (crawlerContext.intervalController != null) {
                crawlerContext.intervalController.delay(IntervalController.WAIT_NEW_URL);
            }
        }
    } catch (final Throwable t) {
        log(logHelper, LogType.SYSTEM_ERROR, t);
    } finally {
        // remove crawlerContext from thread
        CrawlingParameterUtil.setCrawlerContext(null);
        CrawlingParameterUtil.setUrlQueueService(null);
        CrawlingParameterUtil.setDataService(null);
    }
    log(logHelper, LogType.FINISHED_THREAD, crawlerContext);
}
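Pulling the pieces together, the ChildUrlsException handler in run() is what turns one queue entry into many: storeChildUrls records each child under the parent's URL at depth + 1 (a null parent depth counts as the first level). Here is a compact single-threaded sketch of that loop; it uses the real ChildUrlsException and RequestDataBuilder, but the crawl method below is a hypothetical stand-in for client.execute plus processResponse.

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Set;

import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;

public class MiniCrawlLoop {

    // Queue entry pairing a URL with its crawl depth.
    record Entry(String url, int depth) {}

    // Hypothetical stand-in for client.execute(): URLs ending in "/" are
    // treated as folders and expanded into two plain child files.
    static void crawl(final String url) {
        if (url.endsWith("/")) {
            final Set<RequestData> children = Set.of(
                    RequestDataBuilder.newRequestData().get().url(url + "a.txt").build(),
                    RequestDataBuilder.newRequestData().get().url(url + "b.txt").build());
            throw new ChildUrlsException(children, "MiniCrawlLoop#crawl");
        }
        System.out.println("fetched " + url);
    }

    public static void main(final String[] args) {
        final Deque<Entry> queue = new ArrayDeque<>();
        queue.offer(new Entry("ftp://example.com/pub/", 0));
        while (!queue.isEmpty()) {
            final Entry entry = queue.poll();
            try {
                crawl(entry.url());
            } catch (final ChildUrlsException e) {
                // Mirrors storeChildUrls: re-queue each child one level deeper.
                for (final RequestData child : e.getChildUrlList()) {
                    queue.offer(new Entry(child.getUrl(), entry.depth() + 1));
                }
            }
        }
    }
}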