Use of org.codelibs.fess.crawler.client.CrawlerClient in project fess-crawler by codelibs.
Class CrawlerThread, method run().
/*
* (non-Javadoc)
*
* @see java.lang.Runnable#run()
*/
@Override
public void run() {
    log(logHelper, LogType.START_THREAD, crawlerContext);
    int threadCheckCount = 0;
    // bind the crawler context and services to this thread
    CrawlingParameterUtil.setCrawlerContext(crawlerContext);
    CrawlingParameterUtil.setUrlQueueService(urlQueueService);
    CrawlingParameterUtil.setDataService(dataService);
    try {
        while (crawlerContext.getStatus() != CrawlerStatus.DONE && isContinue(threadCheckCount)) {
            final UrlQueue<?> urlQueue = urlQueueService.poll(crawlerContext.sessionId);
            if (isValid(urlQueue)) {
                ResponseData responseData = null;
                log(logHelper, LogType.START_CRAWLING, crawlerContext, urlQueue);
                try {
                    final CrawlerClient client = getClient(urlQueue.getUrl());
                    if (client == null) {
                        log(logHelper, LogType.UNSUPPORTED_URL_AT_CRAWLING_STARTED, crawlerContext, urlQueue);
                        continue;
                    }
                    startCrawling();
                    // set urlQueue to thread
                    CrawlingParameterUtil.setUrlQueue(urlQueue);
                    if (crawlerContext.intervalController != null) {
                        crawlerContext.intervalController.delay(IntervalController.PRE_PROCESSING);
                    }
                    final boolean contentUpdated = isContentUpdated(client, urlQueue);
                    if (contentUpdated) {
                        log(logHelper, LogType.GET_CONTENT, crawlerContext, urlQueue);
                        // access the URL
                        final long startTime = SystemUtil.currentTimeMillis();
                        responseData = client.execute(
                                RequestDataBuilder.newRequestData().method(urlQueue.getMethod()).url(urlQueue.getUrl()).build());
                        responseData.setExecutionTime(SystemUtil.currentTimeMillis() - startTime);
                        responseData.setParentUrl(urlQueue.getParentUrl());
                        responseData.setSessionId(crawlerContext.sessionId);
                        if (responseData.getRedirectLocation() == null) {
                            log(logHelper, LogType.PROCESS_RESPONSE, crawlerContext, urlQueue, responseData);
                            processResponse(urlQueue, responseData);
                        } else {
                            log(logHelper, LogType.REDIRECT_LOCATION, crawlerContext, urlQueue, responseData);
                            // redirect
                            storeChildUrl(responseData.getRedirectLocation(), urlQueue.getUrl(), null,
                                    urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
                        }
                    }
                    log(logHelper, LogType.FINISHED_CRAWLING, crawlerContext, urlQueue);
                } catch (final ChildUrlsException e) {
                    try {
                        final Set<RequestData> childUrlSet = e.getChildUrlList();
                        log(logHelper, LogType.PROCESS_CHILD_URLS_BY_EXCEPTION, crawlerContext, urlQueue, childUrlSet);
                        // add child URLs to the queue
                        storeChildUrls(childUrlSet, urlQueue.getUrl(),
                                urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
                    } catch (final Exception e1) {
                        log(logHelper, LogType.CRAWLING_EXCETPION, crawlerContext, urlQueue, e1);
                    }
                    if (noWaitOnFolder) {
                        continue;
                    }
                } catch (final CrawlingAccessException e) {
                    log(logHelper, LogType.CRAWLING_ACCESS_EXCEPTION, crawlerContext, urlQueue, e);
                } catch (final Throwable e) {
                    log(logHelper, LogType.CRAWLING_EXCETPION, crawlerContext, urlQueue, e);
                } finally {
                    addSitemapsFromRobotsTxt(urlQueue);
                    if (responseData != null) {
                        CloseableUtil.closeQuietly(responseData);
                    }
                    if (crawlerContext.intervalController != null) {
                        crawlerContext.intervalController.delay(IntervalController.POST_PROCESSING);
                    }
                    // reset the empty-queue counter
                    threadCheckCount = 0;
                    // remove urlQueue from thread
                    CrawlingParameterUtil.setUrlQueue(null);
                    finishCrawling();
                }
            } else {
                log(logHelper, LogType.NO_URL_IN_QUEUE, crawlerContext, urlQueue, Integer.valueOf(threadCheckCount));
                if (crawlerContext.intervalController != null) {
                    crawlerContext.intervalController.delay(IntervalController.NO_URL_IN_QUEUE);
                }
                threadCheckCount++;
            }
            // interval
            if (crawlerContext.intervalController != null) {
                crawlerContext.intervalController.delay(IntervalController.WAIT_NEW_URL);
            }
        }
    } catch (final Throwable t) {
        log(logHelper, LogType.SYSTEM_ERROR, t);
    } finally {
        // remove crawlerContext from thread
        CrawlingParameterUtil.setCrawlerContext(null);
        CrawlingParameterUtil.setUrlQueueService(null);
        CrawlingParameterUtil.setDataService(null);
    }
    log(logHelper, LogType.FINISHED_THREAD, crawlerContext);
}
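Taken on its own, the fetch cycle inside run() reduces to: resolve a client for the URL, execute the request, and always close the response. The following is a minimal sketch of that cycle, assuming a CrawlerClientFactory configured elsewhere; clientFactory and url are illustrative names, not part of the source above.

final CrawlerClient client = clientFactory.getClient(url); // null when no client supports the scheme
if (client != null) {
    ResponseData responseData = null;
    try {
        final long startTime = SystemUtil.currentTimeMillis();
        // a plain GET; run() above replays the method recorded in the URL queue instead
        responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build());
        responseData.setExecutionTime(SystemUtil.currentTimeMillis() - startTime);
        // hand responseData to response processing here
    } finally {
        if (responseData != null) {
            CloseableUtil.closeQuietly(responseData); // release the underlying stream
        }
    }
}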
Use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.
Class ViewHelper, method asContentResponse().
public StreamResponse asContentResponse(final Map<String, Object> doc) {
    if (logger.isDebugEnabled()) {
        logger.debug("writing the content of: {}", doc);
    }
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    final String configId = DocumentUtil.getValue(doc, fessConfig.getIndexFieldConfigId(), String.class);
    if (configId == null) {
        throw new FessSystemException("configId is null.");
    }
    if (configId.length() < 2) {
        throw new FessSystemException("Invalid configId: " + configId);
    }
    final CrawlingConfig config = crawlingConfigHelper.getCrawlingConfig(configId);
    if (config == null) {
        throw new FessSystemException("No crawlingConfig: " + configId);
    }
    final String url = DocumentUtil.getValue(doc, fessConfig.getIndexFieldUrl(), String.class);
    final CrawlerClientFactory crawlerClientFactory =
            config.initializeClientFactory(() -> ComponentUtil.getComponent(CrawlerClientFactory.class));
    final CrawlerClient client = crawlerClientFactory.getClient(url);
    if (client == null) {
        throw new FessSystemException("No CrawlerClient: " + configId + ", url: " + url);
    }
    return writeContent(configId, url, client);
}
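As a usage sketch: asContentResponse expects a search-result document map carrying the config ID and URL fields. The getters below resolve the real field names from FessConfig at runtime; the viewHelper lookup and the literal values are assumptions for illustration, not taken from the source above.

final ViewHelper viewHelper = ComponentUtil.getViewHelper(); // assumed container lookup
final FessConfig fessConfig = ComponentUtil.getFessConfig();
final Map<String, Object> doc = new HashMap<>();
doc.put(fessConfig.getIndexFieldConfigId(), "W1234567890"); // must be at least 2 characters
doc.put(fessConfig.getIndexFieldUrl(), "https://example.com/doc.pdf");
final StreamResponse response = viewHelper.asContentResponse(doc); // streams the crawled content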
Use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.
Class DocumentHelper, method processRequest().
public Map<String, Object> processRequest(final CrawlingConfig crawlingConfig, final String crawlingInfoId, final String url) {
    if (StringUtil.isBlank(crawlingInfoId)) {
        throw new CrawlingAccessException("sessionId is null.");
    }
    final CrawlerClientFactory crawlerClientFactory = crawlingConfig.initializeClientFactory(ComponentUtil::getCrawlerClientFactory);
    final CrawlerClient client = crawlerClientFactory.getClient(url);
    if (client == null) {
        throw new CrawlingAccessException("CrawlerClient is null for " + url);
    }
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            final Set<RequestData> childUrlList = new HashSet<>();
            childUrlList.add(RequestDataBuilder.newRequestData().get().url(responseData.getRedirectLocation()).build());
            throw new ChildUrlsException(childUrlList, this.getClass().getName() + "#RedirectedFrom:" + url);
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        responseData.setSessionId(crawlingInfoId);
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            throw new CrawlingAccessException("No url rule for " + url);
        }
        responseData.setRuleId(rule.getRuleId());
        final ResponseProcessor responseProcessor = rule.getResponseProcessor();
        if (!(responseProcessor instanceof DefaultResponseProcessor)) {
            throw new CrawlingAccessException("The response processor is not DefaultResponseProcessor. responseProcessor: "
                    + responseProcessor + ", url: " + url);
        }
        final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
        final ResultData resultData = transformer.transform(responseData);
        final byte[] data = resultData.getData();
        if (data != null) {
            try {
                return (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
            } catch (final Exception e) {
                throw new CrawlerSystemException("Could not create an instance from bytes.", e);
            }
        }
        return null;
    } catch (final Exception e) {
        throw new CrawlingAccessException("Failed to parse " + url, e);
    }
}
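processRequest is effectively a one-shot crawl of a single URL that returns the transformed data map, which is how a document can be previewed under a given crawling config. A hypothetical invocation, assuming documentHelper, crawlingConfig, and crawlingInfoId are already resolved from the container (the names are illustrative):

final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper(); // assumed container lookup
// crawlingInfoId doubles as the crawler session id and must not be blank
final Map<String, Object> dataMap = documentHelper.processRequest(crawlingConfig, crawlingInfoId, "https://example.com/");
if (dataMap != null) {
    // the map holds the same fields the indexer would receive (title, content, ...)
    dataMap.forEach((field, value) -> logger.debug("{}={}", field, value));
}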
Use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.
Class FessCrawlerThread, method isContentUpdated().
@Override
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
    if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
        final long startTime = System.currentTimeMillis();
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
        final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
        final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
        final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
        final String url = urlQueue.getUrl();
        ResponseData responseData = null;
        try {
            final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.put(fessConfig.getIndexFieldUrl(), url);
            final List<String> roleTypeList = new ArrayList<>();
            stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
            if (url.startsWith("smb:") || url.startsWith("smb1:") || url.startsWith("file:") || url.startsWith("ftp:")) {
                if (url.endsWith("/")) {
                    // directory
                    return true;
                }
                final PermissionHelper permissionHelper = ComponentUtil.getPermissionHelper();
                if (fessConfig.isSmbRoleFromFile() || fessConfig.isFileRoleFromFile() || fessConfig.isFtpRoleFromFile()) {
                    // HEAD request
                    responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                    if (responseData == null) {
                        return true;
                    }
                    roleTypeList.addAll(permissionHelper.getSmbRoleTypeList(responseData));
                    roleTypeList.addAll(permissionHelper.getFileRoleTypeList(responseData));
                    roleTypeList.addAll(permissionHelper.getFtpRoleTypeList(responseData));
                }
            }
            dataMap.put(fessConfig.getIndexFieldRole(), roleTypeList);
            final String id = crawlingInfoHelper.generateId(dataMap);
            if (logger.isDebugEnabled()) {
                logger.debug("Searching indexed document: {}", id);
            }
            final Map<String, Object> document = indexingHelper.getDocument(searchEngineClient, id,
                    new String[] { fessConfig.getIndexFieldId(), fessConfig.getIndexFieldLastModified(),
                            fessConfig.getIndexFieldAnchor(), fessConfig.getIndexFieldSegment(),
                            fessConfig.getIndexFieldExpires(), fessConfig.getIndexFieldClickCount(),
                            fessConfig.getIndexFieldFavoriteCount() });
            if (document == null) {
                storeChildUrlsToQueue(urlQueue, getChildUrlSet(searchEngineClient, id));
                return true;
            }
            final Date expires = DocumentUtil.getValue(document, fessConfig.getIndexFieldExpires(), Date.class);
            if (expires != null && expires.getTime() < System.currentTimeMillis()) {
                final Object idValue = document.get(fessConfig.getIndexFieldId());
                if (idValue != null && !indexingHelper.deleteDocument(searchEngineClient, idValue.toString())) {
                    logger.debug("Failed to delete expired document: {}", url);
                }
                return true;
            }
            final Date lastModified = DocumentUtil.getValue(document, fessConfig.getIndexFieldLastModified(), Date.class);
            if (lastModified == null) {
                return true;
            }
            urlQueue.setLastModified(lastModified.getTime());
            log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);
            if (responseData == null) {
                // HEAD request
                responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                if (responseData == null) {
                    return true;
                }
            }
            final int httpStatusCode = responseData.getHttpStatusCode();
            if (logger.isDebugEnabled()) {
                logger.debug("Accessing document: {}, status: {}", url, httpStatusCode);
            }
            if (httpStatusCode == 404) {
                storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
                if (!indexingHelper.deleteDocument(searchEngineClient, id)) {
                    logger.debug("Failed to delete 404 document: {}", url);
                }
                return false;
            }
            if (responseData.getLastModified() == null) {
                return true;
            }
            if (responseData.getLastModified().getTime() <= lastModified.getTime() && httpStatusCode == 200) {
                log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
                responseData.setExecutionTime(System.currentTimeMillis() - startTime);
                responseData.setParentUrl(urlQueue.getParentUrl());
                responseData.setSessionId(crawlerContext.getSessionId());
                responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
                processResponse(urlQueue, responseData);
                storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
                final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
                if (documentExpires != null
                        && !indexingHelper.updateDocument(searchEngineClient, id, fessConfig.getIndexFieldExpires(), documentExpires)) {
                    logger.debug("Failed to update {} at {}", fessConfig.getIndexFieldExpires(), url);
                }
                return false;
            }
        } finally {
            if (responseData != null) {
                CloseableUtil.closeQuietly(responseData);
            }
        }
    }
    return true;
}
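Stripped of role handling and index bookkeeping, the heart of this incremental check is a HEAD request whose Last-Modified header is compared against the value stored in the index. A reduced sketch follows, where indexedLastModified is a hypothetical stand-in for the Date read from the indexed document:

try (final ResponseData head = client.execute(RequestDataBuilder.newRequestData().head().url(url).build())) {
    if (head == null || head.getLastModified() == null) {
        return true; // nothing to compare against, so crawl again
    }
    // unchanged content: the indexed copy is still current, so skip re-crawling
    final boolean notModified = head.getHttpStatusCode() == 200
            && head.getLastModified().getTime() <= indexedLastModified.getTime();
    return !notModified;
}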