use of org.codelibs.fess.es.client.SearchEngineClient in project fess by codelibs.
the class KeyMatchHelper method getBoostedDocumentList.
/**
 * Returns the documents boosted by the given key match.
 *
 * <p>The boost entries registered under the key match's virtual host (a blank host is
 * replaced by the empty string) and lower-cased term are scanned for the entry whose first
 * tuple value equals the key match ID. The query held in that entry's second tuple value is
 * then executed against the document search index, limited to the key match's max size.</p>
 *
 * @param keyMatch the key match whose boosted documents are requested
 * @return the result of the search, or an empty list when no boost list exists for the
 *         host/term or no entry carries the key match ID
 */
public List<Map<String, Object>> getBoostedDocumentList(final KeyMatch keyMatch) {
    final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
    final String configuredHost = keyMatch.getVirtualHost();
    final String hostKey = StringUtil.isBlank(configuredHost) ? StringUtil.EMPTY : configuredHost;
    final List<Tuple3<String, QueryBuilder, ScoreFunctionBuilder<?>>> candidates =
            getQueryMap(hostKey).get(toLowerCase(keyMatch.getTerm()));
    if (candidates != null) {
        for (final Tuple3<String, QueryBuilder, ScoreFunctionBuilder<?>> entry : candidates) {
            // Only the entry registered for this very key match is of interest.
            if (keyMatch.getId().equals(entry.getValue1())) {
                final FessConfig fessConfig = ComponentUtil.getFessConfig();
                return searchEngineClient.getDocumentList(fessConfig.getIndexDocumentSearchIndex(), requestBuilder -> {
                    requestBuilder.setPreference(Constants.SEARCH_PREFERENCE_LOCAL)
                            .setQuery(entry.getValue2())
                            .setSize(keyMatch.getMaxSize());
                    return true;
                });
            }
        }
    }
    return Collections.emptyList();
}
use of org.codelibs.fess.es.client.SearchEngineClient in project fess by codelibs.
the class IndexUpdater method run.
/**
 * Main loop of the index updater.
 *
 * Repeatedly fetches access results for the sessions in sessionIdList, hands them to
 * processAccessResults / cleanupAccessResults, and sends the collected documents to the
 * search engine through indexingHelper. The loop runs until finishCrawling is set and the
 * access result list is empty, or until the component container becomes unavailable.
 *
 * executeTime and documentSize are reset on entry; the accumulated executeTime is logged
 * at INFO level on exit.
 *
 * @throws FessSystemException if dataService is null
 */
@Override
public void run() {
if (dataService == null) {
throw new FessSystemException("DataService is null.");
}
if (logger.isDebugEnabled()) {
logger.debug("Starting indexUpdater.");
}
executeTime = 0;
documentSize = 0;
final FessConfig fessConfig = ComponentUtil.getFessConfig();
final long updateInterval = fessConfig.getIndexerWebfsUpdateIntervalAsInteger().longValue();
final int maxEmptyListCount = fessConfig.getIndexerWebfsMaxEmptyListCountAsInteger();
final IntervalControlHelper intervalControlHelper = ComponentUtil.getIntervalControlHelper();
try {
// Search condition reused for every fetch: access results whose session ID is in
// sessionIdList and whose status is OK_STATUS, from offset 0, oldest CREATE_TIME first.
final Consumer<SearchRequestBuilder> cb = builder -> {
final QueryBuilder queryBuilder = QueryBuilders.boolQuery().filter(QueryBuilders.termsQuery(EsAccessResult.SESSION_ID, sessionIdList)).filter(QueryBuilders.termQuery(EsAccessResult.STATUS, org.codelibs.fess.crawler.Constants.OK_STATUS));
builder.setQuery(queryBuilder);
builder.setFrom(0);
final int maxDocumentCacheSize = fessConfig.getIndexerWebfsMaxDocumentCacheSizeAsInteger();
// A non-positive configured cache size falls back to a page size of 1.
builder.setSize(maxDocumentCacheSize <= 0 ? 1 : maxDocumentCacheSize);
builder.addSort(EsAccessResult.CREATE_TIME, SortOrder.ASC);
};
final DocList docList = new DocList();
final List<EsAccessResult> accessResultList = new ArrayList<>();
long updateTime = System.currentTimeMillis();
int errorCount = 0;
int emptyListCount = 0;
long cleanupTime = -1;
while (!finishCrawling || !accessResultList.isEmpty()) {
try {
// Size snapshot taken before processing; compared again further down to decide
// whether finished-session data can be cleaned up.
final int sessionIdListSize = finishedSessionIdList.size();
intervalControlHelper.setCrawlerRunning(true);
// updateTime is reused here: it now holds the time elapsed since it was last stamped.
updateTime = System.currentTimeMillis() - updateTime;
// Sleep only for the part of the update interval that has not already elapsed.
final long interval = updateInterval - updateTime;
if (interval > 0) {
// sleep
// 10 sec (default)
ThreadUtil.sleep(interval);
}
systemHelper.calibrateCpuLoad();
docList.clear();
accessResultList.clear();
intervalControlHelper.delayByRules();
if (logger.isDebugEnabled()) {
logger.debug("Processing documents in IndexUpdater queue.");
}
// Stamp the start of this processing round (used for executeTime and the next interval).
updateTime = System.currentTimeMillis();
List<EsAccessResult> arList = getAccessResultList(cb, cleanupTime);
// Count consecutive empty fetches; any non-empty fetch resets the counter.
if (arList.isEmpty()) {
emptyListCount++;
} else {
// reset
emptyListCount = 0;
}
// NOTE(review): the cast assumes getAccessResultList always returns an EsResultList - confirm.
long hitCount = ((EsResultList<EsAccessResult>) arList).getTotalHits();
// Keep fetching for as long as the query reports a positive total hit count.
while (hitCount > 0) {
if (arList.isEmpty()) {
// Hits are reported but the fetched list is empty: wait for the configured
// commit margin time and retry with cleanupTime reset to -1.
ThreadUtil.sleep(fessConfig.getIndexerWebfsCommitMarginTimeAsInteger().longValue());
cleanupTime = -1;
} else {
processAccessResults(docList, accessResultList, arList);
// The value returned by the cleanup is passed to the next fetch.
cleanupTime = cleanupAccessResults(accessResultList);
}
arList = getAccessResultList(cb, cleanupTime);
hitCount = ((EsResultList<EsAccessResult>) arList).getTotalHits();
}
// Send whatever is still held in docList after the fetch loop.
if (!docList.isEmpty()) {
indexingHelper.sendDocuments(searchEngineClient, docList);
}
synchronized (finishedSessionIdList) {
// Clean up only when the finished-session list is non-empty and has the same
// size as the snapshot taken at the top of this iteration (outside this lock).
if (sessionIdListSize != 0 && sessionIdListSize == finishedSessionIdList.size()) {
cleanupFinishedSessionData();
}
}
executeTime += System.currentTimeMillis() - updateTime;
if (logger.isDebugEnabled()) {
logger.debug("Processed documents in IndexUpdater queue.");
}
// reset count
errorCount = 0;
} catch (final Exception e) {
// Failures are tolerated and retried on the next iteration; the exception is
// rethrown only once errorCount has already exceeded maxErrorCount.
if (errorCount > maxErrorCount) {
throw e;
}
errorCount++;
logger.warn("Failed to access data. Retry to access it {} times.", errorCount, e);
} finally {
// A force stop reported by systemHelper marks crawling as finished.
if (systemHelper.isForceStop()) {
finishCrawling = true;
if (logger.isDebugEnabled()) {
logger.debug("Stopped indexUpdater.");
}
}
}
// Too many consecutive empty fetches: stop crawling and record a "QueueTimeout" error.
if (emptyListCount >= maxEmptyListCount) {
if (logger.isInfoEnabled()) {
logger.info("Terminating indexUpdater. emptyListCount is over {}.", maxEmptyListCount);
}
// terminate crawling
finishCrawling = true;
forceStop();
if (fessConfig.getIndexerThreadDumpEnabledAsBoolean()) {
ThreadDumpUtil.printThreadDump();
}
org.codelibs.fess.exec.Crawler.addError("QueueTimeout");
}
// Leave the loop immediately when ComponentUtil reports it is not available.
if (!ComponentUtil.available()) {
logger.info("IndexUpdater is terminated.");
forceStop();
break;
}
}
if (logger.isDebugEnabled()) {
logger.debug("Finished indexUpdater.");
}
} catch (final ContainerNotAvailableException e) {
// The stack trace is logged only when debug logging is enabled.
if (logger.isDebugEnabled()) {
logger.error("IndexUpdater is terminated.", e);
} else if (logger.isInfoEnabled()) {
logger.info("IndexUpdater is terminated.");
}
forceStop();
} catch (final Throwable t) {
// While components are available the error is always logged. Otherwise it is logged
// according to the enabled log level and its simple class name is recorded as a
// Crawler error.
if (ComponentUtil.available()) {
logger.error("IndexUpdater is terminated.", t);
} else if (logger.isDebugEnabled()) {
logger.error("IndexUpdater is terminated.", t);
org.codelibs.fess.exec.Crawler.addError(t.getClass().getSimpleName());
} else if (logger.isInfoEnabled()) {
logger.info("IndexUpdater is terminated.");
org.codelibs.fess.exec.Crawler.addError(t.getClass().getSimpleName());
}
forceStop();
} finally {
// The crawler-running flag is set to true on every exit path.
intervalControlHelper.setCrawlerRunning(true);
}
if (logger.isInfoEnabled()) {
logger.info("[EXEC TIME] index update time: {}ms", executeTime);
}
}
use of org.codelibs.fess.es.client.SearchEngineClient in project fess by codelibs.
the class KeyMatchHelper method getDocumentList.
/**
 * Runs the query of the given key match against the document search index.
 *
 * <p>The request uses the local search preference and {@code SearchRequestType.ADMIN_SEARCH},
 * is limited to the key match's max size, and asks only for the doc ID field in the
 * response.</p>
 *
 * @param keyMatch the key match providing the query string and the result size limit
 * @return the documents returned by the search engine client
 */
protected List<Map<String, Object>> getDocumentList(final KeyMatch keyMatch) {
    final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    return searchEngineClient.getDocumentList(fessConfig.getIndexDocumentSearchIndex(), requestBuilder -> {
        return SearchConditionBuilder.builder(requestBuilder.setPreference(Constants.SEARCH_PREFERENCE_LOCAL))
                .searchRequestType(SearchRequestType.ADMIN_SEARCH)
                .size(keyMatch.getMaxSize())
                .query(keyMatch.getQuery())
                .responseFields(new String[] { fessConfig.getIndexFieldDocId() })
                .build();
    });
}
use of org.codelibs.fess.es.client.SearchEngineClient in project fess by codelibs.
the class IndexingHelper method deleteByQueryBuilder.
/**
 * Deletes every document matching the query from the given index.
 *
 * <p>The index is refreshed before the delete-by-query request is issued, and the number of
 * deleted documents is logged at INFO level.</p>
 *
 * @param index name of the index to delete from
 * @param queryBuilder query selecting the documents to delete
 * @return the count returned by the search engine client's {@code deleteByQuery}
 */
protected long deleteByQueryBuilder(final String index, final QueryBuilder queryBuilder) {
    final SearchEngineClient client = ComponentUtil.getSearchEngineClient();
    // Refresh before deleting - presumably so recently indexed documents are visible to the query.
    client.admin()
            .indices()
            .prepareRefresh(index)
            .execute()
            .actionGet();
    final long deletedCount = client.deleteByQuery(index, queryBuilder);
    logger.info("Deleted {} old docs.", deletedCount);
    return deletedCount;
}
use of org.codelibs.fess.es.client.SearchEngineClient in project fess by codelibs.
the class FessCrawlerThread method isContentUpdated.
/**
 * Decides whether the queued URL has to be crawled, based on the copy already in the index.
 *
 * <p>When incremental crawling is disabled the method always answers {@code true}. Otherwise
 * the indexed document for the URL (its ID is generated from the URL and the role list) is
 * looked up and compared with the response to a HEAD request:</p>
 * <ul>
 * <li>directory URLs of the smb/smb1/file/ftp schemes, a missing HEAD response, a missing
 * indexed document, an expired document (a delete is attempted) and a missing last-modified
 * value on either side all yield {@code true};</li>
 * <li>a 404 status queues the anchors stored with the document, attempts to delete the
 * document and yields {@code false};</li>
 * <li>a 200 status whose last-modified time is not newer than the indexed one is processed
 * as a not-modified response, the stored anchors are queued, the expiry field is updated when
 * a new expiry is available, and {@code false} is returned;</li>
 * <li>anything else yields {@code true}.</li>
 * </ul>
 * <p>Any response data obtained is closed quietly before returning.</p>
 *
 * @param client crawler client used to execute the HEAD request
 * @param urlQueue queue entry holding the URL to check; its last-modified value is set from
 *        the indexed document when one is found
 * @return {@code false} if the indexed content is handled as deleted (404) or not modified,
 *         {@code true} if the URL should be crawled
 */
@Override
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
    // Without incremental crawling every URL is treated as updated.
    if (!ComponentUtil.getFessConfig().isIncrementalCrawling()) {
        return true;
    }

    final long startTime = System.currentTimeMillis();
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
    final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
    final String url = urlQueue.getUrl();
    ResponseData responseData = null;
    try {
        final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
        final Map<String, Object> idSource = new HashMap<>();
        idSource.put(fessConfig.getIndexFieldUrl(), url);

        // Roles start with the configured permissions and may be extended from the response.
        final List<String> roles = new ArrayList<>();
        stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roles.add(p)));

        final boolean fileLikeScheme =
                url.startsWith("smb:") || url.startsWith("smb1:") || url.startsWith("file:") || url.startsWith("ftp:");
        if (fileLikeScheme) {
            if (url.endsWith("/")) {
                // directory
                return true;
            }
            final PermissionHelper permissionHelper = ComponentUtil.getPermissionHelper();
            if (fessConfig.isSmbRoleFromFile() || fessConfig.isFileRoleFromFile() || fessConfig.isFtpRoleFromFile()) {
                // head method
                responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                if (responseData == null) {
                    return true;
                }
                roles.addAll(permissionHelper.getSmbRoleTypeList(responseData));
                roles.addAll(permissionHelper.getFileRoleTypeList(responseData));
                roles.addAll(permissionHelper.getFtpRoleTypeList(responseData));
            }
        }
        idSource.put(fessConfig.getIndexFieldRole(), roles);

        final String id = crawlingInfoHelper.generateId(idSource);
        if (logger.isDebugEnabled()) {
            logger.debug("Searching indexed document: {}", id);
        }
        final Map<String, Object> indexedDoc = indexingHelper.getDocument(searchEngineClient, id,
                new String[] { fessConfig.getIndexFieldId(), fessConfig.getIndexFieldLastModified(),
                        fessConfig.getIndexFieldAnchor(), fessConfig.getIndexFieldSegment(),
                        fessConfig.getIndexFieldExpires(), fessConfig.getIndexFieldClickCount(),
                        fessConfig.getIndexFieldFavoriteCount() });
        if (indexedDoc == null) {
            // Nothing indexed under this ID yet: queue the child URLs found for it and crawl.
            storeChildUrlsToQueue(urlQueue, getChildUrlSet(searchEngineClient, id));
            return true;
        }

        final Date expiresAt = DocumentUtil.getValue(indexedDoc, fessConfig.getIndexFieldExpires(), Date.class);
        if (expiresAt != null && expiresAt.getTime() < System.currentTimeMillis()) {
            // The indexed copy has expired: try to drop it and crawl again.
            final Object idValue = indexedDoc.get(fessConfig.getIndexFieldId());
            if (idValue != null && !indexingHelper.deleteDocument(searchEngineClient, idValue.toString())) {
                logger.debug("Failed to delete expired document: {}", url);
            }
            return true;
        }

        final Date indexedLastModified = DocumentUtil.getValue(indexedDoc, fessConfig.getIndexFieldLastModified(), Date.class);
        if (indexedLastModified == null) {
            return true;
        }
        urlQueue.setLastModified(indexedLastModified.getTime());
        log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);

        if (responseData == null) {
            // head method
            responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
            if (responseData == null) {
                return true;
            }
        }

        final int statusCode = responseData.getHttpStatusCode();
        if (logger.isDebugEnabled()) {
            logger.debug("Accessing document: {}, status: {}", url, statusCode);
        }
        if (statusCode == 404) {
            storeChildUrlsToQueue(urlQueue, getAnchorSet(indexedDoc.get(fessConfig.getIndexFieldAnchor())));
            if (!indexingHelper.deleteDocument(searchEngineClient, id)) {
                logger.debug("Failed to delete 404 document: {}", url);
            }
            return false;
        }
        if (responseData.getLastModified() == null) {
            return true;
        }
        if (responseData.getLastModified().getTime() <= indexedLastModified.getTime() && statusCode == 200) {
            // Not newer than the indexed copy: report it as "not modified" instead of re-crawling.
            log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
            responseData.setExecutionTime(System.currentTimeMillis() - startTime);
            responseData.setParentUrl(urlQueue.getParentUrl());
            responseData.setSessionId(crawlerContext.getSessionId());
            responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
            processResponse(urlQueue, responseData);
            storeChildUrlsToQueue(urlQueue, getAnchorSet(indexedDoc.get(fessConfig.getIndexFieldAnchor())));
            final Date renewedExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
            if (renewedExpires != null
                    && !indexingHelper.updateDocument(searchEngineClient, id, fessConfig.getIndexFieldExpires(), renewedExpires)) {
                logger.debug("Failed to update {} at {}", fessConfig.getIndexFieldExpires(), url);
            }
            return false;
        }
    } finally {
        if (responseData != null) {
            CloseableUtil.closeQuietly(responseData);
        }
    }
    return true;
}
Aggregations