Use of org.codelibs.fess.crawler.util.EsResultList in project fess by codelibs.
The class IndexUpdater, method getAccessResultList.
/**
 * Fetches the next batch of access results from the data service and reports queue statistics.
 *
 * <p>Results whose create time is newer than {@code now - commitMarginTime} are dropped from the
 * returned list. The total hit count reported by the search is not affected by that filtering,
 * so the returned list may be empty even though the total hit count is positive.</p>
 *
 * <p>As a side effect, when the total hit count exceeds the configured unprocessed document size
 * and the crawler is currently flagged as running, the crawler-running flag is set to
 * {@code false} on the {@code IntervalControlHelper}.</p>
 *
 * @param cb callback that configures the search request
 * @param cleanupTime duration in milliseconds of the previous cleanup, appended to the info log
 *        when it is {@code >= 0}
 * @return the filtered list of access results; the instance is cast to {@code EsResultList}
 *         inside this method to read the total hit count
 */
private List<EsAccessResult> getAccessResultList(final Consumer<SearchRequestBuilder> cb, final long cleanupTime) {
    if (logger.isDebugEnabled()) {
        logger.debug("Getting documents in IndexUpdater queue.");
    }
    final long execTime = System.currentTimeMillis();
    final List<EsAccessResult> arList = ((EsDataService) dataService).getAccessResultList(cb);
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    if (!arList.isEmpty()) {
        final long commitMarginTime = fessConfig.getIndexerWebfsCommitMarginTimeAsInteger().longValue();
        // Anything created after this instant is still inside the commit margin and is skipped for now.
        final long newestAllowedCreateTime = execTime - commitMarginTime;
        // Single-pass removal; avoids copying the list to an array and calling remove(Object) per element.
        arList.removeIf(ar -> ar.getCreateTime().longValue() > newestAllowedCreateTime);
    }
    // Total hits come from the search response, not from the (possibly filtered) list size.
    final long totalHits = ((EsResultList<EsAccessResult>) arList).getTotalHits();
    if (logger.isInfoEnabled()) {
        final StringBuilder buf = new StringBuilder(100);
        buf.append("Processing ");
        if (totalHits > 0) {
            buf.append(arList.size()).append('/').append(totalHits).append(" docs (Doc:{access ");
        } else {
            buf.append("no docs in indexing queue (Doc:{access ");
        }
        buf.append(System.currentTimeMillis() - execTime).append("ms");
        if (cleanupTime >= 0) {
            buf.append(", cleanup ").append(cleanupTime).append("ms");
        }
        buf.append("}, ");
        buf.append(MemoryUtil.getMemoryUsageLog());
        buf.append(')');
        logger.info(buf.toString());
    }
    final long unprocessedDocumentSize = fessConfig.getIndexerUnprocessedDocumentSizeAsInteger().longValue();
    final IntervalControlHelper intervalControlHelper = ComponentUtil.getIntervalControlHelper();
    // Too many documents are waiting to be indexed: flag the crawler as not running.
    if (totalHits > unprocessedDocumentSize && intervalControlHelper.isCrawlerRunning()) {
        if (logger.isInfoEnabled()) {
            logger.info("Stopped all crawler threads. You have {} (>{}) unprocessed docs.", totalHits, unprocessedDocumentSize);
        }
        intervalControlHelper.setCrawlerRunning(false);
    }
    return arList;
}
Use of org.codelibs.fess.crawler.util.EsResultList in project fess-crawler by codelibs.
The class AbstractCrawlerService, method getList.
/**
 * Executes a search against this service's index/type and maps every hit to a new bean.
 *
 * <p>The returned list is an {@code EsResultList} carrying the total hit count and the time the
 * search took. When a hit's source contains an access-result-data map, it is wrapped in an
 * {@code EsAccessResultData} and attached to the bean (the bean is cast to
 * {@code EsAccessResult} for that purpose).</p>
 *
 * @param <T> bean type created for each hit
 * @param clazz class of the bean to instantiate for each hit
 * @param callback hook that customizes the search request before it is executed
 * @return list of converted beans; empty when the search reports zero total hits
 * @throws EsAccessException if converting any hit fails
 */
protected <T> List<T> getList(final Class<T> clazz, final Consumer<SearchRequestBuilder> callback) {
    final SearchResponse response = getClient().get(client -> {
        final SearchRequestBuilder requestBuilder = client.prepareSearch(index).setTypes(type);
        callback.accept(requestBuilder);
        return requestBuilder.execute();
    });

    final EsResultList<T> resultList = new EsResultList<>();
    final SearchHits searchHits = response.getHits();
    resultList.setTotalHits(searchHits.getTotalHits());
    resultList.setTookInMillis(response.getTook().getMillis());

    // Nothing matched: return the empty list that still carries the search metadata.
    if (searchHits.getTotalHits() == 0) {
        return resultList;
    }

    try {
        for (final SearchHit hit : searchHits.getHits()) {
            final Map<String, Object> sourceMap = hit.getSourceAsMap();
            // The nested access-result-data entry is excluded from the copy and handled separately below.
            final T bean = BeanUtil.copyMapToNewBean(sourceMap, clazz, copyOption -> {
                copyOption.converter(new EsTimestampConverter(), timestampFields).excludeWhitespace();
                copyOption.exclude(EsAccessResult.ACCESS_RESULT_DATA);
            });
            final Object rawData = sourceMap.get(EsAccessResult.ACCESS_RESULT_DATA);
            if (rawData != null) {
                @SuppressWarnings("unchecked")
                final Map<String, Object> dataMap = (Map<String, Object>) rawData;
                ((EsAccessResult) bean).setAccessResultData(new EsAccessResultData(dataMap));
            }
            setId(bean, hit.getId());
            resultList.add(bean);
        }
    } catch (final Exception e) {
        throw new EsAccessException("response: " + response, e);
    }
    return resultList;
}
Use of org.codelibs.fess.crawler.util.EsResultList in project fess-crawler by codelibs.
The class EsDataService, method getAccessResultList.
/**
 * Searches access results and builds lightweight {@code EsAccessResult} objects from a fixed
 * set of source fields.
 *
 * <p>The fetch-source filter is applied after the caller's callback has configured the request.
 * The returned list is an {@code EsResultList} carrying the total hit count and the time the
 * search took.</p>
 *
 * @param callback hook that customizes the search request (query, paging, sorting, ...)
 * @return list of access results; empty when the search reports zero total hits
 * @throws EsAccessException if reading any hit fails
 */
public List<EsAccessResult> getAccessResultList(final Consumer<SearchRequestBuilder> callback) {
    final SearchResponse response = getClient().get(client -> {
        final SearchRequestBuilder requestBuilder = client.prepareSearch(index).setTypes(type);
        callback.accept(requestBuilder);
        // Only these source fields are requested from the search.
        final String[] includedFields = { "parentUrl", "method", "mimeType", "sessionId", "url", "executionTime", "createTime",
                "contentLength", "lastModified", "ruleId", "httpStatusCode", "status" };
        requestBuilder.setFetchSource(includedFields, null);
        return requestBuilder.execute();
    });

    final EsResultList<EsAccessResult> resultList = new EsResultList<>();
    final SearchHits searchHits = response.getHits();
    resultList.setTotalHits(searchHits.getTotalHits());
    resultList.setTookInMillis(response.getTook().getMillis());

    // Nothing matched: return the empty list that still carries the search metadata.
    if (searchHits.getTotalHits() == 0) {
        return resultList;
    }

    try {
        for (final SearchHit hit : searchHits.getHits()) {
            final Map<String, Object> sourceMap = hit.getSourceAsMap();
            final EsAccessResult accessResult = new EsAccessResult();
            accessResult.setParentUrl(getFieldValue(sourceMap.get("parentUrl"), String.class));
            accessResult.setMethod(getFieldValue(sourceMap.get("method"), String.class));
            accessResult.setMimeType(getFieldValue(sourceMap.get("mimeType"), String.class));
            accessResult.setSessionId(getFieldValue(sourceMap.get("sessionId"), String.class));
            accessResult.setUrl(getFieldValue(sourceMap.get("url"), String.class));
            accessResult.setExecutionTime(getFieldValue(sourceMap.get("executionTime"), Integer.class));
            accessResult.setContentLength(getFieldValue(sourceMap.get("contentLength"), Long.class));
            accessResult.setRuleId(getFieldValue(sourceMap.get("ruleId"), String.class));
            accessResult.setHttpStatusCode(getFieldValue(sourceMap.get("httpStatusCode"), Integer.class));
            accessResult.setStatus(getFieldValue(sourceMap.get("status"), Integer.class));
            accessResult.setCreateTime(getFieldValue(sourceMap.get("createTime"), Long.class));
            accessResult.setLastModified(getFieldValue(sourceMap.get("lastModified"), Long.class));
            setId(accessResult, hit.getId());
            resultList.add(accessResult);
        }
    } catch (final Exception e) {
        throw new EsAccessException("response: " + response, e);
    }
    return resultList;
}
Use of org.codelibs.fess.crawler.util.EsResultList in project fess by codelibs.
The class IndexUpdater, method run.
/**
 * Main loop of the index updater.
 *
 * <p>Repeatedly queries the data service for access results that belong to the sessions in
 * {@code sessionIdList} and have the OK status, hands them to {@code processAccessResults},
 * cleans them up, and sends the collected documents through {@code indexingHelper}. The loop
 * runs until crawling is flagged as finished and no access results remain, or until it is
 * terminated by repeated empty polls, a force stop, or the component container becoming
 * unavailable.</p>
 *
 * <p>The accumulated processing time is stored in {@code executeTime} and logged at the end.</p>
 *
 * @throws FessSystemException if {@code dataService} is {@code null}
 */
@Override
public void run() {
if (dataService == null) {
throw new FessSystemException("DataService is null.");
}
if (logger.isDebugEnabled()) {
logger.debug("Starting indexUpdater.");
}
// Reset the statistics fields for this run.
executeTime = 0;
documentSize = 0;
final FessConfig fessConfig = ComponentUtil.getFessConfig();
final long updateInterval = fessConfig.getIndexerWebfsUpdateIntervalAsInteger().longValue();
final int maxEmptyListCount = fessConfig.getIndexerWebfsMaxEmptyListCountAsInteger();
final IntervalControlHelper intervalControlHelper = ComponentUtil.getIntervalControlHelper();
try {
// Search request setup shared by every poll: results of the tracked sessions with OK status,
// starting at offset 0, oldest first, limited to the configured cache size (at least 1).
final Consumer<SearchRequestBuilder> cb = builder -> {
final QueryBuilder queryBuilder = QueryBuilders.boolQuery().filter(QueryBuilders.termsQuery(EsAccessResult.SESSION_ID, sessionIdList)).filter(QueryBuilders.termQuery(EsAccessResult.STATUS, org.codelibs.fess.crawler.Constants.OK_STATUS));
builder.setQuery(queryBuilder);
builder.setFrom(0);
final int maxDocumentCacheSize = fessConfig.getIndexerWebfsMaxDocumentCacheSizeAsInteger();
builder.setSize(maxDocumentCacheSize <= 0 ? 1 : maxDocumentCacheSize);
builder.addSort(EsAccessResult.CREATE_TIME, SortOrder.ASC);
};
final DocList docList = new DocList();
final List<EsAccessResult> accessResultList = new ArrayList<>();
long updateTime = System.currentTimeMillis();
int errorCount = 0;
int emptyListCount = 0;
// -1 means "no cleanup duration to report" (getAccessResultList only logs it when >= 0).
long cleanupTime = -1;
// Keep going while crawling is still in progress or results from the last pass remain.
while (!finishCrawling || !accessResultList.isEmpty()) {
try {
// Remember the finished-session count at the start of the pass; compared again before cleanup below.
final int sessionIdListSize = finishedSessionIdList.size();
intervalControlHelper.setCrawlerRunning(true);
// updateTime held a timestamp; turn it into the time elapsed since then.
updateTime = System.currentTimeMillis() - updateTime;
final long interval = updateInterval - updateTime;
if (interval > 0) {
// Sleep for the remainder of the update interval
// (original note: 10 sec by default).
ThreadUtil.sleep(interval);
}
systemHelper.calibrateCpuLoad();
docList.clear();
accessResultList.clear();
intervalControlHelper.delayByRules();
if (logger.isDebugEnabled()) {
logger.debug("Processing documents in IndexUpdater queue.");
}
// From here on updateTime is the start timestamp of this processing pass.
updateTime = System.currentTimeMillis();
List<EsAccessResult> arList = getAccessResultList(cb, cleanupTime);
if (arList.isEmpty()) {
emptyListCount++;
} else {
// Got results: reset the consecutive-empty counter.
emptyListCount = 0;
}
long hitCount = ((EsResultList<EsAccessResult>) arList).getTotalHits();
// Drain the queue: keep polling while the search still reports matching results.
while (hitCount > 0) {
if (arList.isEmpty()) {
// Hits exist but the returned list is empty (getAccessResultList drops results that are
// still inside the commit margin); wait one commit margin before polling again.
ThreadUtil.sleep(fessConfig.getIndexerWebfsCommitMarginTimeAsInteger().longValue());
cleanupTime = -1;
} else {
processAccessResults(docList, accessResultList, arList);
cleanupTime = cleanupAccessResults(accessResultList);
}
arList = getAccessResultList(cb, cleanupTime);
hitCount = ((EsResultList<EsAccessResult>) arList).getTotalHits();
}
// Send whatever is left in the document list.
if (!docList.isEmpty()) {
indexingHelper.sendDocuments(searchEngineClient, docList);
}
synchronized (finishedSessionIdList) {
// Clean up only when there are finished sessions and their number did not change during this pass.
if (sessionIdListSize != 0 && sessionIdListSize == finishedSessionIdList.size()) {
cleanupFinishedSessionData();
}
}
executeTime += System.currentTimeMillis() - updateTime;
if (logger.isDebugEnabled()) {
logger.debug("Processed documents in IndexUpdater queue.");
}
// The pass succeeded: reset the consecutive-error counter.
errorCount = 0;
} catch (final Exception e) {
// Tolerate failures up to maxErrorCount; after that, rethrow to the outer handlers.
if (errorCount > maxErrorCount) {
throw e;
}
errorCount++;
logger.warn("Failed to access data. Retry to access it {} times.", errorCount, e);
} finally {
// A force stop requested via systemHelper marks crawling as finished.
if (systemHelper.isForceStop()) {
finishCrawling = true;
if (logger.isDebugEnabled()) {
logger.debug("Stopped indexUpdater.");
}
}
}
// Too many consecutive empty polls: give up and record a "QueueTimeout" error.
if (emptyListCount >= maxEmptyListCount) {
if (logger.isInfoEnabled()) {
logger.info("Terminating indexUpdater. emptyListCount is over {}.", maxEmptyListCount);
}
// terminate crawling
finishCrawling = true;
forceStop();
if (fessConfig.getIndexerThreadDumpEnabledAsBoolean()) {
ThreadDumpUtil.printThreadDump();
}
org.codelibs.fess.exec.Crawler.addError("QueueTimeout");
}
// Components are no longer available: stop and leave the loop.
if (!ComponentUtil.available()) {
logger.info("IndexUpdater is terminated.");
forceStop();
break;
}
}
if (logger.isDebugEnabled()) {
logger.debug("Finished indexUpdater.");
}
} catch (final ContainerNotAvailableException e) {
// Log with the stack trace only when debug logging is enabled.
if (logger.isDebugEnabled()) {
logger.error("IndexUpdater is terminated.", e);
} else if (logger.isInfoEnabled()) {
logger.info("IndexUpdater is terminated.");
}
forceStop();
} catch (final Throwable t) {
// When components are still available the failure is only logged; otherwise the
// throwable's simple class name is also recorded as a crawler error.
if (ComponentUtil.available()) {
logger.error("IndexUpdater is terminated.", t);
} else if (logger.isDebugEnabled()) {
logger.error("IndexUpdater is terminated.", t);
org.codelibs.fess.exec.Crawler.addError(t.getClass().getSimpleName());
} else if (logger.isInfoEnabled()) {
logger.info("IndexUpdater is terminated.");
org.codelibs.fess.exec.Crawler.addError(t.getClass().getSimpleName());
}
forceStop();
} finally {
// Always leave the crawler-running flag set to true when this method exits.
intervalControlHelper.setCrawlerRunning(true);
}
if (logger.isInfoEnabled()) {
logger.info("[EXEC TIME] index update time: {}ms", executeTime);
}
}
Aggregations