Example use of org.codelibs.fess.crawler.service.DataService in the fess-crawler project by codelibs.
Class CrawlerTest, method setUp.
/**
 * Assembles a fresh {@link StandardCrawlerContainer} for each test and
 * resolves the components the tests work with ({@code crawler},
 * {@code dataService}, {@code urlQueueService}, {@code fileTransformer}).
 *
 * <p>Components are registered by name; the configuration lambdas look up
 * their collaborators from the same container via {@code getComponent}.</p>
 *
 * @throws Exception if the superclass set-up or the container wiring fails
 */
@Override
protected void setUp() throws Exception {
    super.setUp();

    // Settings later handed to the file transformer.
    final Map<String, String> parserFeatures = newHashMap();
    parserFeatures.put("http://xml.org/sax/features/namespaces", "false");
    final Map<String, String> parserProperties = newHashMap();

    // XPath expression -> attribute that carries the child URL.
    final Map<String, String> linkAttributeByXpath = newHashMap();
    linkAttributeByXpath.put("//A", "href");
    linkAttributeByXpath.put("//AREA", "href");
    linkAttributeByXpath.put("//FRAME", "src");
    linkAttributeByXpath.put("//IFRAME", "src");
    linkAttributeByXpath.put("//IMG", "src");
    linkAttributeByXpath.put("//LINK", "href");
    linkAttributeByXpath.put("//SCRIPT", "src");

    container = new StandardCrawlerContainer();
    container
            // --- clients -------------------------------------------------
            .<HcHttpClient>prototype("internalHttpClient", HcHttpClient.class, rawHttp -> {
                rawHttp.setCookieSpec(CookieSpecs.BEST_MATCH);
                rawHttp.setClientConnectionManager(container.getComponent("clientConnectionManager"));
            })
            .prototype("httpClient", FaultTolerantClient.class, retryingHttp -> {
                // Wraps the internal HTTP client with retry settings.
                retryingHttp.setCrawlerClient(container.getComponent("internalHttpClient"));
                retryingHttp.setMaxRetryCount(5);
                retryingHttp.setRetryInterval(500);
            })
            .prototype("fsClient", FileSystemClient.class)
            // --- rules ---------------------------------------------------
            .prototype("ruleManager", RuleManagerImpl.class, rules -> {
                rules.addRule(container.getComponent("sitemapsRule"));
                rules.addRule(container.getComponent("fileRule"));
            })
            // --- entities, crawler and services --------------------------
            .prototype("accessResult", AccessResultImpl.class)
            .prototype("urlQueue", UrlQueueImpl.class)
            .prototype("crawlerThread", CrawlerThread.class)
            .prototype("crawler", Crawler.class)
            .prototype("urlFilterService", UrlFilterServiceImpl.class)
            .prototype("urlQueueService", UrlQueueServiceImpl.class)
            .prototype("dataService", DataServiceImpl.class)
            .prototype("urlFilter", UrlFilterImpl.class)
            // --- helpers -------------------------------------------------
            .singleton("urlConvertHelper", UrlConvertHelper.class)
            .singleton("intervalController", DefaultIntervalController.class)
            .singleton("sitemapsHelper", SitemapsHelper.class)
            .singleton("logHelper", LogHelperImpl.class)
            .singleton("encodingHelper", EncodingHelper.class)
            .singleton("contentLengthHelper", ContentLengthHelper.class)
            .singleton("mimeTypeHelper", MimeTypeHelperImpl.class)
            .<FileTransformer>singleton("fileTransformer", FileTransformer.class, fileXf -> {
                fileXf.setName("fileTransformer");
                fileXf.setFeatureMap(parserFeatures);
                fileXf.setPropertyMap(parserProperties);
                fileXf.setChildUrlRuleMap(linkAttributeByXpath);
            })
            .singleton("dataHelper", MemoryDataHelper.class)
            .singleton("robotsTxtHelper", RobotsTxtHelper.class)
            .<CrawlerClientFactory>singleton("clientFactory", CrawlerClientFactory.class, clients -> {
                // URL pattern -> client used for matching URLs.
                clients.addClient("http:.*", container.getComponent("httpClient"));
                clients.addClient("file:.*", container.getComponent("fsClient"));
            })
            // --- extraction ----------------------------------------------
            .singleton("tikaExtractor", TikaExtractor.class)
            .<ExtractorFactory>singleton("extractorFactory", ExtractorFactory.class, extractors -> {
                final TikaExtractor tika = container.getComponent("tikaExtractor");
                extractors.addExtractor("text/plain", tika);
                extractors.addExtractor("text/html", tika);
            })
            // NOTE(review): "httpClient" was already registered above as a
            // FaultTolerantClient prototype; which registration the container
            // resolves is not visible from here - confirm this is intended.
            .singleton("httpClient", HcHttpClient.class)
            // --- response processing -------------------------------------
            .singleton("sitemapsResponseProcessor", SitemapsResponseProcessor.class)
            .<SitemapsRule>singleton("sitemapsRule", SitemapsRule.class, sitemaps -> {
                sitemaps.setResponseProcessor(container.getComponent("sitemapsResponseProcessor"));
                sitemaps.setRuleId("sitemapsRule");
                sitemaps.addRule("url", ".*sitemap.*");
            })
            .<DefaultResponseProcessor>singleton("defaultResponseProcessor", DefaultResponseProcessor.class, fallback -> {
                fallback.setTransformer(container.getComponent("fileTransformer"));
                fallback.setSuccessfulHttpCodes(new int[] { 200 });
                fallback.setNotModifiedHttpCodes(new int[] { 304 });
            })
            .<RegexRule>singleton("fileRule", RegexRule.class, files -> {
                files.setRuleId("fileRule");
                files.setDefaultRule(true);
                files.setResponseProcessor(container.getComponent("defaultResponseProcessor"));
            })
            // --- connection pool -----------------------------------------
            .<PoolingHttpClientConnectionManager>singleton("clientConnectionManager",
                    new PoolingHttpClientConnectionManager(5, TimeUnit.MINUTES), pool -> {
                        pool.setMaxTotal(200);
                        pool.setDefaultMaxPerRoute(20);
                    });

    // Resolve the components the test methods use directly.
    crawler = container.getComponent("crawler");
    dataService = container.getComponent("dataService");
    urlQueueService = container.getComponent("urlQueueService");
    fileTransformer = container.getComponent("fileTransformer");
}
Example use of org.codelibs.fess.crawler.service.DataService in the fess project by codelibs.
Class IndexUpdater, method run.
/**
 * Main loop of the index updater.
 *
 * <p>Repeatedly queries access results (restricted to the ids in
 * {@code sessionIdList} with OK status), passes them to
 * {@code processAccessResults} / {@code cleanupAccessResults}, and sends the
 * collected {@code docList} to the search engine through
 * {@code indexingHelper}. The loop runs until {@code finishCrawling} is set
 * and the last iteration left {@code accessResultList} empty, or until the
 * component container is no longer available.</p>
 *
 * <p>Side effects visible here: resets and accumulates {@code executeTime},
 * resets {@code documentSize}, may set {@code finishCrawling}, may call
 * {@code forceStop()}, and may register errors via
 * {@code org.codelibs.fess.exec.Crawler.addError}.</p>
 *
 * @throws FessSystemException if {@code dataService} has not been set
 */
@Override
public void run() {
if (dataService == null) {
throw new FessSystemException("DataService is null.");
}
if (logger.isDebugEnabled()) {
logger.debug("Starting indexUpdater.");
}
// Reset the per-run statistics.
executeTime = 0;
documentSize = 0;
final FessConfig fessConfig = ComponentUtil.getFessConfig();
final long updateInterval = fessConfig.getIndexerWebfsUpdateIntervalAsInteger().longValue();
final int maxEmptyListCount = fessConfig.getIndexerWebfsMaxEmptyListCountAsInteger();
final IntervalControlHelper intervalControlHelper = ComponentUtil.getIntervalControlHelper();
try {
// Search callback handed to getAccessResultList: filters on session id and
// OK status, always starts at offset 0, uses the configured document cache
// size as page size (at least 1) and sorts by creation time ascending.
final Consumer<SearchRequestBuilder> cb = builder -> {
final QueryBuilder queryBuilder = QueryBuilders.boolQuery().filter(QueryBuilders.termsQuery(EsAccessResult.SESSION_ID, sessionIdList)).filter(QueryBuilders.termQuery(EsAccessResult.STATUS, org.codelibs.fess.crawler.Constants.OK_STATUS));
builder.setQuery(queryBuilder);
builder.setFrom(0);
final int maxDocumentCacheSize = fessConfig.getIndexerWebfsMaxDocumentCacheSizeAsInteger();
builder.setSize(maxDocumentCacheSize <= 0 ? 1 : maxDocumentCacheSize);
builder.addSort(EsAccessResult.CREATE_TIME, SortOrder.ASC);
};
final DocList docList = new DocList();
final List<EsAccessResult> accessResultList = new ArrayList<>();
long updateTime = System.currentTimeMillis();
// Consecutive failed iterations; reset after every successful one.
int errorCount = 0;
// Consecutive iterations whose first query returned no results.
int emptyListCount = 0;
// Value returned by the last cleanupAccessResults call, or -1 when there is none.
long cleanupTime = -1;
// Keep going while crawling is still running, or while the previous
// iteration still left entries in accessResultList.
while (!finishCrawling || !accessResultList.isEmpty()) {
try {
// Snapshot used below to detect whether another session finished
// while this iteration was running.
final int sessionIdListSize = finishedSessionIdList.size();
intervalControlHelper.setCrawlerRunning(true);
// updateTime temporarily holds the elapsed time since the last stamp.
updateTime = System.currentTimeMillis() - updateTime;
final long interval = updateInterval - updateTime;
if (interval > 0) {
// Sleep for the remainder of the update interval.
// (original note: 10 sec by default)
ThreadUtil.sleep(interval);
}
systemHelper.calibrateCpuLoad();
docList.clear();
accessResultList.clear();
intervalControlHelper.delayByRules();
if (logger.isDebugEnabled()) {
logger.debug("Processing documents in IndexUpdater queue.");
}
// Start stamp of the processing phase; used for executeTime below and
// for the interval calculation of the next iteration.
updateTime = System.currentTimeMillis();
List<EsAccessResult> arList = getAccessResultList(cb, cleanupTime);
if (arList.isEmpty()) {
emptyListCount++;
} else {
// Results were found, so restart the empty-iteration counter.
emptyListCount = 0;
}
// NOTE(review): assumes getAccessResultList always returns an EsResultList;
// a different List type would raise ClassCastException here - confirm.
long hitCount = ((EsResultList<EsAccessResult>) arList).getTotalHits();
// Drain the queue: keep fetching as long as the search reports hits.
while (hitCount > 0) {
if (arList.isEmpty()) {
// Hits are reported but the page is empty: wait for the configured
// commit margin time and query again without a cleanup time.
ThreadUtil.sleep(fessConfig.getIndexerWebfsCommitMarginTimeAsInteger().longValue());
cleanupTime = -1;
} else {
processAccessResults(docList, accessResultList, arList);
cleanupTime = cleanupAccessResults(accessResultList);
}
arList = getAccessResultList(cb, cleanupTime);
hitCount = ((EsResultList<EsAccessResult>) arList).getTotalHits();
}
// Send whatever is still buffered in docList.
if (!docList.isEmpty()) {
indexingHelper.sendDocuments(searchEngineClient, docList);
}
synchronized (finishedSessionIdList) {
// Clean up only when finished sessions existed at the start of this
// iteration and the list size has not changed since then.
if (sessionIdListSize != 0 && sessionIdListSize == finishedSessionIdList.size()) {
cleanupFinishedSessionData();
}
}
executeTime += System.currentTimeMillis() - updateTime;
if (logger.isDebugEnabled()) {
logger.debug("Processed documents in IndexUpdater queue.");
}
// The iteration succeeded, so clear the consecutive-error counter.
errorCount = 0;
} catch (final Exception e) {
// Tolerate failures until errorCount exceeds maxErrorCount; after that
// the exception is rethrown to the outer catch blocks, ending the loop.
if (errorCount > maxErrorCount) {
throw e;
}
errorCount++;
logger.warn("Failed to access data. Retry to access it {} times.", errorCount, e);
} finally {
// Runs after every iteration, including failed ones.
if (systemHelper.isForceStop()) {
finishCrawling = true;
if (logger.isDebugEnabled()) {
logger.debug("Stopped indexUpdater.");
}
}
}
// Too many consecutive iterations without results: give up.
if (emptyListCount >= maxEmptyListCount) {
if (logger.isInfoEnabled()) {
logger.info("Terminating indexUpdater. emptyListCount is over {}.", maxEmptyListCount);
}
// Terminate crawling and record the timeout as an error.
finishCrawling = true;
forceStop();
if (fessConfig.getIndexerThreadDumpEnabledAsBoolean()) {
ThreadDumpUtil.printThreadDump();
}
org.codelibs.fess.exec.Crawler.addError("QueueTimeout");
}
// Leave the loop immediately once the component container is unavailable.
if (!ComponentUtil.available()) {
logger.info("IndexUpdater is terminated.");
forceStop();
break;
}
}
if (logger.isDebugEnabled()) {
logger.debug("Finished indexUpdater.");
}
} catch (final ContainerNotAvailableException e) {
// The stack trace is logged only when debug logging is enabled.
if (logger.isDebugEnabled()) {
logger.error("IndexUpdater is terminated.", e);
} else if (logger.isInfoEnabled()) {
logger.info("IndexUpdater is terminated.");
}
forceStop();
} catch (final Throwable t) {
// While the container is available the failure is only logged; in the
// other two branches it is also registered through Crawler.addError.
if (ComponentUtil.available()) {
logger.error("IndexUpdater is terminated.", t);
} else if (logger.isDebugEnabled()) {
logger.error("IndexUpdater is terminated.", t);
org.codelibs.fess.exec.Crawler.addError(t.getClass().getSimpleName());
} else if (logger.isInfoEnabled()) {
logger.info("IndexUpdater is terminated.");
org.codelibs.fess.exec.Crawler.addError(t.getClass().getSimpleName());
}
forceStop();
} finally {
// NOTE(review): the flag is set to true on exit, the same value used at the
// start of each iteration - confirm this is intended rather than false.
intervalControlHelper.setCrawlerRunning(true);
}
if (logger.isInfoEnabled()) {
logger.info("[EXEC TIME] index update time: {}ms", executeTime);
}
}
Aggregations