Search in sources :

Example 6 with SearchEngineClient

use of org.codelibs.fess.es.client.SearchEngineClient in project fess by codelibs.

the class KeyMatchHelper method getBoostedDocumentList.

public List<Map<String, Object>> getBoostedDocumentList(final KeyMatch keyMatch) {
    final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
    String virtualHost = keyMatch.getVirtualHost();
    if (StringUtil.isBlank(virtualHost)) {
        virtualHost = StringUtil.EMPTY;
    }
    final List<Tuple3<String, QueryBuilder, ScoreFunctionBuilder<?>>> boostList = getQueryMap(virtualHost).get(toLowerCase(keyMatch.getTerm()));
    if (boostList == null) {
        return Collections.emptyList();
    }
    for (final Tuple3<String, QueryBuilder, ScoreFunctionBuilder<?>> pair : boostList) {
        if (!keyMatch.getId().equals(pair.getValue1())) {
            continue;
        }
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        return searchEngineClient.getDocumentList(fessConfig.getIndexDocumentSearchIndex(), searchRequestBuilder -> {
            searchRequestBuilder.setPreference(Constants.SEARCH_PREFERENCE_LOCAL).setQuery(pair.getValue2()).setSize(keyMatch.getMaxSize());
            return true;
        });
    }
    return Collections.emptyList();
}
Also used : ScoreFunctionBuilder(org.opensearch.index.query.functionscore.ScoreFunctionBuilder) Tuple3(org.codelibs.core.misc.Tuple3) SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient) QueryBuilder(org.opensearch.index.query.QueryBuilder) BoolQueryBuilder(org.opensearch.index.query.BoolQueryBuilder) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig)

Example 7 with SearchEngineClient

use of org.codelibs.fess.es.client.SearchEngineClient in project fess by codelibs.

the class IndexUpdater method run.

@Override
public void run() {
    if (dataService == null) {
        throw new FessSystemException("DataService is null.");
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Starting indexUpdater.");
    }
    executeTime = 0;
    documentSize = 0;
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final long updateInterval = fessConfig.getIndexerWebfsUpdateIntervalAsInteger().longValue();
    final int maxEmptyListCount = fessConfig.getIndexerWebfsMaxEmptyListCountAsInteger();
    final IntervalControlHelper intervalControlHelper = ComponentUtil.getIntervalControlHelper();
    try {
        final Consumer<SearchRequestBuilder> cb = builder -> {
            final QueryBuilder queryBuilder = QueryBuilders.boolQuery().filter(QueryBuilders.termsQuery(EsAccessResult.SESSION_ID, sessionIdList)).filter(QueryBuilders.termQuery(EsAccessResult.STATUS, org.codelibs.fess.crawler.Constants.OK_STATUS));
            builder.setQuery(queryBuilder);
            builder.setFrom(0);
            final int maxDocumentCacheSize = fessConfig.getIndexerWebfsMaxDocumentCacheSizeAsInteger();
            builder.setSize(maxDocumentCacheSize <= 0 ? 1 : maxDocumentCacheSize);
            builder.addSort(EsAccessResult.CREATE_TIME, SortOrder.ASC);
        };
        final DocList docList = new DocList();
        final List<EsAccessResult> accessResultList = new ArrayList<>();
        long updateTime = System.currentTimeMillis();
        int errorCount = 0;
        int emptyListCount = 0;
        long cleanupTime = -1;
        while (!finishCrawling || !accessResultList.isEmpty()) {
            try {
                final int sessionIdListSize = finishedSessionIdList.size();
                intervalControlHelper.setCrawlerRunning(true);
                updateTime = System.currentTimeMillis() - updateTime;
                final long interval = updateInterval - updateTime;
                if (interval > 0) {
                    // sleep
                    // 10 sec (default)
                    ThreadUtil.sleep(interval);
                }
                systemHelper.calibrateCpuLoad();
                docList.clear();
                accessResultList.clear();
                intervalControlHelper.delayByRules();
                if (logger.isDebugEnabled()) {
                    logger.debug("Processing documents in IndexUpdater queue.");
                }
                updateTime = System.currentTimeMillis();
                List<EsAccessResult> arList = getAccessResultList(cb, cleanupTime);
                if (arList.isEmpty()) {
                    emptyListCount++;
                } else {
                    // reset
                    emptyListCount = 0;
                }
                long hitCount = ((EsResultList<EsAccessResult>) arList).getTotalHits();
                while (hitCount > 0) {
                    if (arList.isEmpty()) {
                        ThreadUtil.sleep(fessConfig.getIndexerWebfsCommitMarginTimeAsInteger().longValue());
                        cleanupTime = -1;
                    } else {
                        processAccessResults(docList, accessResultList, arList);
                        cleanupTime = cleanupAccessResults(accessResultList);
                    }
                    arList = getAccessResultList(cb, cleanupTime);
                    hitCount = ((EsResultList<EsAccessResult>) arList).getTotalHits();
                }
                if (!docList.isEmpty()) {
                    indexingHelper.sendDocuments(searchEngineClient, docList);
                }
                synchronized (finishedSessionIdList) {
                    if (sessionIdListSize != 0 && sessionIdListSize == finishedSessionIdList.size()) {
                        cleanupFinishedSessionData();
                    }
                }
                executeTime += System.currentTimeMillis() - updateTime;
                if (logger.isDebugEnabled()) {
                    logger.debug("Processed documents in IndexUpdater queue.");
                }
                // reset count
                errorCount = 0;
            } catch (final Exception e) {
                if (errorCount > maxErrorCount) {
                    throw e;
                }
                errorCount++;
                logger.warn("Failed to access data. Retry to access it {} times.", errorCount, e);
            } finally {
                if (systemHelper.isForceStop()) {
                    finishCrawling = true;
                    if (logger.isDebugEnabled()) {
                        logger.debug("Stopped indexUpdater.");
                    }
                }
            }
            if (emptyListCount >= maxEmptyListCount) {
                if (logger.isInfoEnabled()) {
                    logger.info("Terminating indexUpdater. emptyListCount is over {}.", maxEmptyListCount);
                }
                // terminate crawling
                finishCrawling = true;
                forceStop();
                if (fessConfig.getIndexerThreadDumpEnabledAsBoolean()) {
                    ThreadDumpUtil.printThreadDump();
                }
                org.codelibs.fess.exec.Crawler.addError("QueueTimeout");
            }
            if (!ComponentUtil.available()) {
                logger.info("IndexUpdater is terminated.");
                forceStop();
                break;
            }
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Finished indexUpdater.");
        }
    } catch (final ContainerNotAvailableException e) {
        if (logger.isDebugEnabled()) {
            logger.error("IndexUpdater is terminated.", e);
        } else if (logger.isInfoEnabled()) {
            logger.info("IndexUpdater is terminated.");
        }
        forceStop();
    } catch (final Throwable t) {
        if (ComponentUtil.available()) {
            logger.error("IndexUpdater is terminated.", t);
        } else if (logger.isDebugEnabled()) {
            logger.error("IndexUpdater is terminated.", t);
            org.codelibs.fess.exec.Crawler.addError(t.getClass().getSimpleName());
        } else if (logger.isInfoEnabled()) {
            logger.info("IndexUpdater is terminated.");
            org.codelibs.fess.exec.Crawler.addError(t.getClass().getSimpleName());
        }
        forceStop();
    } finally {
        intervalControlHelper.setCrawlerRunning(true);
    }
    if (logger.isInfoEnabled()) {
        logger.info("[EXEC TIME] index update time: {}ms", executeTime);
    }
}
Also used : ThreadUtil(org.codelibs.core.lang.ThreadUtil) Constants(org.codelibs.fess.Constants) MemoryUtil(org.codelibs.fess.util.MemoryUtil) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) FessSystemException(org.codelibs.fess.exception.FessSystemException) DataService(org.codelibs.fess.crawler.service.DataService) EsDataService(org.codelibs.fess.crawler.service.impl.EsDataService) SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient) Transformer(org.codelibs.fess.crawler.transformer.Transformer) ArrayList(java.util.ArrayList) PreDestroy(javax.annotation.PreDestroy) IngestFactory(org.codelibs.fess.ingest.IngestFactory) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) SortOrder(org.opensearch.search.sort.SortOrder) EsAccessResult(org.codelibs.fess.crawler.entity.EsAccessResult) EsUrlQueue(org.codelibs.fess.crawler.entity.EsUrlQueue) Map(java.util.Map) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) FavoriteLogBhv(org.codelibs.fess.es.log.exbhv.FavoriteLogBhv) IntervalControlHelper(org.codelibs.fess.helper.IntervalControlHelper) SearchRequestBuilder(org.opensearch.action.search.SearchRequestBuilder) UrlFilterService(org.codelibs.fess.crawler.service.UrlFilterService) Crawler(org.codelibs.fess.crawler.Crawler) QueryBuilders(org.opensearch.index.query.QueryBuilders) ClickLogBhv(org.codelibs.fess.es.log.exbhv.ClickLogBhv) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) Resource(javax.annotation.Resource) StringUtil(org.codelibs.core.lang.StringUtil) Consumer(java.util.function.Consumer) UrlQueueService(org.codelibs.fess.crawler.service.UrlQueueService) List(java.util.List) Logger(org.apache.logging.log4j.Logger) QueryBuilder(org.opensearch.index.query.QueryBuilder) SearchLogHelper(org.codelibs.fess.helper.SearchLogHelper) ComponentUtil(org.codelibs.fess.util.ComponentUtil) SystemHelper(org.codelibs.fess.helper.SystemHelper) ThreadDumpUtil(org.codelibs.fess.util.ThreadDumpUtil) PostConstruct(javax.annotation.PostConstruct) AccessResult(org.codelibs.fess.crawler.entity.AccessResult) DocList(org.codelibs.fess.util.DocList) LogManager(org.apache.logging.log4j.LogManager) Ingester(org.codelibs.fess.ingest.Ingester) EsResultList(org.codelibs.fess.crawler.util.EsResultList) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) SearchRequestBuilder(org.opensearch.action.search.SearchRequestBuilder) ArrayList(java.util.ArrayList) QueryBuilder(org.opensearch.index.query.QueryBuilder) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) FessSystemException(org.codelibs.fess.exception.FessSystemException) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) FessSystemException(org.codelibs.fess.exception.FessSystemException) EsAccessResult(org.codelibs.fess.crawler.entity.EsAccessResult) DocList(org.codelibs.fess.util.DocList) IntervalControlHelper(org.codelibs.fess.helper.IntervalControlHelper) EsResultList(org.codelibs.fess.crawler.util.EsResultList)

Example 8 with SearchEngineClient

use of org.codelibs.fess.es.client.SearchEngineClient in project fess by codelibs.

the class KeyMatchHelper method getDocumentList.

protected List<Map<String, Object>> getDocumentList(final KeyMatch keyMatch) {
    final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    return searchEngineClient.getDocumentList(fessConfig.getIndexDocumentSearchIndex(), searchRequestBuilder -> SearchConditionBuilder.builder(searchRequestBuilder.setPreference(Constants.SEARCH_PREFERENCE_LOCAL)).searchRequestType(SearchRequestType.ADMIN_SEARCH).size(keyMatch.getMaxSize()).query(keyMatch.getQuery()).responseFields(new String[] { fessConfig.getIndexFieldDocId() }).build());
}
Also used : SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig)

Example 9 with SearchEngineClient

use of org.codelibs.fess.es.client.SearchEngineClient in project fess by codelibs.

the class IndexingHelper method deleteByQueryBuilder.

protected long deleteByQueryBuilder(final String index, final QueryBuilder queryBuilder) {
    final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
    searchEngineClient.admin().indices().prepareRefresh(index).execute().actionGet();
    final long numOfDeleted = searchEngineClient.deleteByQuery(index, queryBuilder);
    logger.info("Deleted {} old docs.", numOfDeleted);
    return numOfDeleted;
}
Also used : SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient)

Example 10 with SearchEngineClient

use of org.codelibs.fess.es.client.SearchEngineClient in project fess by codelibs.

the class FessCrawlerThread method isContentUpdated.

@Override
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
    if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
        final long startTime = System.currentTimeMillis();
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
        final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
        final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
        final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
        final String url = urlQueue.getUrl();
        ResponseData responseData = null;
        try {
            final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.put(fessConfig.getIndexFieldUrl(), url);
            final List<String> roleTypeList = new ArrayList<>();
            stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
            if (url.startsWith("smb:") || url.startsWith("smb1:") || url.startsWith("file:") || url.startsWith("ftp:")) {
                if (url.endsWith("/")) {
                    // directory
                    return true;
                }
                final PermissionHelper permissionHelper = ComponentUtil.getPermissionHelper();
                if (fessConfig.isSmbRoleFromFile() || fessConfig.isFileRoleFromFile() || fessConfig.isFtpRoleFromFile()) {
                    // head method
                    responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                    if (responseData == null) {
                        return true;
                    }
                    roleTypeList.addAll(permissionHelper.getSmbRoleTypeList(responseData));
                    roleTypeList.addAll(permissionHelper.getFileRoleTypeList(responseData));
                    roleTypeList.addAll(permissionHelper.getFtpRoleTypeList(responseData));
                }
            }
            dataMap.put(fessConfig.getIndexFieldRole(), roleTypeList);
            final String id = crawlingInfoHelper.generateId(dataMap);
            if (logger.isDebugEnabled()) {
                logger.debug("Searching indexed document: {}", id);
            }
            final Map<String, Object> document = indexingHelper.getDocument(searchEngineClient, id, new String[] { fessConfig.getIndexFieldId(), fessConfig.getIndexFieldLastModified(), fessConfig.getIndexFieldAnchor(), fessConfig.getIndexFieldSegment(), fessConfig.getIndexFieldExpires(), fessConfig.getIndexFieldClickCount(), fessConfig.getIndexFieldFavoriteCount() });
            if (document == null) {
                storeChildUrlsToQueue(urlQueue, getChildUrlSet(searchEngineClient, id));
                return true;
            }
            final Date expires = DocumentUtil.getValue(document, fessConfig.getIndexFieldExpires(), Date.class);
            if (expires != null && expires.getTime() < System.currentTimeMillis()) {
                final Object idValue = document.get(fessConfig.getIndexFieldId());
                if (idValue != null && !indexingHelper.deleteDocument(searchEngineClient, idValue.toString())) {
                    logger.debug("Failed to delete expired document: {}", url);
                }
                return true;
            }
            final Date lastModified = DocumentUtil.getValue(document, fessConfig.getIndexFieldLastModified(), Date.class);
            if (lastModified == null) {
                return true;
            }
            urlQueue.setLastModified(lastModified.getTime());
            log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);
            if (responseData == null) {
                // head method
                responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                if (responseData == null) {
                    return true;
                }
            }
            final int httpStatusCode = responseData.getHttpStatusCode();
            if (logger.isDebugEnabled()) {
                logger.debug("Accessing document: {}, status: {}", url, httpStatusCode);
            }
            if (httpStatusCode == 404) {
                storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
                if (!indexingHelper.deleteDocument(searchEngineClient, id)) {
                    logger.debug("Failed to delete 404 document: {}", url);
                }
                return false;
            }
            if (responseData.getLastModified() == null) {
                return true;
            }
            if (responseData.getLastModified().getTime() <= lastModified.getTime() && httpStatusCode == 200) {
                log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
                responseData.setExecutionTime(System.currentTimeMillis() - startTime);
                responseData.setParentUrl(urlQueue.getParentUrl());
                responseData.setSessionId(crawlerContext.getSessionId());
                responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
                processResponse(urlQueue, responseData);
                storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
                final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
                if (documentExpires != null && !indexingHelper.updateDocument(searchEngineClient, id, fessConfig.getIndexFieldExpires(), documentExpires)) {
                    logger.debug("Failed to update {} at {}", fessConfig.getIndexFieldExpires(), url);
                }
                return false;
            }
        } finally {
            if (responseData != null) {
                CloseableUtil.closeQuietly(responseData);
            }
        }
    }
    return true;
}
Also used : DocumentUtil(org.codelibs.fess.util.DocumentUtil) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) Date(java.util.Date) HashMap(java.util.HashMap) SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient) PermissionHelper(org.codelibs.fess.helper.PermissionHelper) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) HashSet(java.util.HashSet) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Map(java.util.Map) LinkedHashSet(java.util.LinkedHashSet) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) LogType(org.codelibs.fess.crawler.log.LogType) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) StringUtil(org.codelibs.core.lang.StringUtil) Set(java.util.Set) ContentNotFoundException(org.codelibs.fess.exception.ContentNotFoundException) DuplicateHostHelper(org.codelibs.fess.helper.DuplicateHostHelper) Collectors(java.util.stream.Collectors) CloseableUtil(org.codelibs.core.io.CloseableUtil) List(java.util.List) Logger(org.apache.logging.log4j.Logger) RequestData(org.codelibs.fess.crawler.entity.RequestData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) LogManager(org.apache.logging.log4j.LogManager) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) HashMap(java.util.HashMap) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient) ArrayList(java.util.ArrayList) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Date(java.util.Date) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) PermissionHelper(org.codelibs.fess.helper.PermissionHelper)

Aggregations

SearchEngineClient (org.codelibs.fess.es.client.SearchEngineClient)18 FessConfig (org.codelibs.fess.mylasta.direction.FessConfig)12 IndexingHelper (org.codelibs.fess.helper.IndexingHelper)7 LogManager (org.apache.logging.log4j.LogManager)4 Logger (org.apache.logging.log4j.Logger)4 StringUtil (org.codelibs.core.lang.StringUtil)4 SystemHelper (org.codelibs.fess.helper.SystemHelper)4 ArrayList (java.util.ArrayList)3 HashMap (java.util.HashMap)3 HashSet (java.util.HashSet)3 List (java.util.List)3 Map (java.util.Map)3 StreamUtil.stream (org.codelibs.core.stream.StreamUtil.stream)3 ComponentUtil (org.codelibs.fess.util.ComponentUtil)3 Set (java.util.Set)2 Consumer (java.util.function.Consumer)2 PostConstruct (javax.annotation.PostConstruct)2 ThreadUtil (org.codelibs.core.lang.ThreadUtil)2 Constants (org.codelibs.fess.Constants)2 ContainerNotAvailableException (org.codelibs.fess.exception.ContainerNotAvailableException)2