Search in sources :

Example 1 with DataService

use of org.codelibs.fess.crawler.service.DataService in project fess-crawler by codelibs.

the class CrawlerTest method setUp.

/**
 * Prepares the test fixture: builds a {@code StandardCrawlerContainer}, registers the
 * crawler components on it by name (prototypes and singletons), and then looks up the
 * {@code crawler}, {@code dataService}, {@code urlQueueService} and {@code fileTransformer}
 * components into the test's fields.
 *
 * @throws Exception if {@code super.setUp()} or the container wiring fails
 */
@Override
protected void setUp() throws Exception {
    super.setUp();
    // Settings handed to the "fileTransformer" singleton below.
    final Map<String, String> featureMap = newHashMap();
    featureMap.put("http://xml.org/sax/features/namespaces", "false");
    // Intentionally left empty; still passed to the transformer.
    final Map<String, String> propertyMap = newHashMap();
    // Element expression -> attribute name pairs; presumably the attribute that carries
    // the child URL for each element type (verify against FileTransformer).
    final Map<String, String> childUrlRuleMap = newHashMap();
    childUrlRuleMap.put("//A", "href");
    childUrlRuleMap.put("//AREA", "href");
    childUrlRuleMap.put("//FRAME", "src");
    childUrlRuleMap.put("//IFRAME", "src");
    childUrlRuleMap.put("//IMG", "src");
    childUrlRuleMap.put("//LINK", "href");
    childUrlRuleMap.put("//SCRIPT", "src");
    container = new StandardCrawlerContainer();
    // One fluent chain registers every component. Several initializers look up components
    // (e.g. "clientConnectionManager", "sitemapsRule", "fileRule") that are only registered
    // further down the chain.
    // NOTE(review): this presumably relies on initializers being run lazily, at lookup
    // time rather than at registration time - confirm against StandardCrawlerContainer.
    container.<HcHttpClient>prototype("internalHttpClient", HcHttpClient.class, client -> {
        client.setCookieSpec(CookieSpecs.BEST_MATCH);
        client.setClientConnectionManager(container.getComponent("clientConnectionManager"));
    }).prototype("httpClient", FaultTolerantClient.class, client -> {
        // Wraps the internal HTTP client; retry count 5, retry interval 500
        // (units not visible here - presumably milliseconds, confirm).
        client.setCrawlerClient(container.getComponent("internalHttpClient"));
        client.setMaxRetryCount(5);
        client.setRetryInterval(500);
    }).prototype("fsClient", FileSystemClient.class).prototype("ruleManager", RuleManagerImpl.class, manager -> {
        // Rules are added in this order: sitemaps rule first, then the default file rule.
        manager.addRule(container.getComponent("sitemapsRule"));
        manager.addRule(container.getComponent("fileRule"));
    }).prototype("accessResult", AccessResultImpl.class).prototype("urlQueue", UrlQueueImpl.class).prototype("crawlerThread", CrawlerThread.class).prototype("crawler", Crawler.class).prototype("urlFilterService", UrlFilterServiceImpl.class).prototype("urlQueueService", UrlQueueServiceImpl.class).prototype("dataService", DataServiceImpl.class).prototype("urlFilter", UrlFilterImpl.class).singleton("urlConvertHelper", UrlConvertHelper.class).singleton("intervalController", DefaultIntervalController.class).singleton("sitemapsHelper", SitemapsHelper.class).singleton("logHelper", LogHelperImpl.class).singleton("encodingHelper", EncodingHelper.class).singleton("contentLengthHelper", ContentLengthHelper.class).singleton("mimeTypeHelper", MimeTypeHelperImpl.class).<FileTransformer>singleton("fileTransformer", FileTransformer.class, transformer -> {
        // Configure the transformer with the three maps built at the top of this method.
        transformer.setName("fileTransformer");
        transformer.setFeatureMap(featureMap);
        transformer.setPropertyMap(propertyMap);
        transformer.setChildUrlRuleMap(childUrlRuleMap);
    }).singleton("dataHelper", MemoryDataHelper.class).singleton("robotsTxtHelper", RobotsTxtHelper.class).<CrawlerClientFactory>singleton("clientFactory", CrawlerClientFactory.class, factory -> {
        // URL pattern -> client: http URLs use "httpClient", file URLs use "fsClient".
        factory.addClient("http:.*", container.getComponent("httpClient"));
        factory.addClient("file:.*", container.getComponent("fsClient"));
    }).singleton("tikaExtractor", TikaExtractor.class).<ExtractorFactory>singleton("extractorFactory", ExtractorFactory.class, factory -> {
        // The same Tika extractor instance handles both plain text and HTML.
        TikaExtractor tikaExtractor = container.getComponent("tikaExtractor");
        factory.addExtractor("text/plain", tikaExtractor);
        factory.addExtractor("text/html", tikaExtractor);
    }).singleton("httpClient", // NOTE(review): "httpClient" is also registered above as a FaultTolerantClient prototype - confirm which registration wins
    HcHttpClient.class).singleton("sitemapsResponseProcessor", // 
    SitemapsResponseProcessor.class).<SitemapsRule>singleton("sitemapsRule", SitemapsRule.class, rule -> {
        // Rule matched on the "url" key with the pattern ".*sitemap.*".
        rule.setResponseProcessor(container.getComponent("sitemapsResponseProcessor"));
        rule.setRuleId("sitemapsRule");
        rule.addRule("url", ".*sitemap.*");
    }).<// 
    DefaultResponseProcessor>singleton("defaultResponseProcessor", DefaultResponseProcessor.class, processor -> {
        // Only 200 is treated as successful and only 304 as not-modified.
        processor.setTransformer(container.getComponent("fileTransformer"));
        processor.setSuccessfulHttpCodes(new int[] { 200 });
        processor.setNotModifiedHttpCodes(new int[] { 304 });
    }).<// 
    RegexRule>singleton("fileRule", RegexRule.class, rule -> {
        // Marked as the default rule; it uses the default response processor.
        rule.setRuleId("fileRule");
        rule.setDefaultRule(true);
        rule.setResponseProcessor(container.getComponent("defaultResponseProcessor"));
    }).<// 
    PoolingHttpClientConnectionManager>singleton("clientConnectionManager", new PoolingHttpClientConnectionManager(5, TimeUnit.MINUTES), manager -> {
        // Pool limits: 200 connections in total, 20 per route.
        manager.setMaxTotal(200);
        manager.setDefaultMaxPerRoute(20);
    });
    // Cache the components the test methods read from fields.
    crawler = container.getComponent("crawler");
    dataService = container.getComponent("dataService");
    urlQueueService = container.getComponent("urlQueueService");
    fileTransformer = container.getComponent("fileTransformer");
}
Also used : StandardCrawlerContainer(org.codelibs.fess.crawler.container.StandardCrawlerContainer) MimeTypeHelperImpl(org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl) HcHttpClient(org.codelibs.fess.crawler.client.http.HcHttpClient) UrlQueueImpl(org.codelibs.fess.crawler.entity.UrlQueueImpl) PlainTestCase(org.dbflute.utflute.core.PlainTestCase) ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) FileSystemClient(org.codelibs.fess.crawler.client.fs.FileSystemClient) DataService(org.codelibs.fess.crawler.service.DataService) CookieSpecs(org.apache.http.client.config.CookieSpecs) SitemapsHelper(org.codelibs.fess.crawler.helper.SitemapsHelper) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) RuleManagerImpl(org.codelibs.fess.crawler.rule.impl.RuleManagerImpl) DataServiceImpl(org.codelibs.fess.crawler.service.impl.DataServiceImpl) RegexRule(org.codelibs.fess.crawler.rule.impl.RegexRule) FaultTolerantClient(org.codelibs.fess.crawler.client.FaultTolerantClient) LogHelperImpl(org.codelibs.fess.crawler.helper.impl.LogHelperImpl) SitemapsRule(org.codelibs.fess.crawler.rule.impl.SitemapsRule) Map(java.util.Map) UrlFilterServiceImpl(org.codelibs.fess.crawler.service.impl.UrlFilterServiceImpl) PoolingHttpClientConnectionManager(org.apache.http.impl.conn.PoolingHttpClientConnectionManager) CrawlerWebServer(org.codelibs.fess.crawler.util.CrawlerWebServer) ContentLengthHelper(org.codelibs.fess.crawler.helper.ContentLengthHelper) EncodingHelper(org.codelibs.fess.crawler.helper.EncodingHelper) RobotsTxtHelper(org.codelibs.fess.crawler.helper.RobotsTxtHelper) AccessResultImpl(org.codelibs.fess.crawler.entity.AccessResultImpl) SitemapsResponseProcessor(org.codelibs.fess.crawler.processor.impl.SitemapsResponseProcessor) UrlFilterImpl(org.codelibs.fess.crawler.filter.impl.UrlFilterImpl) MemoryDataHelper(org.codelibs.fess.crawler.helper.MemoryDataHelper) 
DefaultIntervalController(org.codelibs.fess.crawler.interval.impl.DefaultIntervalController) ResourceUtil(org.codelibs.core.io.ResourceUtil) TikaExtractor(org.codelibs.fess.crawler.extractor.impl.TikaExtractor) File(java.io.File) FileTransformer(org.codelibs.fess.crawler.transformer.impl.FileTransformer) TimeUnit(java.util.concurrent.TimeUnit) UrlQueueService(org.codelibs.fess.crawler.service.UrlQueueService) UrlConvertHelper(org.codelibs.fess.crawler.helper.UrlConvertHelper) UrlQueueServiceImpl(org.codelibs.fess.crawler.service.impl.UrlQueueServiceImpl) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) LogHelperImpl(org.codelibs.fess.crawler.helper.impl.LogHelperImpl) ContentLengthHelper(org.codelibs.fess.crawler.helper.ContentLengthHelper) HcHttpClient(org.codelibs.fess.crawler.client.http.HcHttpClient) DefaultIntervalController(org.codelibs.fess.crawler.interval.impl.DefaultIntervalController) RobotsTxtHelper(org.codelibs.fess.crawler.helper.RobotsTxtHelper) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) StandardCrawlerContainer(org.codelibs.fess.crawler.container.StandardCrawlerContainer) UrlQueueImpl(org.codelibs.fess.crawler.entity.UrlQueueImpl) RuleManagerImpl(org.codelibs.fess.crawler.rule.impl.RuleManagerImpl) PoolingHttpClientConnectionManager(org.apache.http.impl.conn.PoolingHttpClientConnectionManager) FaultTolerantClient(org.codelibs.fess.crawler.client.FaultTolerantClient) FileTransformer(org.codelibs.fess.crawler.transformer.impl.FileTransformer) UrlFilterImpl(org.codelibs.fess.crawler.filter.impl.UrlFilterImpl) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) SitemapsRule(org.codelibs.fess.crawler.rule.impl.SitemapsRule) TikaExtractor(org.codelibs.fess.crawler.extractor.impl.TikaExtractor) 
RegexRule(org.codelibs.fess.crawler.rule.impl.RegexRule) UrlQueueServiceImpl(org.codelibs.fess.crawler.service.impl.UrlQueueServiceImpl)

Example 2 with DataService

use of org.codelibs.fess.crawler.service.DataService in project fess by codelibs.

the class IndexUpdater method run.

/**
 * Runs the index-updater loop.
 *
 * <p>Each pass waits out the remainder of the configured update interval, fetches access
 * results for the tracked session IDs, processes and cleans them up in batches, and sends
 * the accumulated documents through {@code indexingHelper}. The loop ends when crawling is
 * flagged as finished and the last pass left no access results, when too many consecutive
 * fetches come back empty, or when the component container is no longer available.
 *
 * <p>On exit the total execution time is logged at info level.
 *
 * @throws FessSystemException if {@code dataService} has not been set
 */
@Override
public void run() {
    if (dataService == null) {
        throw new FessSystemException("DataService is null.");
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Starting indexUpdater.");
    }
    // Reset the per-run counters.
    executeTime = 0;
    documentSize = 0;
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final long updateInterval = fessConfig.getIndexerWebfsUpdateIntervalAsInteger().longValue();
    final int maxEmptyListCount = fessConfig.getIndexerWebfsMaxEmptyListCountAsInteger();
    final IntervalControlHelper intervalControlHelper = ComponentUtil.getIntervalControlHelper();
    try {
        // Search customizer reused for every fetch: access results whose session ID is in
        // sessionIdList and whose status is OK, starting at offset 0, ordered by creation
        // time ascending. The page size falls back to 1 when the configured cache size is
        // not positive.
        final Consumer<SearchRequestBuilder> cb = builder -> {
            final QueryBuilder queryBuilder = QueryBuilders.boolQuery().filter(QueryBuilders.termsQuery(EsAccessResult.SESSION_ID, sessionIdList)).filter(QueryBuilders.termQuery(EsAccessResult.STATUS, org.codelibs.fess.crawler.Constants.OK_STATUS));
            builder.setQuery(queryBuilder);
            builder.setFrom(0);
            final int maxDocumentCacheSize = fessConfig.getIndexerWebfsMaxDocumentCacheSizeAsInteger();
            builder.setSize(maxDocumentCacheSize <= 0 ? 1 : maxDocumentCacheSize);
            builder.addSort(EsAccessResult.CREATE_TIME, SortOrder.ASC);
        };
        final DocList docList = new DocList();
        final List<EsAccessResult> accessResultList = new ArrayList<>();
        // Timestamp of the start of the previous pass; reused below as an elapsed-time scratch value.
        long updateTime = System.currentTimeMillis();
        // Consecutive failed passes; reset after every successful pass.
        int errorCount = 0;
        // Consecutive passes whose first fetch returned nothing.
        int emptyListCount = 0;
        long cleanupTime = -1;
        // Keep going until crawling is finished AND the last pass left nothing behind.
        while (!finishCrawling || !accessResultList.isEmpty()) {
            try {
                // Snapshot taken now and compared after indexing to decide on session cleanup.
                final int sessionIdListSize = finishedSessionIdList.size();
                intervalControlHelper.setCrawlerRunning(true);
                // Elapsed time since the previous pass started; sleep for the rest of the interval.
                updateTime = System.currentTimeMillis() - updateTime;
                final long interval = updateInterval - updateTime;
                if (interval > 0) {
                    // sleep
                    // 10 sec (default)
                    ThreadUtil.sleep(interval);
                }
                systemHelper.calibrateCpuLoad();
                docList.clear();
                accessResultList.clear();
                intervalControlHelper.delayByRules();
                if (logger.isDebugEnabled()) {
                    logger.debug("Processing documents in IndexUpdater queue.");
                }
                updateTime = System.currentTimeMillis();
                List<EsAccessResult> arList = getAccessResultList(cb, cleanupTime);
                if (arList.isEmpty()) {
                    emptyListCount++;
                } else {
                    // reset
                    emptyListCount = 0;
                }
                // The returned list is cast to EsResultList to read the total hit count.
                long hitCount = ((EsResultList<EsAccessResult>) arList).getTotalHits();
                // Drain until the search reports no more matching access results.
                while (hitCount > 0) {
                    if (arList.isEmpty()) {
                        // Hits are reported but the page is empty: wait the commit margin and
                        // retry with cleanupTime reset (presumably until pending changes
                        // become visible - confirm).
                        ThreadUtil.sleep(fessConfig.getIndexerWebfsCommitMarginTimeAsInteger().longValue());
                        cleanupTime = -1;
                    } else {
                        processAccessResults(docList, accessResultList, arList);
                        cleanupTime = cleanupAccessResults(accessResultList);
                    }
                    arList = getAccessResultList(cb, cleanupTime);
                    hitCount = ((EsResultList<EsAccessResult>) arList).getTotalHits();
                }
                // Send whatever is still buffered in docList after the drain loop.
                if (!docList.isEmpty()) {
                    indexingHelper.sendDocuments(searchEngineClient, docList);
                }
                synchronized (finishedSessionIdList) {
                    // Clean up only if there were finished sessions at the start of this pass
                    // and the list size has not changed since.
                    if (sessionIdListSize != 0 && sessionIdListSize == finishedSessionIdList.size()) {
                        cleanupFinishedSessionData();
                    }
                }
                executeTime += System.currentTimeMillis() - updateTime;
                if (logger.isDebugEnabled()) {
                    logger.debug("Processed documents in IndexUpdater queue.");
                }
                // reset count
                errorCount = 0;
            } catch (final Exception e) {
                // Tolerate failures up to maxErrorCount; beyond that, rethrow to the outer
                // catch blocks, which stop the updater.
                if (errorCount > maxErrorCount) {
                    throw e;
                }
                errorCount++;
                logger.warn("Failed to access data. Retry to access it {} times.", errorCount, e);
            } finally {
                // A forced stop marks crawling as finished so the loop condition can end.
                if (systemHelper.isForceStop()) {
                    finishCrawling = true;
                    if (logger.isDebugEnabled()) {
                        logger.debug("Stopped indexUpdater.");
                    }
                }
            }
            // Too many consecutive empty fetches: give up, force a stop and record "QueueTimeout".
            if (emptyListCount >= maxEmptyListCount) {
                if (logger.isInfoEnabled()) {
                    logger.info("Terminating indexUpdater. emptyListCount is over {}.", maxEmptyListCount);
                }
                // terminate crawling
                finishCrawling = true;
                forceStop();
                if (fessConfig.getIndexerThreadDumpEnabledAsBoolean()) {
                    ThreadDumpUtil.printThreadDump();
                }
                org.codelibs.fess.exec.Crawler.addError("QueueTimeout");
            }
            // Leave immediately once the component container is no longer available.
            if (!ComponentUtil.available()) {
                logger.info("IndexUpdater is terminated.");
                forceStop();
                break;
            }
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Finished indexUpdater.");
        }
    } catch (final ContainerNotAvailableException e) {
        // With debug enabled the exception itself is logged (at error level); otherwise
        // only a plain info line is written.
        if (logger.isDebugEnabled()) {
            logger.error("IndexUpdater is terminated.", e);
        } else if (logger.isInfoEnabled()) {
            logger.info("IndexUpdater is terminated.");
        }
        forceStop();
    } catch (final Throwable t) {
        // NOTE(review): when the container is still available the failure is logged but not
        // recorded via Crawler.addError, unlike the two branches below - confirm this is intended.
        if (ComponentUtil.available()) {
            logger.error("IndexUpdater is terminated.", t);
        } else if (logger.isDebugEnabled()) {
            logger.error("IndexUpdater is terminated.", t);
            org.codelibs.fess.exec.Crawler.addError(t.getClass().getSimpleName());
        } else if (logger.isInfoEnabled()) {
            logger.info("IndexUpdater is terminated.");
            org.codelibs.fess.exec.Crawler.addError(t.getClass().getSimpleName());
        }
        forceStop();
    } finally {
        // Always leave the crawler-running flag set to true, whichever way the loop ended.
        intervalControlHelper.setCrawlerRunning(true);
    }
    if (logger.isInfoEnabled()) {
        logger.info("[EXEC TIME] index update time: {}ms", executeTime);
    }
}
Also used : ThreadUtil(org.codelibs.core.lang.ThreadUtil) Constants(org.codelibs.fess.Constants) MemoryUtil(org.codelibs.fess.util.MemoryUtil) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) FessSystemException(org.codelibs.fess.exception.FessSystemException) DataService(org.codelibs.fess.crawler.service.DataService) EsDataService(org.codelibs.fess.crawler.service.impl.EsDataService) SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient) Transformer(org.codelibs.fess.crawler.transformer.Transformer) ArrayList(java.util.ArrayList) PreDestroy(javax.annotation.PreDestroy) IngestFactory(org.codelibs.fess.ingest.IngestFactory) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) SortOrder(org.opensearch.search.sort.SortOrder) EsAccessResult(org.codelibs.fess.crawler.entity.EsAccessResult) EsUrlQueue(org.codelibs.fess.crawler.entity.EsUrlQueue) Map(java.util.Map) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) FavoriteLogBhv(org.codelibs.fess.es.log.exbhv.FavoriteLogBhv) IntervalControlHelper(org.codelibs.fess.helper.IntervalControlHelper) SearchRequestBuilder(org.opensearch.action.search.SearchRequestBuilder) UrlFilterService(org.codelibs.fess.crawler.service.UrlFilterService) Crawler(org.codelibs.fess.crawler.Crawler) QueryBuilders(org.opensearch.index.query.QueryBuilders) ClickLogBhv(org.codelibs.fess.es.log.exbhv.ClickLogBhv) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) Resource(javax.annotation.Resource) StringUtil(org.codelibs.core.lang.StringUtil) Consumer(java.util.function.Consumer) UrlQueueService(org.codelibs.fess.crawler.service.UrlQueueService) List(java.util.List) Logger(org.apache.logging.log4j.Logger) QueryBuilder(org.opensearch.index.query.QueryBuilder) SearchLogHelper(org.codelibs.fess.helper.SearchLogHelper) ComponentUtil(org.codelibs.fess.util.ComponentUtil) SystemHelper(org.codelibs.fess.helper.SystemHelper) 
ThreadDumpUtil(org.codelibs.fess.util.ThreadDumpUtil) PostConstruct(javax.annotation.PostConstruct) AccessResult(org.codelibs.fess.crawler.entity.AccessResult) DocList(org.codelibs.fess.util.DocList) LogManager(org.apache.logging.log4j.LogManager) Ingester(org.codelibs.fess.ingest.Ingester) EsResultList(org.codelibs.fess.crawler.util.EsResultList) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) SearchRequestBuilder(org.opensearch.action.search.SearchRequestBuilder) ArrayList(java.util.ArrayList) QueryBuilder(org.opensearch.index.query.QueryBuilder) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) FessSystemException(org.codelibs.fess.exception.FessSystemException) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) FessSystemException(org.codelibs.fess.exception.FessSystemException) EsAccessResult(org.codelibs.fess.crawler.entity.EsAccessResult) DocList(org.codelibs.fess.util.DocList) IntervalControlHelper(org.codelibs.fess.helper.IntervalControlHelper) EsResultList(org.codelibs.fess.crawler.util.EsResultList)

Aggregations

Map (java.util.Map)2 DataService (org.codelibs.fess.crawler.service.DataService)2 UrlQueueService (org.codelibs.fess.crawler.service.UrlQueueService)2 File (java.io.File)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 TimeUnit (java.util.concurrent.TimeUnit)1 Consumer (java.util.function.Consumer)1 PostConstruct (javax.annotation.PostConstruct)1 PreDestroy (javax.annotation.PreDestroy)1 Resource (javax.annotation.Resource)1 CookieSpecs (org.apache.http.client.config.CookieSpecs)1 PoolingHttpClientConnectionManager (org.apache.http.impl.conn.PoolingHttpClientConnectionManager)1 LogManager (org.apache.logging.log4j.LogManager)1 Logger (org.apache.logging.log4j.Logger)1 ResourceUtil (org.codelibs.core.io.ResourceUtil)1 StringUtil (org.codelibs.core.lang.StringUtil)1 ThreadUtil (org.codelibs.core.lang.ThreadUtil)1 Constants (org.codelibs.fess.Constants)1 Crawler (org.codelibs.fess.crawler.Crawler)1