Search in sources :

Example 1 with FessIntervalController

use of org.codelibs.fess.crawler.interval.FessIntervalController in project fess by codelibs.

the class WebFsIndexHelper method doCrawl.

protected void doCrawl(final String sessionId, final List<WebConfig> webConfigList, final List<FileConfig> fileConfigList) {
    final int multiprocessCrawlingCount = ComponentUtil.getFessConfig().getCrawlingThreadCount();
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final long startTime = System.currentTimeMillis();
    final List<String> sessionIdList = new ArrayList<>();
    crawlerList.clear();
    final List<String> crawlerStatusList = new ArrayList<>();
    // Web
    for (final WebConfig webConfig : webConfigList) {
        final String sid = ComponentUtil.getCrawlingConfigHelper().store(sessionId, webConfig);
        // create crawler
        final Crawler crawler = ComponentUtil.getComponent(Crawler.class);
        crawler.setSessionId(sid);
        sessionIdList.add(sid);
        final String urlsStr = webConfig.getUrls();
        if (StringUtil.isBlank(urlsStr)) {
            logger.warn("No target urls. Skipped");
            break;
        }
        // interval time
        final int intervalTime = webConfig.getIntervalTime() != null ? webConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_WEB;
        ((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
        final String includedUrlsStr = webConfig.getIncludedUrls() != null ? webConfig.getIncludedUrls() : StringUtil.EMPTY;
        final String excludedUrlsStr = webConfig.getExcludedUrls() != null ? webConfig.getExcludedUrls() : StringUtil.EMPTY;
        // num of threads
        final CrawlerContext crawlerContext = crawler.getCrawlerContext();
        final int numOfThread = webConfig.getNumOfThread() != null ? webConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_WEB;
        crawlerContext.setNumOfThread(numOfThread);
        // depth
        final int depth = webConfig.getDepth() != null ? webConfig.getDepth() : -1;
        crawlerContext.setMaxDepth(depth);
        // max count
        final long maxCount = webConfig.getMaxAccessCount() != null ? webConfig.getMaxAccessCount() : maxAccessCount;
        crawlerContext.setMaxAccessCount(maxCount);
        webConfig.initializeClientFactory(() -> crawler.getClientFactory());
        final Map<String, String> configParamMap = webConfig.getConfigParameterMap(ConfigName.CONFIG);
        if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Config.CLEANUP_ALL))) {
            deleteCrawlData(sid);
        } else if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Config.CLEANUP_URL_FILTERS))) {
            final EsUrlFilterService urlFilterService = ComponentUtil.getComponent(EsUrlFilterService.class);
            try {
                urlFilterService.delete(sid);
            } catch (final Exception e) {
                logger.warn("Failed to delete url filters for {}", sid);
            }
        }
        final DuplicateHostHelper duplicateHostHelper = ComponentUtil.getDuplicateHostHelper();
        // set urls
        split(urlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(urlValue -> {
            if (!urlValue.startsWith("#") && fessConfig.isValidCrawlerWebProtocol(urlValue)) {
                final String u = duplicateHostHelper.convert(urlValue);
                crawler.addUrl(u);
                if (logger.isInfoEnabled()) {
                    logger.info("Target URL: {}", u);
                }
            }
        }));
        // set included urls
        split(includedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(urlValue -> {
            if (!urlValue.startsWith("#")) {
                crawler.addIncludeFilter(urlValue);
                if (logger.isInfoEnabled()) {
                    logger.info("Included URL: {}", urlValue);
                }
            }
        }));
        // set excluded urls
        split(excludedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(urlValue -> {
            if (!urlValue.startsWith("#")) {
                crawler.addExcludeFilter(urlValue);
                if (logger.isInfoEnabled()) {
                    logger.info("Excluded URL: {}", urlValue);
                }
            }
        }));
        // failure url
        final List<String> excludedUrlList = ComponentUtil.getCrawlingConfigHelper().getExcludedUrlList(webConfig.getConfigId());
        if (excludedUrlList != null) {
            excludedUrlList.stream().filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(u -> {
                final String urlValue = Pattern.quote(u);
                crawler.addExcludeFilter(urlValue);
                if (logger.isInfoEnabled()) {
                    logger.info("Excluded URL from failures: {}", urlValue);
                }
            });
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Crawling {}", urlsStr);
        }
        crawler.setBackground(true);
        crawler.setThreadPriority(crawlerPriority);
        crawlerList.add(crawler);
        crawlerStatusList.add(Constants.READY);
    }
    // File
    for (final FileConfig fileConfig : fileConfigList) {
        final String sid = ComponentUtil.getCrawlingConfigHelper().store(sessionId, fileConfig);
        // create crawler
        final Crawler crawler = ComponentUtil.getComponent(Crawler.class);
        crawler.setSessionId(sid);
        sessionIdList.add(sid);
        final String pathsStr = fileConfig.getPaths();
        if (StringUtil.isBlank(pathsStr)) {
            logger.warn("No target uris. Skipped");
            break;
        }
        final int intervalTime = fileConfig.getIntervalTime() != null ? fileConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_FS;
        ((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
        final String includedPathsStr = fileConfig.getIncludedPaths() != null ? fileConfig.getIncludedPaths() : StringUtil.EMPTY;
        final String excludedPathsStr = fileConfig.getExcludedPaths() != null ? fileConfig.getExcludedPaths() : StringUtil.EMPTY;
        // num of threads
        final CrawlerContext crawlerContext = crawler.getCrawlerContext();
        final int numOfThread = fileConfig.getNumOfThread() != null ? fileConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_FS;
        crawlerContext.setNumOfThread(numOfThread);
        // depth
        final int depth = fileConfig.getDepth() != null ? fileConfig.getDepth() : -1;
        crawlerContext.setMaxDepth(depth);
        // max count
        final long maxCount = fileConfig.getMaxAccessCount() != null ? fileConfig.getMaxAccessCount() : maxAccessCount;
        crawlerContext.setMaxAccessCount(maxCount);
        fileConfig.initializeClientFactory(() -> crawler.getClientFactory());
        final Map<String, String> configParamMap = fileConfig.getConfigParameterMap(ConfigName.CONFIG);
        if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Config.CLEANUP_ALL))) {
            deleteCrawlData(sid);
        } else if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Config.CLEANUP_URL_FILTERS))) {
            final EsUrlFilterService urlFilterService = ComponentUtil.getComponent(EsUrlFilterService.class);
            try {
                urlFilterService.delete(sid);
            } catch (final Exception e) {
                logger.warn("Failed to delete url filters for {}", sid);
            }
        }
        // set paths
        split(pathsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(urlValue -> {
            if (!urlValue.startsWith("#")) {
                final String u;
                if (!fessConfig.isValidCrawlerFileProtocol(urlValue)) {
                    if (urlValue.startsWith("/")) {
                        u = "file:" + urlValue;
                    } else {
                        u = "file:/" + urlValue;
                    }
                } else {
                    u = urlValue;
                }
                crawler.addUrl(u);
                if (logger.isInfoEnabled()) {
                    logger.info("Target Path: {}", u);
                }
            }
        }));
        // set included paths
        final AtomicBoolean urlEncodeDisabled = new AtomicBoolean(false);
        split(includedPathsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(line -> {
            if (!line.startsWith("#")) {
                final String urlValue;
                if (urlEncodeDisabled.get()) {
                    urlValue = line;
                    urlEncodeDisabled.set(false);
                } else {
                    urlValue = systemHelper.encodeUrlFilter(line);
                }
                crawler.addIncludeFilter(urlValue);
                if (logger.isInfoEnabled()) {
                    logger.info("Included Path: {}", urlValue);
                }
            } else if (line.startsWith("#DISABLE_URL_ENCODE")) {
                urlEncodeDisabled.set(true);
            }
        }));
        // set excluded paths
        urlEncodeDisabled.set(false);
        split(excludedPathsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(line -> {
            if (!line.startsWith("#")) {
                final String urlValue;
                if (urlEncodeDisabled.get()) {
                    urlValue = line;
                    urlEncodeDisabled.set(false);
                } else {
                    urlValue = systemHelper.encodeUrlFilter(line);
                }
                crawler.addExcludeFilter(urlValue);
                if (logger.isInfoEnabled()) {
                    logger.info("Excluded Path: {}", urlValue);
                }
            } else if (line.startsWith("#DISABLE_URL_ENCODE")) {
                urlEncodeDisabled.set(true);
            }
        }));
        // failure url
        final List<String> excludedUrlList = ComponentUtil.getCrawlingConfigHelper().getExcludedUrlList(fileConfig.getConfigId());
        if (excludedUrlList != null) {
            excludedUrlList.stream().filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(u -> {
                final String urlValue = Pattern.quote(u);
                crawler.addExcludeFilter(urlValue);
                if (logger.isInfoEnabled()) {
                    logger.info("Excluded Path from failures: {}", urlValue);
                }
            });
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Crawling {}", pathsStr);
        }
        crawler.setBackground(true);
        crawler.setThreadPriority(crawlerPriority);
        crawlerList.add(crawler);
        crawlerStatusList.add(Constants.READY);
    }
    // run index update
    final IndexUpdater indexUpdater = ComponentUtil.getIndexUpdater();
    indexUpdater.setName("IndexUpdater");
    indexUpdater.setPriority(indexUpdaterPriority);
    indexUpdater.setSessionIdList(sessionIdList);
    indexUpdater.setDaemon(true);
    indexUpdater.setCrawlerList(crawlerList);
    getAvailableBoostDocumentRuleList().forEach(rule -> {
        indexUpdater.addDocBoostMatcher(new org.codelibs.fess.indexer.DocBoostMatcher(rule));
    });
    indexUpdater.start();
    int startedCrawlerNum = 0;
    int activeCrawlerNum = 0;
    while (startedCrawlerNum < crawlerList.size()) {
        // Force to stop crawl
        if (systemHelper.isForceStop()) {
            for (final Crawler crawler : crawlerList) {
                crawler.stop();
            }
            break;
        }
        if (activeCrawlerNum < multiprocessCrawlingCount) {
            // start crawling
            crawlerList.get(startedCrawlerNum).execute();
            crawlerStatusList.set(startedCrawlerNum, Constants.RUNNING);
            startedCrawlerNum++;
            activeCrawlerNum++;
            ThreadUtil.sleep(crawlingExecutionInterval);
            continue;
        }
        // check status
        for (int i = 0; i < startedCrawlerNum; i++) {
            if (crawlerList.get(i).getCrawlerContext().getStatus() == CrawlerStatus.DONE && Constants.RUNNING.equals(crawlerStatusList.get(i))) {
                crawlerList.get(i).awaitTermination();
                crawlerStatusList.set(i, Constants.DONE);
                final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
                indexUpdater.addFinishedSessionId(sid);
                activeCrawlerNum--;
            }
        }
        ThreadUtil.sleep(crawlingExecutionInterval);
    }
    boolean finishedAll = false;
    while (!finishedAll) {
        finishedAll = true;
        for (int i = 0; i < crawlerList.size(); i++) {
            crawlerList.get(i).awaitTermination(crawlingExecutionInterval);
            if (crawlerList.get(i).getCrawlerContext().getStatus() == CrawlerStatus.DONE && !Constants.DONE.equals(crawlerStatusList.get(i))) {
                crawlerStatusList.set(i, Constants.DONE);
                final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
                indexUpdater.addFinishedSessionId(sid);
            }
            if (!Constants.DONE.equals(crawlerStatusList.get(i))) {
                finishedAll = false;
            }
        }
    }
    crawlerList.clear();
    crawlerStatusList.clear();
    // put cralwing info
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final long execTime = System.currentTimeMillis() - startTime;
    crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_CRAWLING_EXEC_TIME, Long.toString(execTime));
    if (logger.isInfoEnabled()) {
        logger.info("[EXEC TIME] crawling time: {}ms", execTime);
    }
    indexUpdater.setFinishCrawling(true);
    try {
        indexUpdater.join();
    } catch (final InterruptedException e) {
        logger.warn("Interrupted index update.", e);
    }
    crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_INDEX_EXEC_TIME, Long.toString(indexUpdater.getExecuteTime()));
    crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_INDEX_SIZE, Long.toString(indexUpdater.getDocumentSize()));
    if (systemHelper.isForceStop()) {
        return;
    }
    for (final String sid : sessionIdList) {
        // remove config
        ComponentUtil.getCrawlingConfigHelper().remove(sid);
        deleteCrawlData(sid);
    }
}
Also used : ThreadUtil(org.codelibs.core.lang.ThreadUtil) Constants(org.codelibs.fess.Constants) CrawlerContext(org.codelibs.fess.crawler.CrawlerContext) BoostDocumentRuleBhv(org.codelibs.fess.es.config.exbhv.BoostDocumentRuleBhv) EsUrlQueueService(org.codelibs.fess.crawler.service.impl.EsUrlQueueService) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) EsDataService(org.codelibs.fess.crawler.service.impl.EsDataService) IndexUpdater(org.codelibs.fess.indexer.IndexUpdater) ConfigName(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName) ArrayList(java.util.ArrayList) CrawlerStatus(org.codelibs.fess.crawler.CrawlerStatus) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) StreamUtil.split(org.codelibs.core.stream.StreamUtil.split) Map(java.util.Map) Config(org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config) FileConfig(org.codelibs.fess.es.config.exentity.FileConfig) WebConfig(org.codelibs.fess.es.config.exentity.WebConfig) Crawler(org.codelibs.fess.crawler.Crawler) EsUrlFilterService(org.codelibs.fess.crawler.service.impl.EsUrlFilterService) StringUtil(org.codelibs.core.lang.StringUtil) BoostDocumentRule(org.codelibs.fess.es.config.exentity.BoostDocumentRule) List(java.util.List) Logger(org.apache.logging.log4j.Logger) ComponentUtil(org.codelibs.fess.util.ComponentUtil) Pattern(java.util.regex.Pattern) Collections(java.util.Collections) LogManager(org.apache.logging.log4j.LogManager) FessIntervalController(org.codelibs.fess.crawler.interval.FessIntervalController) ArrayList(java.util.ArrayList) WebConfig(org.codelibs.fess.es.config.exentity.WebConfig) FessIntervalController(org.codelibs.fess.crawler.interval.FessIntervalController) EsUrlFilterService(org.codelibs.fess.crawler.service.impl.EsUrlFilterService) StringUtil(org.codelibs.core.lang.StringUtil) IndexUpdater(org.codelibs.fess.indexer.IndexUpdater) FileConfig(org.codelibs.fess.es.config.exentity.FileConfig) Crawler(org.codelibs.fess.crawler.Crawler) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) CrawlerContext(org.codelibs.fess.crawler.CrawlerContext) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean)

Aggregations

ArrayList (java.util.ArrayList)1 Collections (java.util.Collections)1 List (java.util.List)1 Map (java.util.Map)1 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)1 Pattern (java.util.regex.Pattern)1 LogManager (org.apache.logging.log4j.LogManager)1 Logger (org.apache.logging.log4j.Logger)1 StringUtil (org.codelibs.core.lang.StringUtil)1 ThreadUtil (org.codelibs.core.lang.ThreadUtil)1 StreamUtil.split (org.codelibs.core.stream.StreamUtil.split)1 Constants (org.codelibs.fess.Constants)1 Crawler (org.codelibs.fess.crawler.Crawler)1 CrawlerContext (org.codelibs.fess.crawler.CrawlerContext)1 CrawlerStatus (org.codelibs.fess.crawler.CrawlerStatus)1 FessIntervalController (org.codelibs.fess.crawler.interval.FessIntervalController)1 EsDataService (org.codelibs.fess.crawler.service.impl.EsDataService)1 EsUrlFilterService (org.codelibs.fess.crawler.service.impl.EsUrlFilterService)1 EsUrlQueueService (org.codelibs.fess.crawler.service.impl.EsUrlQueueService)1 BoostDocumentRuleBhv (org.codelibs.fess.es.config.exbhv.BoostDocumentRuleBhv)1