Search in sources :

Example 1 with Crawler

use of org.codelibs.fess.crawler.Crawler in project fess by codelibs.

the class WebFsIndexHelper method doCrawl.

protected void doCrawl(final String sessionId, final List<WebConfig> webConfigList, final List<FileConfig> fileConfigList) {
    final int multiprocessCrawlingCount = ComponentUtil.getFessConfig().getCrawlingThreadCount();
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final long startTime = System.currentTimeMillis();
    final List<String> sessionIdList = new ArrayList<>();
    crawlerList.clear();
    final List<String> crawlerStatusList = new ArrayList<>();
    // Web
    for (final WebConfig webConfig : webConfigList) {
        final String sid = crawlingConfigHelper.store(sessionId, webConfig);
        // create crawler
        final Crawler crawler = ComponentUtil.getComponent(Crawler.class);
        crawler.setSessionId(sid);
        sessionIdList.add(sid);
        final String urlsStr = webConfig.getUrls();
        if (StringUtil.isBlank(urlsStr)) {
            logger.warn("No target urls. Skipped");
            break;
        }
        // interval time
        final int intervalTime = webConfig.getIntervalTime() != null ? webConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_WEB;
        ((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
        final String includedUrlsStr = webConfig.getIncludedUrls() != null ? webConfig.getIncludedUrls() : StringUtil.EMPTY;
        final String excludedUrlsStr = webConfig.getExcludedUrls() != null ? webConfig.getExcludedUrls() : StringUtil.EMPTY;
        // num of threads
        final CrawlerContext crawlerContext = crawler.getCrawlerContext();
        final int numOfThread = webConfig.getNumOfThread() != null ? webConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_WEB;
        crawlerContext.setNumOfThread(numOfThread);
        // depth
        final int depth = webConfig.getDepth() != null ? webConfig.getDepth() : -1;
        crawlerContext.setMaxDepth(depth);
        // max count
        final long maxCount = webConfig.getMaxAccessCount() != null ? webConfig.getMaxAccessCount() : maxAccessCount;
        crawlerContext.setMaxAccessCount(maxCount);
        webConfig.initializeClientFactory(crawler.getClientFactory());
        final Map<String, String> configParamMap = webConfig.getConfigParameterMap(ConfigName.CONFIG);
        if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Constants.CONFIG_CLEANUP_ALL))) {
            deleteCrawlData(sid);
        } else if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Constants.CONFIG_CLEANUP_FILTERS))) {
            final EsUrlFilterService urlFilterService = ComponentUtil.getComponent(EsUrlFilterService.class);
            try {
                urlFilterService.delete(sid);
            } catch (final Exception e) {
                logger.warn("Failed to delete url filters for " + sid);
            }
        }
        // set urls
        final String[] urls = urlsStr.split("[\r\n]");
        for (final String u : urls) {
            if (StringUtil.isNotBlank(u)) {
                final String urlValue = u.trim();
                if (!urlValue.startsWith("#") && fessConfig.isValidCrawlerWebProtocol(u)) {
                    crawler.addUrl(urlValue);
                    if (logger.isInfoEnabled()) {
                        logger.info("Target URL: " + urlValue);
                    }
                }
            }
        }
        // set included urls
        final String[] includedUrls = includedUrlsStr.split("[\r\n]");
        for (final String u : includedUrls) {
            if (StringUtil.isNotBlank(u)) {
                final String urlValue = u.trim();
                if (!urlValue.startsWith("#")) {
                    crawler.addIncludeFilter(urlValue);
                    if (logger.isInfoEnabled()) {
                        logger.info("Included URL: " + urlValue);
                    }
                }
            }
        }
        // set excluded urls
        final String[] excludedUrls = excludedUrlsStr.split("[\r\n]");
        for (final String u : excludedUrls) {
            if (StringUtil.isNotBlank(u)) {
                final String urlValue = u.trim();
                if (!urlValue.startsWith("#")) {
                    crawler.addExcludeFilter(urlValue);
                    if (logger.isInfoEnabled()) {
                        logger.info("Excluded URL: " + urlValue);
                    }
                }
            }
        }
        // failure url
        final List<String> excludedUrlList = failureUrlService.getExcludedUrlList(webConfig.getConfigId());
        for (final String u : excludedUrlList) {
            if (StringUtil.isNotBlank(u)) {
                final String urlValue = u.trim();
                crawler.addExcludeFilter(urlValue);
                if (logger.isInfoEnabled()) {
                    logger.info("Excluded URL from failures: " + urlValue);
                }
            }
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Crawling " + urlsStr);
        }
        crawler.setBackground(true);
        crawler.setThreadPriority(crawlerPriority);
        crawlerList.add(crawler);
        crawlerStatusList.add(Constants.READY);
    }
    // File
    for (final FileConfig fileConfig : fileConfigList) {
        final String sid = crawlingConfigHelper.store(sessionId, fileConfig);
        // create crawler
        final Crawler crawler = ComponentUtil.getComponent(Crawler.class);
        crawler.setSessionId(sid);
        sessionIdList.add(sid);
        final String pathsStr = fileConfig.getPaths();
        if (StringUtil.isBlank(pathsStr)) {
            logger.warn("No target uris. Skipped");
            break;
        }
        final int intervalTime = fileConfig.getIntervalTime() != null ? fileConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_FS;
        ((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
        final String includedPathsStr = fileConfig.getIncludedPaths() != null ? fileConfig.getIncludedPaths() : StringUtil.EMPTY;
        final String excludedPathsStr = fileConfig.getExcludedPaths() != null ? fileConfig.getExcludedPaths() : StringUtil.EMPTY;
        // num of threads
        final CrawlerContext crawlerContext = crawler.getCrawlerContext();
        final int numOfThread = fileConfig.getNumOfThread() != null ? fileConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_FS;
        crawlerContext.setNumOfThread(numOfThread);
        // depth
        final int depth = fileConfig.getDepth() != null ? fileConfig.getDepth() : -1;
        crawlerContext.setMaxDepth(depth);
        // max count
        final long maxCount = fileConfig.getMaxAccessCount() != null ? fileConfig.getMaxAccessCount() : maxAccessCount;
        crawlerContext.setMaxAccessCount(maxCount);
        fileConfig.initializeClientFactory(crawler.getClientFactory());
        final Map<String, String> configParamMap = fileConfig.getConfigParameterMap(ConfigName.CONFIG);
        if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Constants.CONFIG_CLEANUP_ALL))) {
            deleteCrawlData(sid);
        } else if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Constants.CONFIG_CLEANUP_FILTERS))) {
            final EsUrlFilterService urlFilterService = ComponentUtil.getComponent(EsUrlFilterService.class);
            try {
                urlFilterService.delete(sid);
            } catch (final Exception e) {
                logger.warn("Failed to delete url filters for " + sid);
            }
        }
        // set paths
        final String[] paths = pathsStr.split("[\r\n]");
        for (String u : paths) {
            if (StringUtil.isNotBlank(u)) {
                u = u.trim();
                if (!u.startsWith("#")) {
                    if (!fessConfig.isValidCrawlerFileProtocol(u)) {
                        if (u.startsWith("/")) {
                            u = "file:" + u;
                        } else {
                            u = "file:/" + u;
                        }
                    }
                    crawler.addUrl(u);
                    if (logger.isInfoEnabled()) {
                        logger.info("Target Path: " + u);
                    }
                }
            }
        }
        // set included paths
        boolean urlEncodeDisabled = false;
        final String[] includedPaths = includedPathsStr.split("[\r\n]");
        for (final String u : includedPaths) {
            if (StringUtil.isNotBlank(u)) {
                final String line = u.trim();
                if (!line.startsWith("#")) {
                    final String urlValue;
                    if (urlEncodeDisabled) {
                        urlValue = line;
                        urlEncodeDisabled = false;
                    } else {
                        urlValue = systemHelper.encodeUrlFilter(line);
                    }
                    crawler.addIncludeFilter(urlValue);
                    if (logger.isInfoEnabled()) {
                        logger.info("Included Path: " + urlValue);
                    }
                } else if (line.startsWith("#DISABLE_URL_ENCODE")) {
                    urlEncodeDisabled = true;
                }
            }
        }
        // set excluded paths
        urlEncodeDisabled = false;
        final String[] excludedPaths = excludedPathsStr.split("[\r\n]");
        for (final String u : excludedPaths) {
            if (StringUtil.isNotBlank(u)) {
                final String line = u.trim();
                if (!line.startsWith("#")) {
                    final String urlValue;
                    if (urlEncodeDisabled) {
                        urlValue = line;
                        urlEncodeDisabled = false;
                    } else {
                        urlValue = systemHelper.encodeUrlFilter(line);
                    }
                    crawler.addExcludeFilter(urlValue);
                    if (logger.isInfoEnabled()) {
                        logger.info("Excluded Path: " + urlValue);
                    }
                } else if (line.startsWith("#DISABLE_URL_ENCODE")) {
                    urlEncodeDisabled = true;
                }
            }
        }
        // failure url
        final List<String> excludedUrlList = failureUrlService.getExcludedUrlList(fileConfig.getConfigId());
        if (excludedUrlList != null) {
            for (final String u : excludedUrlList) {
                if (StringUtil.isNotBlank(u)) {
                    final String urlValue = u.trim();
                    crawler.addExcludeFilter(urlValue);
                    if (logger.isInfoEnabled()) {
                        logger.info("Excluded Path from failures: " + urlValue);
                    }
                }
            }
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Crawling " + pathsStr);
        }
        crawler.setBackground(true);
        crawler.setThreadPriority(crawlerPriority);
        crawlerList.add(crawler);
        crawlerStatusList.add(Constants.READY);
    }
    // run index update
    final IndexUpdater indexUpdater = ComponentUtil.getIndexUpdater();
    indexUpdater.setName("IndexUpdater");
    indexUpdater.setPriority(indexUpdaterPriority);
    indexUpdater.setSessionIdList(sessionIdList);
    indexUpdater.setDaemon(true);
    indexUpdater.setCrawlerList(crawlerList);
    boostDocumentRuleService.getAvailableBoostDocumentRuleList().forEach(rule -> {
        indexUpdater.addDocBoostMatcher(new org.codelibs.fess.indexer.DocBoostMatcher(rule));
    });
    indexUpdater.start();
    int startedCrawlerNum = 0;
    int activeCrawlerNum = 0;
    while (startedCrawlerNum < crawlerList.size()) {
        // Force to stop crawl
        if (systemHelper.isForceStop()) {
            for (final Crawler crawler : crawlerList) {
                crawler.stop();
            }
            break;
        }
        if (activeCrawlerNum < multiprocessCrawlingCount) {
            // start crawling
            crawlerList.get(startedCrawlerNum).execute();
            crawlerStatusList.set(startedCrawlerNum, Constants.RUNNING);
            startedCrawlerNum++;
            activeCrawlerNum++;
            try {
                Thread.sleep(crawlingExecutionInterval);
            } catch (final InterruptedException e) {
                if (logger.isDebugEnabled()) {
                    logger.debug("Interrupted.", e);
                }
            }
            continue;
        }
        // check status
        for (int i = 0; i < startedCrawlerNum; i++) {
            if (crawlerList.get(i).getCrawlerContext().getStatus() == CrawlerStatus.DONE && crawlerStatusList.get(i).equals(Constants.RUNNING)) {
                crawlerList.get(i).awaitTermination();
                crawlerStatusList.set(i, Constants.DONE);
                final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
                indexUpdater.addFinishedSessionId(sid);
                activeCrawlerNum--;
            }
        }
        try {
            Thread.sleep(crawlingExecutionInterval);
        } catch (final InterruptedException e) {
            if (logger.isDebugEnabled()) {
                logger.debug("Interrupted.", e);
            }
        }
    }
    boolean finishedAll = false;
    while (!finishedAll) {
        finishedAll = true;
        for (int i = 0; i < crawlerList.size(); i++) {
            crawlerList.get(i).awaitTermination(crawlingExecutionInterval);
            if (crawlerList.get(i).getCrawlerContext().getStatus() == CrawlerStatus.DONE && !crawlerStatusList.get(i).equals(Constants.DONE)) {
                crawlerStatusList.set(i, Constants.DONE);
                final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
                indexUpdater.addFinishedSessionId(sid);
            }
            if (!crawlerStatusList.get(i).equals(Constants.DONE)) {
                finishedAll = false;
            }
        }
    }
    crawlerList.clear();
    crawlerStatusList.clear();
    // put cralwing info
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final long execTime = System.currentTimeMillis() - startTime;
    crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_CRAWLING_EXEC_TIME, Long.toString(execTime));
    if (logger.isInfoEnabled()) {
        logger.info("[EXEC TIME] crawling time: " + execTime + "ms");
    }
    indexUpdater.setFinishCrawling(true);
    try {
        indexUpdater.join();
    } catch (final InterruptedException e) {
        logger.warn("Interrupted index update.", e);
    }
    crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_INDEX_EXEC_TIME, Long.toString(indexUpdater.getExecuteTime()));
    crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_INDEX_SIZE, Long.toString(indexUpdater.getDocumentSize()));
    if (systemHelper.isForceStop()) {
        return;
    }
    for (final String sid : sessionIdList) {
        // remove config
        crawlingConfigHelper.remove(sid);
        deleteCrawlData(sid);
    }
}
Also used : FileConfig(org.codelibs.fess.es.config.exentity.FileConfig) ArrayList(java.util.ArrayList) WebConfig(org.codelibs.fess.es.config.exentity.WebConfig) FessIntervalController(org.codelibs.fess.crawler.interval.FessIntervalController) Crawler(org.codelibs.fess.crawler.Crawler) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) CrawlerContext(org.codelibs.fess.crawler.CrawlerContext) EsUrlFilterService(org.codelibs.fess.crawler.service.impl.EsUrlFilterService) IndexUpdater(org.codelibs.fess.indexer.IndexUpdater)

Aggregations

ArrayList (java.util.ArrayList)1 Crawler (org.codelibs.fess.crawler.Crawler)1 CrawlerContext (org.codelibs.fess.crawler.CrawlerContext)1 FessIntervalController (org.codelibs.fess.crawler.interval.FessIntervalController)1 EsUrlFilterService (org.codelibs.fess.crawler.service.impl.EsUrlFilterService)1 FileConfig (org.codelibs.fess.es.config.exentity.FileConfig)1 WebConfig (org.codelibs.fess.es.config.exentity.WebConfig)1 IndexUpdater (org.codelibs.fess.indexer.IndexUpdater)1 FessConfig (org.codelibs.fess.mylasta.direction.FessConfig)1