Search in sources :

Example 1 with Config

use of org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config in project fess by codelibs.

From the class FessXpathTransformer, method putAdditionalData.

/**
 * Fills {@code dataMap} with the index fields derived from a crawled response and its parsed DOM.
 *
 * <p>If a canonical URL is found that differs from the response URL and passes both validity
 * checks, nothing is written to {@code dataMap}; a {@link ChildUrlsException} carrying that
 * canonical URL is thrown instead. Otherwise the fields are written in the order they appear
 * below, ending with the fields configured through the crawling config's XPATH and VALUE
 * parameter maps.
 *
 * <p>The statement order matters: {@code generateId(dataMap)} is invoked while {@code dataMap}
 * is in a specific state (see the {@code id} and {@code parentId} sections).
 *
 * @param dataMap map handed to {@code putResultDataBody}/{@code putResultDataWithTemplate} for every field
 * @param responseData crawled response supplying URL, session id, MIME type, charset, body and metadata
 * @param document parsed document the XPath lookups are evaluated against
 * @throws ChildUrlsException when a valid canonical URL different from the response URL is found
 */
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
    // canonical
    // A differing, valid canonical URL aborts processing of this page: the canonical URL is
    // reported as the only child URL via the exception below.
    final String canonicalUrl = getCanonicalUrl(responseData, document);
    if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl) && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
        final Set<RequestData> childUrlSet = new HashSet<>();
        childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
        logger.info("CANONICAL: {} -> {}", responseData.getUrl(), canonicalUrl);
        throw new ChildUrlsException(childUrlSet, this.getClass().getName() + "#putAdditionalData");
    }
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    // The canonical session id is used for path mapping, the segment field and the cache warning.
    final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    // The crawling config is looked up with the response's own session id, not the canonical one.
    final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
    final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
    final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
    final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
    String url = responseData.getUrl();
    // The indexing target is resolved from the original URL, before path mapping rewrites it.
    final String indexingTarget = crawlingConfig.getIndexingTarget(url);
    url = pathMappingHelper.replaceUrl(sessionId, url);
    final String mimeType = responseData.getMimeType();
    final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
    final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);
    // Prefer the encoding recorded on the current URL queue entry; fall back to the response charset.
    String urlEncoding;
    final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
    if (urlQueue != null && urlQueue.getEncoding() != null) {
        urlEncoding = urlQueue.getEncoding();
    } else {
        urlEncoding = responseData.getCharSet();
    }
    // cid
    final String configId = crawlingConfig.getConfigId();
    if (configId != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
    }
    // expires
    if (documentExpires != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
    }
    // lang
    final String lang = systemHelper.normalizeHtmlLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
    if (lang != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
    }
    // title
    // content
    // The raw XPath result is kept in 'body' because the digest fallback below reuses it.
    final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
    putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(crawlingConfig, responseData, body, dataMap));
    // Caching applies when it is enabled (per field config or globally) and the MIME type is supported.
    if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
        // Only bodies with a positive length that does not exceed the configured maximum are cached.
        if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
            String charSet = responseData.getCharSet();
            if (charSet == null) {
                charSet = Constants.UTF_8;
            }
            try (final BufferedInputStream is = new BufferedInputStream(responseData.getResponseBody())) {
                // cache
                putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(is), charSet));
                putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
            } catch (final Exception e) {
                // A cache failure is logged only; the remaining fields are still written.
                logger.warn("Failed to write a cache: {}:{}", sessionId, responseData, e);
            }
        } else {
            // NOTE(review): this branch is also reached when the content length is not positive,
            // although the message only mentions the "too large" case.
            logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(), fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
        }
    }
    // digest
    // Use the XPath-selected digest when it is not blank; otherwise let documentHelper build one from 'body'.
    final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), false);
    if (StringUtil.isNotBlank(digest)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
    } else {
        putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
    }
    // segment
    putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
    // host
    putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url));
    // site
    putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
    // filename
    final String fileName = getFileName(url, urlEncoding);
    if (StringUtil.isNotBlank(fileName)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
    }
    // url
    // This is the path-mapped URL, not the raw response URL.
    putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    // created
    final Date now = systemHelper.getCurrentTime();
    putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
    // anchor
    putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
    // mimetype
    putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
    if (fileTypeHelper != null) {
        // filetype
        putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
    }
    // content_length
    putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
    // last_modified
    // The timestamp field mirrors last_modified when present and falls back to the 'created' time.
    final Date lastModified = responseData.getLastModified();
    if (lastModified != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
    } else {
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
    }
    // indexingTarget
    putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
    // boost
    putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
    // label: labelType
    putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeHelper.getMatchedLabelValueSet(url));
    // role: roleType
    // The config's permissions are copied into a fresh list, which becomes the role field value.
    final List<String> roleTypeList = new ArrayList<>();
    stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
    putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
    // virtualHosts
    // Blank virtual host entries are dropped.
    putResultDataBody(dataMap, fessConfig.getIndexFieldVirtualHost(), stream(crawlingConfig.getVirtualHosts()).get(stream -> stream.filter(StringUtil::isNotBlank).collect(Collectors.toList())));
    // id
    // The id is derived from dataMap as populated up to this point, so it must stay after the puts above.
    putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
    // parentId
    // The URL field is temporarily replaced by the path-mapped parent URL so that generateId(dataMap)
    // produces the parent's id, then restored. Presumably generateId reads the URL field - verify
    // against CrawlingInfoHelper.
    String parentUrl = responseData.getParentUrl();
    if (StringUtil.isNotBlank(parentUrl)) {
        parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
        // set again
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    }
    // thumbnail
    final String thumbnailUrl = getThumbnailUrl(responseData, document);
    if (StringUtil.isNotBlank(thumbnailUrl)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldThumbnail(), thumbnailUrl);
    }
    // from config
    // XPATH entries: keys starting with "default." are skipped; every other entry's value is used as
    // the XPath expression, and the result is stored under the entry's key. The SCRIPT map entry with
    // the same key (possibly null) is passed along as the template.
    final String scriptType = crawlingConfig.getScriptType();
    final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
    xpathConfigMap.entrySet().stream().filter(e -> !e.getKey().startsWith("default.")).forEach(e -> {
        final String key = e.getKey();
        final String value = getSingleNodeValue(document, e.getValue(), true);
        putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
    });
    // VALUE entries: the configured value itself is stored under the entry's key, with the same
    // per-key template lookup.
    crawlingConfig.getConfigParameterMap(ConfigName.VALUE).entrySet().stream().forEach(e -> {
        final String key = e.getKey();
        final String value = e.getValue();
        putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
    });
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) Constants(org.codelibs.fess.Constants) BufferedInputStream(java.io.BufferedInputStream) URL(java.net.URL) Date(java.util.Date) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ConfigName(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Locale(java.util.Locale) Document(org.w3c.dom.Document) Map(java.util.Map) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) Config(org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config) InputStreamUtil(org.codelibs.core.io.InputStreamUtil) ResultData(org.codelibs.fess.crawler.entity.ResultData) Set(java.util.Set) Collectors(java.util.stream.Collectors) List(java.util.List) Logger(org.apache.logging.log4j.Logger) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) RequestData(org.codelibs.fess.crawler.entity.RequestData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) PostConstruct(javax.annotation.PostConstruct) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) XObject(org.apache.xpath.objects.XObject) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) TransformerException(javax.xml.transform.TransformerException) HashMap(java.util.HashMap) SerializeUtil(org.codelibs.core.io.SerializeUtil) CrawlingParameterUtil(org.codelibs.fess.crawler.util.CrawlingParameterUtil) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) 
XpathTransformer(org.codelibs.fess.crawler.transformer.impl.XpathTransformer) Node(org.w3c.dom.Node) PrunedTag(org.codelibs.fess.util.PrunedTag) NamedNodeMap(org.w3c.dom.NamedNodeMap) InputSource(org.xml.sax.InputSource) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) XPath(org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.XPath) NodeList(org.w3c.dom.NodeList) MalformedURLException(java.net.MalformedURLException) StringUtil(org.codelibs.core.lang.StringUtil) DuplicateHostHelper(org.codelibs.fess.helper.DuplicateHostHelper) ValueHolder(org.codelibs.core.misc.ValueHolder) SystemHelper(org.codelibs.fess.helper.SystemHelper) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) DOMParser(org.codelibs.nekohtml.parsers.DOMParser) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Collections(java.util.Collections) LogManager(org.apache.logging.log4j.LogManager) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Date(java.util.Date) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) TransformerException(javax.xml.transform.TransformerException) MalformedURLException(java.net.MalformedURLException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) BufferedInputStream(java.io.BufferedInputStream) RequestData(org.codelibs.fess.crawler.entity.RequestData) 
CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) HashSet(java.util.HashSet)

Example 2 with Config

use of org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config in project fess by codelibs.

From the class WebFsIndexHelper, method doCrawl.

/**
 * Crawls the given web and file configurations and feeds the results to the index updater.
 *
 * <p>Steps, in order:
 * <ol>
 * <li>For every web and file config, store the config under a new session id, create and configure
 *     a {@code Crawler} (interval, threads, depth, max access count, target URLs/paths, include and
 *     exclude filters) and register it in {@code crawlerList}. A config without target URLs/paths is
 *     skipped; the remaining configs are still processed.</li>
 * <li>Start the index updater.</li>
 * <li>Start crawlers one by one, keeping at most {@code getCrawlingThreadCount()} of them active,
 *     until all are started or a force stop is requested.</li>
 * <li>Wait until every crawler is done, then signal the index updater and wait for it.</li>
 * <li>Record execution statistics and, unless a force stop was requested, remove the stored configs
 *     and crawl data of every session created here.</li>
 * </ol>
 *
 * @param sessionId base session id passed to {@code CrawlingConfigHelper.store} for every config
 * @param webConfigList web crawling configurations to run
 * @param fileConfigList file crawling configurations to run
 */
protected void doCrawl(final String sessionId, final List<WebConfig> webConfigList, final List<FileConfig> fileConfigList) {
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final int multiprocessCrawlingCount = fessConfig.getCrawlingThreadCount();
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final long startTime = System.currentTimeMillis();
    final List<String> sessionIdList = new ArrayList<>();
    crawlerList.clear();
    // crawlerStatusList is index-aligned with crawlerList.
    final List<String> crawlerStatusList = new ArrayList<>();
    // Web
    for (final WebConfig webConfig : webConfigList) {
        final String sid = ComponentUtil.getCrawlingConfigHelper().store(sessionId, webConfig);
        // create crawler
        final Crawler crawler = ComponentUtil.getComponent(Crawler.class);
        crawler.setSessionId(sid);
        // Registered before the blank check so that the stored config is removed in the final cleanup.
        sessionIdList.add(sid);
        final String urlsStr = webConfig.getUrls();
        if (StringUtil.isBlank(urlsStr)) {
            logger.warn("No target urls. Skipped");
            // Skip only this config; a 'break' here would silently drop all remaining web configs.
            continue;
        }
        // interval time
        final int intervalTime = webConfig.getIntervalTime() != null ? webConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_WEB;
        ((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
        final String includedUrlsStr = webConfig.getIncludedUrls() != null ? webConfig.getIncludedUrls() : StringUtil.EMPTY;
        final String excludedUrlsStr = webConfig.getExcludedUrls() != null ? webConfig.getExcludedUrls() : StringUtil.EMPTY;
        // num of threads
        final CrawlerContext crawlerContext = crawler.getCrawlerContext();
        final int numOfThread = webConfig.getNumOfThread() != null ? webConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_WEB;
        crawlerContext.setNumOfThread(numOfThread);
        // depth
        final int depth = webConfig.getDepth() != null ? webConfig.getDepth() : -1;
        crawlerContext.setMaxDepth(depth);
        // max count
        final long maxCount = webConfig.getMaxAccessCount() != null ? webConfig.getMaxAccessCount() : maxAccessCount;
        crawlerContext.setMaxAccessCount(maxCount);
        webConfig.initializeClientFactory(() -> crawler.getClientFactory());
        // Optional cleanup requested through config parameters: either all crawl data or only the URL filters.
        final Map<String, String> configParamMap = webConfig.getConfigParameterMap(ConfigName.CONFIG);
        if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Config.CLEANUP_ALL))) {
            deleteCrawlData(sid);
        } else if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Config.CLEANUP_URL_FILTERS))) {
            final EsUrlFilterService urlFilterService = ComponentUtil.getComponent(EsUrlFilterService.class);
            try {
                urlFilterService.delete(sid);
            } catch (final Exception e) {
                // Best effort: keep crawling, but log the cause so the failure can be diagnosed.
                logger.warn("Failed to delete url filters for {}", sid, e);
            }
        }
        final DuplicateHostHelper duplicateHostHelper = ComponentUtil.getDuplicateHostHelper();
        // set urls
        // One URL per line; lines starting with '#' and URLs with an unaccepted protocol are ignored.
        split(urlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(urlValue -> {
            if (!urlValue.startsWith("#") && fessConfig.isValidCrawlerWebProtocol(urlValue)) {
                final String u = duplicateHostHelper.convert(urlValue);
                crawler.addUrl(u);
                if (logger.isInfoEnabled()) {
                    logger.info("Target URL: {}", u);
                }
            }
        }));
        // set included urls
        split(includedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(urlValue -> {
            if (!urlValue.startsWith("#")) {
                crawler.addIncludeFilter(urlValue);
                if (logger.isInfoEnabled()) {
                    logger.info("Included URL: {}", urlValue);
                }
            }
        }));
        // set excluded urls
        split(excludedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(urlValue -> {
            if (!urlValue.startsWith("#")) {
                crawler.addExcludeFilter(urlValue);
                if (logger.isInfoEnabled()) {
                    logger.info("Excluded URL: {}", urlValue);
                }
            }
        }));
        // failure url
        // URLs reported by getExcludedUrlList are excluded literally, hence Pattern.quote.
        final List<String> excludedUrlList = ComponentUtil.getCrawlingConfigHelper().getExcludedUrlList(webConfig.getConfigId());
        if (excludedUrlList != null) {
            excludedUrlList.stream().filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(u -> {
                final String urlValue = Pattern.quote(u);
                crawler.addExcludeFilter(urlValue);
                if (logger.isInfoEnabled()) {
                    logger.info("Excluded URL from failures: {}", urlValue);
                }
            });
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Crawling {}", urlsStr);
        }
        crawler.setBackground(true);
        crawler.setThreadPriority(crawlerPriority);
        crawlerList.add(crawler);
        crawlerStatusList.add(Constants.READY);
    }
    // File
    for (final FileConfig fileConfig : fileConfigList) {
        final String sid = ComponentUtil.getCrawlingConfigHelper().store(sessionId, fileConfig);
        // create crawler
        final Crawler crawler = ComponentUtil.getComponent(Crawler.class);
        crawler.setSessionId(sid);
        // Registered before the blank check so that the stored config is removed in the final cleanup.
        sessionIdList.add(sid);
        final String pathsStr = fileConfig.getPaths();
        if (StringUtil.isBlank(pathsStr)) {
            logger.warn("No target uris. Skipped");
            // Skip only this config; a 'break' here would silently drop all remaining file configs.
            continue;
        }
        // interval time
        final int intervalTime = fileConfig.getIntervalTime() != null ? fileConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_FS;
        ((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
        final String includedPathsStr = fileConfig.getIncludedPaths() != null ? fileConfig.getIncludedPaths() : StringUtil.EMPTY;
        final String excludedPathsStr = fileConfig.getExcludedPaths() != null ? fileConfig.getExcludedPaths() : StringUtil.EMPTY;
        // num of threads
        final CrawlerContext crawlerContext = crawler.getCrawlerContext();
        final int numOfThread = fileConfig.getNumOfThread() != null ? fileConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_FS;
        crawlerContext.setNumOfThread(numOfThread);
        // depth
        final int depth = fileConfig.getDepth() != null ? fileConfig.getDepth() : -1;
        crawlerContext.setMaxDepth(depth);
        // max count
        final long maxCount = fileConfig.getMaxAccessCount() != null ? fileConfig.getMaxAccessCount() : maxAccessCount;
        crawlerContext.setMaxAccessCount(maxCount);
        fileConfig.initializeClientFactory(() -> crawler.getClientFactory());
        // Optional cleanup requested through config parameters: either all crawl data or only the URL filters.
        final Map<String, String> configParamMap = fileConfig.getConfigParameterMap(ConfigName.CONFIG);
        if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Config.CLEANUP_ALL))) {
            deleteCrawlData(sid);
        } else if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Config.CLEANUP_URL_FILTERS))) {
            final EsUrlFilterService urlFilterService = ComponentUtil.getComponent(EsUrlFilterService.class);
            try {
                urlFilterService.delete(sid);
            } catch (final Exception e) {
                // Best effort: keep crawling, but log the cause so the failure can be diagnosed.
                logger.warn("Failed to delete url filters for {}", sid, e);
            }
        }
        // set paths
        // One path per line; entries without an accepted file protocol are turned into "file:" URLs.
        split(pathsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(urlValue -> {
            if (!urlValue.startsWith("#")) {
                final String u;
                if (!fessConfig.isValidCrawlerFileProtocol(urlValue)) {
                    if (urlValue.startsWith("/")) {
                        u = "file:" + urlValue;
                    } else {
                        u = "file:/" + urlValue;
                    }
                } else {
                    u = urlValue;
                }
                crawler.addUrl(u);
                if (logger.isInfoEnabled()) {
                    logger.info("Target Path: {}", u);
                }
            }
        }));
        // set included paths
        // A "#DISABLE_URL_ENCODE" line makes the next non-comment line be added without URL encoding;
        // the flag is reset after that one line.
        final AtomicBoolean urlEncodeDisabled = new AtomicBoolean(false);
        split(includedPathsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(line -> {
            if (!line.startsWith("#")) {
                final String urlValue;
                if (urlEncodeDisabled.get()) {
                    urlValue = line;
                    urlEncodeDisabled.set(false);
                } else {
                    urlValue = systemHelper.encodeUrlFilter(line);
                }
                crawler.addIncludeFilter(urlValue);
                if (logger.isInfoEnabled()) {
                    logger.info("Included Path: {}", urlValue);
                }
            } else if (line.startsWith("#DISABLE_URL_ENCODE")) {
                urlEncodeDisabled.set(true);
            }
        }));
        // set excluded paths
        // Reset so that a trailing directive in the included paths does not leak into the excluded paths.
        urlEncodeDisabled.set(false);
        split(excludedPathsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(line -> {
            if (!line.startsWith("#")) {
                final String urlValue;
                if (urlEncodeDisabled.get()) {
                    urlValue = line;
                    urlEncodeDisabled.set(false);
                } else {
                    urlValue = systemHelper.encodeUrlFilter(line);
                }
                crawler.addExcludeFilter(urlValue);
                if (logger.isInfoEnabled()) {
                    logger.info("Excluded Path: {}", urlValue);
                }
            } else if (line.startsWith("#DISABLE_URL_ENCODE")) {
                urlEncodeDisabled.set(true);
            }
        }));
        // failure url
        // URLs reported by getExcludedUrlList are excluded literally, hence Pattern.quote.
        final List<String> excludedUrlList = ComponentUtil.getCrawlingConfigHelper().getExcludedUrlList(fileConfig.getConfigId());
        if (excludedUrlList != null) {
            excludedUrlList.stream().filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(u -> {
                final String urlValue = Pattern.quote(u);
                crawler.addExcludeFilter(urlValue);
                if (logger.isInfoEnabled()) {
                    logger.info("Excluded Path from failures: {}", urlValue);
                }
            });
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Crawling {}", pathsStr);
        }
        crawler.setBackground(true);
        crawler.setThreadPriority(crawlerPriority);
        crawlerList.add(crawler);
        crawlerStatusList.add(Constants.READY);
    }
    // run index update
    final IndexUpdater indexUpdater = ComponentUtil.getIndexUpdater();
    indexUpdater.setName("IndexUpdater");
    indexUpdater.setPriority(indexUpdaterPriority);
    indexUpdater.setSessionIdList(sessionIdList);
    indexUpdater.setDaemon(true);
    indexUpdater.setCrawlerList(crawlerList);
    getAvailableBoostDocumentRuleList().forEach(rule -> {
        indexUpdater.addDocBoostMatcher(new org.codelibs.fess.indexer.DocBoostMatcher(rule));
    });
    indexUpdater.start();
    // Start crawlers in list order while keeping at most multiprocessCrawlingCount of them active.
    int startedCrawlerNum = 0;
    int activeCrawlerNum = 0;
    while (startedCrawlerNum < crawlerList.size()) {
        // Force to stop crawl
        if (systemHelper.isForceStop()) {
            for (final Crawler crawler : crawlerList) {
                crawler.stop();
            }
            break;
        }
        if (activeCrawlerNum < multiprocessCrawlingCount) {
            // start crawling
            crawlerList.get(startedCrawlerNum).execute();
            crawlerStatusList.set(startedCrawlerNum, Constants.RUNNING);
            startedCrawlerNum++;
            activeCrawlerNum++;
            ThreadUtil.sleep(crawlingExecutionInterval);
            continue;
        }
        // check status
        // All slots are busy: release a slot for every started crawler that has finished.
        for (int i = 0; i < startedCrawlerNum; i++) {
            if (crawlerList.get(i).getCrawlerContext().getStatus() == CrawlerStatus.DONE && Constants.RUNNING.equals(crawlerStatusList.get(i))) {
                crawlerList.get(i).awaitTermination();
                crawlerStatusList.set(i, Constants.DONE);
                final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
                indexUpdater.addFinishedSessionId(sid);
                activeCrawlerNum--;
            }
        }
        ThreadUtil.sleep(crawlingExecutionInterval);
    }
    // Wait until every crawler is marked DONE, reporting each finished session to the index updater once.
    boolean finishedAll = false;
    while (!finishedAll) {
        finishedAll = true;
        for (int i = 0; i < crawlerList.size(); i++) {
            crawlerList.get(i).awaitTermination(crawlingExecutionInterval);
            if (crawlerList.get(i).getCrawlerContext().getStatus() == CrawlerStatus.DONE && !Constants.DONE.equals(crawlerStatusList.get(i))) {
                crawlerStatusList.set(i, Constants.DONE);
                final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
                indexUpdater.addFinishedSessionId(sid);
            }
            if (!Constants.DONE.equals(crawlerStatusList.get(i))) {
                finishedAll = false;
            }
        }
    }
    crawlerList.clear();
    crawlerStatusList.clear();
    // put crawling info
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final long execTime = System.currentTimeMillis() - startTime;
    crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_CRAWLING_EXEC_TIME, Long.toString(execTime));
    if (logger.isInfoEnabled()) {
        logger.info("[EXEC TIME] crawling time: {}ms", execTime);
    }
    indexUpdater.setFinishCrawling(true);
    try {
        indexUpdater.join();
    } catch (final InterruptedException e) {
        // NOTE(review): the interrupt flag is not restored here; the statistics and cleanup
        // statements below still run after an interrupted join.
        logger.warn("Interrupted index update.", e);
    }
    crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_INDEX_EXEC_TIME, Long.toString(indexUpdater.getExecuteTime()));
    crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_INDEX_SIZE, Long.toString(indexUpdater.getDocumentSize()));
    // On a force stop the stored configs and crawl data are left in place.
    if (systemHelper.isForceStop()) {
        return;
    }
    for (final String sid : sessionIdList) {
        // remove config
        ComponentUtil.getCrawlingConfigHelper().remove(sid);
        deleteCrawlData(sid);
    }
}
Also used : ThreadUtil(org.codelibs.core.lang.ThreadUtil) Constants(org.codelibs.fess.Constants) CrawlerContext(org.codelibs.fess.crawler.CrawlerContext) BoostDocumentRuleBhv(org.codelibs.fess.es.config.exbhv.BoostDocumentRuleBhv) EsUrlQueueService(org.codelibs.fess.crawler.service.impl.EsUrlQueueService) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) EsDataService(org.codelibs.fess.crawler.service.impl.EsDataService) IndexUpdater(org.codelibs.fess.indexer.IndexUpdater) ConfigName(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName) ArrayList(java.util.ArrayList) CrawlerStatus(org.codelibs.fess.crawler.CrawlerStatus) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) StreamUtil.split(org.codelibs.core.stream.StreamUtil.split) Map(java.util.Map) Config(org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config) FileConfig(org.codelibs.fess.es.config.exentity.FileConfig) WebConfig(org.codelibs.fess.es.config.exentity.WebConfig) Crawler(org.codelibs.fess.crawler.Crawler) EsUrlFilterService(org.codelibs.fess.crawler.service.impl.EsUrlFilterService) StringUtil(org.codelibs.core.lang.StringUtil) BoostDocumentRule(org.codelibs.fess.es.config.exentity.BoostDocumentRule) List(java.util.List) Logger(org.apache.logging.log4j.Logger) ComponentUtil(org.codelibs.fess.util.ComponentUtil) Pattern(java.util.regex.Pattern) Collections(java.util.Collections) LogManager(org.apache.logging.log4j.LogManager) FessIntervalController(org.codelibs.fess.crawler.interval.FessIntervalController) ArrayList(java.util.ArrayList) WebConfig(org.codelibs.fess.es.config.exentity.WebConfig) FessIntervalController(org.codelibs.fess.crawler.interval.FessIntervalController) EsUrlFilterService(org.codelibs.fess.crawler.service.impl.EsUrlFilterService) StringUtil(org.codelibs.core.lang.StringUtil) IndexUpdater(org.codelibs.fess.indexer.IndexUpdater) FileConfig(org.codelibs.fess.es.config.exentity.FileConfig) Crawler(org.codelibs.fess.crawler.Crawler) 
FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) CrawlerContext(org.codelibs.fess.crawler.CrawlerContext) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean)

Aggregations

ArrayList (java.util.ArrayList)2 Collections (java.util.Collections)2 List (java.util.List)2 Map (java.util.Map)2 LogManager (org.apache.logging.log4j.LogManager)2 Logger (org.apache.logging.log4j.Logger)2 StringUtil (org.codelibs.core.lang.StringUtil)2 Constants (org.codelibs.fess.Constants)2 ConfigName (org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName)2 Config (org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config)2 BufferedInputStream (java.io.BufferedInputStream)1 MalformedURLException (java.net.MalformedURLException)1 URL (java.net.URL)1 Date (java.util.Date)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 LinkedHashMap (java.util.LinkedHashMap)1 Locale (java.util.Locale)1 Set (java.util.Set)1 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)1