use of org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config in project fess by codelibs.
the class FessXpathTransformer method putAdditionalData.
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
// canonical
final String canonicalUrl = getCanonicalUrl(responseData, document);
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl) && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
final Set<RequestData> childUrlSet = new HashSet<>();
childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
logger.info("CANONICAL: {} -> {}", responseData.getUrl(), canonicalUrl);
throw new ChildUrlsException(childUrlSet, this.getClass().getName() + "#putAdditionalData");
}
final FessConfig fessConfig = ComponentUtil.getFessConfig();
final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
String url = responseData.getUrl();
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
url = pathMappingHelper.replaceUrl(sessionId, url);
final String mimeType = responseData.getMimeType();
final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);
String urlEncoding;
final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
if (urlQueue != null && urlQueue.getEncoding() != null) {
urlEncoding = urlQueue.getEncoding();
} else {
urlEncoding = responseData.getCharSet();
}
// cid
final String configId = crawlingConfig.getConfigId();
if (configId != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
}
// expires
if (documentExpires != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
}
// lang
final String lang = systemHelper.normalizeHtmlLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
if (lang != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
}
// title
// content
final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(crawlingConfig, responseData, body, dataMap));
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
String charSet = responseData.getCharSet();
if (charSet == null) {
charSet = Constants.UTF_8;
}
try (final BufferedInputStream is = new BufferedInputStream(responseData.getResponseBody())) {
// cache
putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(is), charSet));
putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
} catch (final Exception e) {
logger.warn("Failed to write a cache: {}:{}", sessionId, responseData, e);
}
} else {
logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(), fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
}
}
// digest
final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), false);
if (StringUtil.isNotBlank(digest)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
} else {
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
}
// segment
putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
// host
putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url));
// site
putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
// filename
final String fileName = getFileName(url, urlEncoding);
if (StringUtil.isNotBlank(fileName)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
}
// url
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
// created
final Date now = systemHelper.getCurrentTime();
putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
// anchor
putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
// mimetype
putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
if (fileTypeHelper != null) {
// filetype
putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
}
// content_length
putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
// last_modified
final Date lastModified = responseData.getLastModified();
if (lastModified != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
// timestamp
putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
} else {
// timestamp
putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
}
// indexingTarget
putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
// boost
putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
// label: labelType
putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeHelper.getMatchedLabelValueSet(url));
// role: roleType
final List<String> roleTypeList = new ArrayList<>();
stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
// virtualHosts
putResultDataBody(dataMap, fessConfig.getIndexFieldVirtualHost(), stream(crawlingConfig.getVirtualHosts()).get(stream -> stream.filter(StringUtil::isNotBlank).collect(Collectors.toList())));
// id
putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
// parentId
String parentUrl = responseData.getParentUrl();
if (StringUtil.isNotBlank(parentUrl)) {
parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
// set again
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
}
// thumbnail
final String thumbnailUrl = getThumbnailUrl(responseData, document);
if (StringUtil.isNotBlank(thumbnailUrl)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldThumbnail(), thumbnailUrl);
}
// from config
final String scriptType = crawlingConfig.getScriptType();
final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
xpathConfigMap.entrySet().stream().filter(e -> !e.getKey().startsWith("default.")).forEach(e -> {
final String key = e.getKey();
final String value = getSingleNodeValue(document, e.getValue(), true);
putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
});
crawlingConfig.getConfigParameterMap(ConfigName.VALUE).entrySet().stream().forEach(e -> {
final String key = e.getKey();
final String value = e.getValue();
putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
});
}
use of org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config in project fess by codelibs.
the class WebFsIndexHelper method doCrawl.
protected void doCrawl(final String sessionId, final List<WebConfig> webConfigList, final List<FileConfig> fileConfigList) {
final int multiprocessCrawlingCount = ComponentUtil.getFessConfig().getCrawlingThreadCount();
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
final FessConfig fessConfig = ComponentUtil.getFessConfig();
final long startTime = System.currentTimeMillis();
final List<String> sessionIdList = new ArrayList<>();
crawlerList.clear();
final List<String> crawlerStatusList = new ArrayList<>();
// Web
for (final WebConfig webConfig : webConfigList) {
final String sid = ComponentUtil.getCrawlingConfigHelper().store(sessionId, webConfig);
// create crawler
final Crawler crawler = ComponentUtil.getComponent(Crawler.class);
crawler.setSessionId(sid);
sessionIdList.add(sid);
final String urlsStr = webConfig.getUrls();
if (StringUtil.isBlank(urlsStr)) {
logger.warn("No target urls. Skipped");
break;
}
// interval time
final int intervalTime = webConfig.getIntervalTime() != null ? webConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_WEB;
((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
final String includedUrlsStr = webConfig.getIncludedUrls() != null ? webConfig.getIncludedUrls() : StringUtil.EMPTY;
final String excludedUrlsStr = webConfig.getExcludedUrls() != null ? webConfig.getExcludedUrls() : StringUtil.EMPTY;
// num of threads
final CrawlerContext crawlerContext = crawler.getCrawlerContext();
final int numOfThread = webConfig.getNumOfThread() != null ? webConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_WEB;
crawlerContext.setNumOfThread(numOfThread);
// depth
final int depth = webConfig.getDepth() != null ? webConfig.getDepth() : -1;
crawlerContext.setMaxDepth(depth);
// max count
final long maxCount = webConfig.getMaxAccessCount() != null ? webConfig.getMaxAccessCount() : maxAccessCount;
crawlerContext.setMaxAccessCount(maxCount);
webConfig.initializeClientFactory(() -> crawler.getClientFactory());
final Map<String, String> configParamMap = webConfig.getConfigParameterMap(ConfigName.CONFIG);
if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Config.CLEANUP_ALL))) {
deleteCrawlData(sid);
} else if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Config.CLEANUP_URL_FILTERS))) {
final EsUrlFilterService urlFilterService = ComponentUtil.getComponent(EsUrlFilterService.class);
try {
urlFilterService.delete(sid);
} catch (final Exception e) {
logger.warn("Failed to delete url filters for {}", sid);
}
}
final DuplicateHostHelper duplicateHostHelper = ComponentUtil.getDuplicateHostHelper();
// set urls
split(urlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(urlValue -> {
if (!urlValue.startsWith("#") && fessConfig.isValidCrawlerWebProtocol(urlValue)) {
final String u = duplicateHostHelper.convert(urlValue);
crawler.addUrl(u);
if (logger.isInfoEnabled()) {
logger.info("Target URL: {}", u);
}
}
}));
// set included urls
split(includedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(urlValue -> {
if (!urlValue.startsWith("#")) {
crawler.addIncludeFilter(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Included URL: {}", urlValue);
}
}
}));
// set excluded urls
split(excludedUrlsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(urlValue -> {
if (!urlValue.startsWith("#")) {
crawler.addExcludeFilter(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Excluded URL: {}", urlValue);
}
}
}));
// failure url
final List<String> excludedUrlList = ComponentUtil.getCrawlingConfigHelper().getExcludedUrlList(webConfig.getConfigId());
if (excludedUrlList != null) {
excludedUrlList.stream().filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(u -> {
final String urlValue = Pattern.quote(u);
crawler.addExcludeFilter(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Excluded URL from failures: {}", urlValue);
}
});
}
if (logger.isDebugEnabled()) {
logger.debug("Crawling {}", urlsStr);
}
crawler.setBackground(true);
crawler.setThreadPriority(crawlerPriority);
crawlerList.add(crawler);
crawlerStatusList.add(Constants.READY);
}
// File
for (final FileConfig fileConfig : fileConfigList) {
final String sid = ComponentUtil.getCrawlingConfigHelper().store(sessionId, fileConfig);
// create crawler
final Crawler crawler = ComponentUtil.getComponent(Crawler.class);
crawler.setSessionId(sid);
sessionIdList.add(sid);
final String pathsStr = fileConfig.getPaths();
if (StringUtil.isBlank(pathsStr)) {
logger.warn("No target uris. Skipped");
break;
}
final int intervalTime = fileConfig.getIntervalTime() != null ? fileConfig.getIntervalTime() : Constants.DEFAULT_INTERVAL_TIME_FOR_FS;
((FessIntervalController) crawler.getIntervalController()).setDelayMillisForWaitingNewUrl(intervalTime);
final String includedPathsStr = fileConfig.getIncludedPaths() != null ? fileConfig.getIncludedPaths() : StringUtil.EMPTY;
final String excludedPathsStr = fileConfig.getExcludedPaths() != null ? fileConfig.getExcludedPaths() : StringUtil.EMPTY;
// num of threads
final CrawlerContext crawlerContext = crawler.getCrawlerContext();
final int numOfThread = fileConfig.getNumOfThread() != null ? fileConfig.getNumOfThread() : Constants.DEFAULT_NUM_OF_THREAD_FOR_FS;
crawlerContext.setNumOfThread(numOfThread);
// depth
final int depth = fileConfig.getDepth() != null ? fileConfig.getDepth() : -1;
crawlerContext.setMaxDepth(depth);
// max count
final long maxCount = fileConfig.getMaxAccessCount() != null ? fileConfig.getMaxAccessCount() : maxAccessCount;
crawlerContext.setMaxAccessCount(maxCount);
fileConfig.initializeClientFactory(() -> crawler.getClientFactory());
final Map<String, String> configParamMap = fileConfig.getConfigParameterMap(ConfigName.CONFIG);
if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Config.CLEANUP_ALL))) {
deleteCrawlData(sid);
} else if (Constants.TRUE.equalsIgnoreCase(configParamMap.get(Config.CLEANUP_URL_FILTERS))) {
final EsUrlFilterService urlFilterService = ComponentUtil.getComponent(EsUrlFilterService.class);
try {
urlFilterService.delete(sid);
} catch (final Exception e) {
logger.warn("Failed to delete url filters for {}", sid);
}
}
// set paths
split(pathsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(urlValue -> {
if (!urlValue.startsWith("#")) {
final String u;
if (!fessConfig.isValidCrawlerFileProtocol(urlValue)) {
if (urlValue.startsWith("/")) {
u = "file:" + urlValue;
} else {
u = "file:/" + urlValue;
}
} else {
u = urlValue;
}
crawler.addUrl(u);
if (logger.isInfoEnabled()) {
logger.info("Target Path: {}", u);
}
}
}));
// set included paths
final AtomicBoolean urlEncodeDisabled = new AtomicBoolean(false);
split(includedPathsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(line -> {
if (!line.startsWith("#")) {
final String urlValue;
if (urlEncodeDisabled.get()) {
urlValue = line;
urlEncodeDisabled.set(false);
} else {
urlValue = systemHelper.encodeUrlFilter(line);
}
crawler.addIncludeFilter(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Included Path: {}", urlValue);
}
} else if (line.startsWith("#DISABLE_URL_ENCODE")) {
urlEncodeDisabled.set(true);
}
}));
// set excluded paths
urlEncodeDisabled.set(false);
split(excludedPathsStr, "[\r\n]").of(stream -> stream.filter(StringUtil::isNotBlank).map(String::trim).forEach(line -> {
if (!line.startsWith("#")) {
final String urlValue;
if (urlEncodeDisabled.get()) {
urlValue = line;
urlEncodeDisabled.set(false);
} else {
urlValue = systemHelper.encodeUrlFilter(line);
}
crawler.addExcludeFilter(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Excluded Path: {}", urlValue);
}
} else if (line.startsWith("#DISABLE_URL_ENCODE")) {
urlEncodeDisabled.set(true);
}
}));
// failure url
final List<String> excludedUrlList = ComponentUtil.getCrawlingConfigHelper().getExcludedUrlList(fileConfig.getConfigId());
if (excludedUrlList != null) {
excludedUrlList.stream().filter(StringUtil::isNotBlank).map(String::trim).distinct().forEach(u -> {
final String urlValue = Pattern.quote(u);
crawler.addExcludeFilter(urlValue);
if (logger.isInfoEnabled()) {
logger.info("Excluded Path from failures: {}", urlValue);
}
});
}
if (logger.isDebugEnabled()) {
logger.debug("Crawling {}", pathsStr);
}
crawler.setBackground(true);
crawler.setThreadPriority(crawlerPriority);
crawlerList.add(crawler);
crawlerStatusList.add(Constants.READY);
}
// run index update
final IndexUpdater indexUpdater = ComponentUtil.getIndexUpdater();
indexUpdater.setName("IndexUpdater");
indexUpdater.setPriority(indexUpdaterPriority);
indexUpdater.setSessionIdList(sessionIdList);
indexUpdater.setDaemon(true);
indexUpdater.setCrawlerList(crawlerList);
getAvailableBoostDocumentRuleList().forEach(rule -> {
indexUpdater.addDocBoostMatcher(new org.codelibs.fess.indexer.DocBoostMatcher(rule));
});
indexUpdater.start();
int startedCrawlerNum = 0;
int activeCrawlerNum = 0;
while (startedCrawlerNum < crawlerList.size()) {
// Force to stop crawl
if (systemHelper.isForceStop()) {
for (final Crawler crawler : crawlerList) {
crawler.stop();
}
break;
}
if (activeCrawlerNum < multiprocessCrawlingCount) {
// start crawling
crawlerList.get(startedCrawlerNum).execute();
crawlerStatusList.set(startedCrawlerNum, Constants.RUNNING);
startedCrawlerNum++;
activeCrawlerNum++;
ThreadUtil.sleep(crawlingExecutionInterval);
continue;
}
// check status
for (int i = 0; i < startedCrawlerNum; i++) {
if (crawlerList.get(i).getCrawlerContext().getStatus() == CrawlerStatus.DONE && Constants.RUNNING.equals(crawlerStatusList.get(i))) {
crawlerList.get(i).awaitTermination();
crawlerStatusList.set(i, Constants.DONE);
final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
indexUpdater.addFinishedSessionId(sid);
activeCrawlerNum--;
}
}
ThreadUtil.sleep(crawlingExecutionInterval);
}
boolean finishedAll = false;
while (!finishedAll) {
finishedAll = true;
for (int i = 0; i < crawlerList.size(); i++) {
crawlerList.get(i).awaitTermination(crawlingExecutionInterval);
if (crawlerList.get(i).getCrawlerContext().getStatus() == CrawlerStatus.DONE && !Constants.DONE.equals(crawlerStatusList.get(i))) {
crawlerStatusList.set(i, Constants.DONE);
final String sid = crawlerList.get(i).getCrawlerContext().getSessionId();
indexUpdater.addFinishedSessionId(sid);
}
if (!Constants.DONE.equals(crawlerStatusList.get(i))) {
finishedAll = false;
}
}
}
crawlerList.clear();
crawlerStatusList.clear();
// put cralwing info
final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
final long execTime = System.currentTimeMillis() - startTime;
crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_CRAWLING_EXEC_TIME, Long.toString(execTime));
if (logger.isInfoEnabled()) {
logger.info("[EXEC TIME] crawling time: {}ms", execTime);
}
indexUpdater.setFinishCrawling(true);
try {
indexUpdater.join();
} catch (final InterruptedException e) {
logger.warn("Interrupted index update.", e);
}
crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_INDEX_EXEC_TIME, Long.toString(indexUpdater.getExecuteTime()));
crawlingInfoHelper.putToInfoMap(Constants.WEB_FS_INDEX_SIZE, Long.toString(indexUpdater.getDocumentSize()));
if (systemHelper.isForceStop()) {
return;
}
for (final String sid : sessionIdList) {
// remove config
ComponentUtil.getCrawlingConfigHelper().remove(sid);
deleteCrawlData(sid);
}
}
Aggregations