use of org.codelibs.core.lang.StringUtil in project fess by codelibs.
the class FessXpathTransformer method putAdditionalData.
/**
 * Populates {@code dataMap} with the index fields derived from a crawled HTML response.
 *
 * <p>If the document declares a canonical URL that differs from the response URL and passes
 * both validity checks, nothing is written to {@code dataMap}; a {@link ChildUrlsException}
 * carrying the canonical URL is thrown instead.</p>
 *
 * <p>Statement order matters here: {@code crawlingInfoHelper.generateId(dataMap)} is invoked
 * against the map as it has been filled so far, and the URL field is temporarily overwritten
 * while the parent ID is computed.</p>
 *
 * @param dataMap the result map that receives the field values
 * @param responseData the crawler response (URL, session ID, MIME type, body, etc.)
 * @param document the parsed DOM that XPath expressions are evaluated against
 * @throws ChildUrlsException when a valid canonical URL different from the response URL is found
 */
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
// canonical: hand the canonical URL back as a child URL instead of processing this page
final String canonicalUrl = getCanonicalUrl(responseData, document);
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl) && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
final Set<RequestData> childUrlSet = new HashSet<>();
childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
logger.info("CANONICAL: {} -> {}", responseData.getUrl(), canonicalUrl);
throw new ChildUrlsException(childUrlSet, this.getClass().getName() + "#putAdditionalData");
}
final FessConfig fessConfig = ComponentUtil.getFessConfig();
final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
// The canonical session ID is used for path mapping and the segment field; the crawling
// config below is looked up with the raw session ID from the response.
final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
String url = responseData.getUrl();
// The indexing target is resolved from the original URL, before path mapping rewrites it.
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
url = pathMappingHelper.replaceUrl(sessionId, url);
final String mimeType = responseData.getMimeType();
final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);
// URL encoding: prefer the encoding recorded on the current URL queue entry, otherwise
// fall back to the character set of the response.
String urlEncoding;
final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
if (urlQueue != null && urlQueue.getEncoding() != null) {
urlEncoding = urlQueue.getEncoding();
} else {
urlEncoding = responseData.getCharSet();
}
// cid: written only when the crawling config has an ID
final String configId = crawlingConfig.getConfigId();
if (configId != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
}
// expires: written only when an expiry date was computed for this config
if (documentExpires != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
}
// lang: value of the language XPath, normalized by the system helper
final String lang = systemHelper.normalizeHtmlLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
if (lang != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
}
// title (no title field is written in this method)
// content: extracted with the content XPath; prunedContent is passed as the third argument
// NOTE(review): prunedContent is not declared in this method - presumably a field; confirm.
final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(crawlingConfig, responseData, body, dataMap));
// cache: enabled either per config (field parameter equals Constants.TRUE, ignoring case)
// or globally, and only for MIME types the configuration reports as supported
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
// Only bodies with a positive length up to the configured maximum are cached.
if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
String charSet = responseData.getCharSet();
if (charSet == null) {
charSet = Constants.UTF_8;
}
try (final BufferedInputStream is = new BufferedInputStream(responseData.getResponseBody())) {
// cache: the whole response body decoded with charSet, plus a has-cache flag
putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(is), charSet));
putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
} catch (final Exception e) {
// Best effort: a failure to build the cache is logged and the remaining fields are still written.
logger.warn("Failed to write a cache: {}:{}", sessionId, responseData, e);
}
} else {
// NOTE(review): this branch is also reached when the content length is zero or negative,
// in which case the "too large" wording of the message does not describe the actual reason.
logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(), fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
}
}
// digest: use the digest XPath value when it is not blank, otherwise derive one from the body
final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), false);
if (StringUtil.isNotBlank(digest)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
} else {
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
}
// segment: the canonical session ID
putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
// host: derived from the path-mapped URL
putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url));
// site: derived from the path-mapped URL and the URL encoding
putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
// filename: written only when a non-blank name can be derived from the URL
final String fileName = getFileName(url, urlEncoding);
if (StringUtil.isNotBlank(fileName)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
}
// url: the path-mapped URL
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
// created: current time as reported by the system helper
final Date now = systemHelper.getCurrentTime();
putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
// anchor
putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
// mimetype
putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
if (fileTypeHelper != null) {
// filetype: looked up from the MIME type
putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
}
// content_length: stored as a decimal string
putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
// last_modified: written only when the response provides one
final Date lastModified = responseData.getLastModified();
if (lastModified != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
// timestamp: the last-modified date when available
putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
} else {
// timestamp: falls back to the creation time
putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
}
// indexingTarget: stored under a fixed key rather than a configurable field name
putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
// boost
putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
// label: labelType values matched against the path-mapped URL
putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeHelper.getMatchedLabelValueSet(url));
// role: roleType - the config's permissions copied into a list
final List<String> roleTypeList = new ArrayList<>();
stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
// virtualHosts: blank entries are dropped
putResultDataBody(dataMap, fessConfig.getIndexFieldVirtualHost(), stream(crawlingConfig.getVirtualHosts()).get(stream -> stream.filter(StringUtil::isNotBlank).collect(Collectors.toList())));
// id: generated from the map as populated so far
putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
// parentId: the URL field is temporarily replaced by the path-mapped parent URL so that
// generateId(dataMap) runs against it (presumably generateId reads the URL field - confirm),
// and is then restored to this document's URL.
String parentUrl = responseData.getParentUrl();
if (StringUtil.isNotBlank(parentUrl)) {
parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
// set again: restore this document's own URL
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
}
// thumbnail: written only when a non-blank thumbnail URL is found
final String thumbnailUrl = getThumbnailUrl(responseData, document);
if (StringUtil.isNotBlank(thumbnailUrl)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldThumbnail(), thumbnailUrl);
}
// from config: extra fields defined by the crawling config
final String scriptType = crawlingConfig.getScriptType();
final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
// XPath-configured fields: every entry whose key does not start with "default." is evaluated
// against the document and stored through the template/script for the same key.
xpathConfigMap.entrySet().stream().filter(e -> !e.getKey().startsWith("default.")).forEach(e -> {
final String key = e.getKey();
final String value = getSingleNodeValue(document, e.getValue(), true);
putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
});
// Value-configured fields: the configured value is stored through the template/script
// registered for the same key.
crawlingConfig.getConfigParameterMap(ConfigName.VALUE).entrySet().stream().forEach(e -> {
final String key = e.getKey();
final String value = e.getValue();
putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
});
}
use of org.codelibs.core.lang.StringUtil in project fess by codelibs.
the class SamlCredential method getDefaultRolesAsArray.
/**
 * Builds the role names for this credential.
 *
 * <p>Roles are gathered from two places, in this order: the attribute whose name is given by
 * the {@code saml.attribute.role.name} system property (if set and present in
 * {@code attributes}), followed by the comma-separated {@code saml.default.roles} system
 * property. Blank entries are discarded and the remaining ones are trimmed.</p>
 *
 * @return the trimmed, non-blank role names; never {@code null}
 */
protected String[] getDefaultRolesAsArray() {
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final List<String> collected = new ArrayList<>();

    // roles carried by the configured attribute
    final String roleAttributeName = fessConfig.getSystemProperty("saml.attribute.role.name");
    if (StringUtil.isNotBlank(roleAttributeName)) {
        final List<String> attributeRoles = attributes.get(roleAttributeName);
        if (attributeRoles != null) {
            collected.addAll(attributeRoles);
        }
    }

    // roles every user receives by configuration
    final String defaultRoles = fessConfig.getSystemProperty("saml.default.roles");
    if (StringUtil.isNotBlank(defaultRoles)) {
        split(defaultRoles, ",").of(roles -> roles.forEach(collected::add));
    }

    // drop blank entries, trim the rest
    final List<String> cleaned = new ArrayList<>();
    for (final String role : collected) {
        if (StringUtil.isNotBlank(role)) {
            cleaned.add(role.trim());
        }
    }
    return cleaned.toArray(new String[0]);
}
use of org.codelibs.core.lang.StringUtil in project fess by codelibs.
the class SuggestHelper method init.
/**
 * Prepares this helper after construction.
 *
 * <p>Loads the configured suggest field names into the content/tag/role name sets, waits for
 * the search cluster to reach yellow status, builds the {@code suggester} with the configured
 * timeouts, re-registers the supported fields, creates the suggest index when it does not
 * exist, and picks up the popular word helper when one is available.</p>
 */
@PostConstruct
public void init() {
    if (logger.isDebugEnabled()) {
        logger.debug("Initialize {}", this.getClass().getSimpleName());
    }
    fessConfig = ComponentUtil.getFessConfig();

    // configured field names (comma separated, blank entries ignored)
    split(fessConfig.getSuggestFieldContents(), ",").of(names -> names.filter(StringUtil::isNotBlank).forEach(contentFieldNameSet::add));
    split(fessConfig.getSuggestFieldTags(), ",").of(names -> names.filter(StringUtil::isNotBlank).forEach(tagFieldNameSet::add));
    split(fessConfig.getSuggestFieldRoles(), ",").of(names -> names.filter(StringUtil::isNotBlank).forEach(roleFieldNameSet::add));
    // the raw configuration value is wrapped as-is here (it is not split on commas)
    final String[] contentFields = stream(fessConfig.getSuggestFieldContents()).get(values -> values.toArray(String[]::new));
    contentFieldList = Arrays.asList(contentFields);

    // block until the cluster is at least yellow
    final SearchEngineClient client = ComponentUtil.getSearchEngineClient();
    client.admin().cluster().prepareHealth().setWaitForYellowStatus().execute().actionGet(fessConfig.getIndexHealthTimeout());

    // suggester settings
    final SuggestSettingsBuilder builder = SuggestSettings.builder();
    builder.addInitialSettings("elasticsearch.type", fessConfig.getFesenType());
    builder.bulkTimeout(fessConfig.getIndexBulkTimeout());
    builder.clusterTimeout(fessConfig.getIndexHealthTimeout());
    builder.indexTimeout(fessConfig.getIndexIndexTimeout());
    builder.indicesTimeout(fessConfig.getIndexIndicesTimeout());
    builder.searchTimeout(fessConfig.getIndexSearchTimeout());
    suggester = Suggester.builder().settings(builder).build(client, fessConfig.getIndexDocumentSuggestIndex());

    // reset the supported fields, then register the configured ones one by one
    suggester.settings().array().delete(SuggestSettings.DefaultKeys.SUPPORTED_FIELDS);
    split(fessConfig.getSuggestFieldIndexContents(), ",").of(names -> {
        names.filter(StringUtil::isNotBlank).forEach(fieldName -> {
            try {
                suggester.settings().array().add(SuggestSettings.DefaultKeys.SUPPORTED_FIELDS, fieldName);
            } catch (final SuggestSettingsException e) {
                // a field that cannot be registered is logged and skipped
                logger.warn("Failed to add {}", fieldName, e);
            }
        });
    });
    suggester.createIndexIfNothing();

    if (ComponentUtil.hasPopularWordHelper()) {
        popularWordHelper = ComponentUtil.getPopularWordHelper();
    }
}
use of org.codelibs.core.lang.StringUtil in project fess by codelibs.
the class SystemHelper method setLogLevel.
/**
 * Applies the given log level to the configured application packages.
 *
 * <p>The level name is resolved with {@code Level.toLevel}, using {@code Level.WARN} as the
 * default. The resolved level is stored in the {@code Constants.FESS_LOG_LEVEL} system property
 * and set on every non-empty, trimmed entry of the comma-separated logging package list.</p>
 *
 * @param level the requested level name
 */
public void setLogLevel(final String level) {
    final Level resolvedLevel = Level.toLevel(level, Level.WARN);
    System.setProperty(Constants.FESS_LOG_LEVEL, resolvedLevel.toString());

    final String appPackages = ComponentUtil.getFessConfig().getLoggingAppPackages();
    split(appPackages, ",").of(names -> {
        names.map(String::trim) //
                .filter(StringUtil::isNotEmpty) //
                .forEach(loggerName -> Configurator.setLevel(loggerName, resolvedLevel));
    });
}
use of org.codelibs.core.lang.StringUtil in project fess by codelibs.
the class PopularWordHelper method getWordList.
/**
 * Returns the popular words for the given criteria, going through the cache.
 *
 * <p>Every {@code null} argument is replaced by its configured default; for roles the default
 * is the non-blank result of the role query helper for {@code searchRequestType}. The word list
 * is looked up in {@code cache} under a key built from the effective criteria and is loaded
 * from the suggester on a miss.</p>
 *
 * @param searchRequestType the request type used to resolve roles when {@code roles} is {@code null}
 * @param seed the seed, or {@code null} for the configured one
 * @param tags the tags, or {@code null} for the configured ones
 * @param roles the roles, or {@code null} to resolve them from the role query helper
 * @param fields the fields, or {@code null} for the configured ones
 * @param excludes the words to exclude, or {@code null} for the configured ones
 * @return the popular words; an empty list when loading through the cache fails
 */
public List<String> getWordList(final SearchRequestType searchRequestType, final String seed, final String[] tags, final String[] roles, final String[] fields, final String[] excludes) {
    final String effectiveSeed = seed == null ? fessConfig.getSuggestPopularWordSeed() : seed;
    final String[] effectiveTags = tags == null ? fessConfig.getSuggestPopularWordTagsAsArray() : tags;
    final String[] effectiveRoles;
    if (roles == null) {
        effectiveRoles = ComponentUtil.getRoleQueryHelper().build(searchRequestType).stream().filter(StringUtil::isNotBlank).toArray(String[]::new);
    } else {
        effectiveRoles = roles;
    }
    final String[] effectiveFields = fields == null ? fessConfig.getSuggestPopularWordFieldsAsArray() : fields;
    final String[] effectiveExcludes = excludes == null ? fessConfig.getSuggestPopularWordExcludesAsArray() : excludes;

    try {
        final String cacheKey = getCacheKey(effectiveSeed, effectiveTags, effectiveRoles, effectiveFields, effectiveExcludes);
        return cache.get(cacheKey, () -> loadPopularWords(effectiveSeed, effectiveTags, effectiveRoles, effectiveFields, effectiveExcludes));
    } catch (final ExecutionException e) {
        logger.warn("Failed to load popular words.", e);
        return Collections.emptyList();
    }
}

/**
 * Queries the suggester for popular words matching the given criteria.
 * A suggester failure is logged and whatever has been collected so far is returned.
 */
private List<String> loadPopularWords(final String seed, final String[] tags, final String[] roles, final String[] fields, final String[] excludes) {
    final List<String> words = new ArrayList<>();
    final SuggestHelper suggestHelper = ComponentUtil.getSuggestHelper();
    final PopularWordsRequestBuilder requestBuilder = suggestHelper.suggester().popularWords() //
            .setSize(fessConfig.getSuggestPopularWordSizeAsInteger()) //
            .setWindowSize(fessConfig.getSuggestPopularWordWindowSizeAsInteger()) //
            .setQueryFreqThreshold(fessConfig.getSuggestPopularWordQueryFreqAsInteger());
    requestBuilder.setSeed(seed);
    stream(tags).of(values -> values.forEach(tag -> requestBuilder.addTag(tag)));
    stream(roles).of(values -> values.forEach(role -> requestBuilder.addRole(role)));
    stream(fields).of(values -> values.forEach(field -> requestBuilder.addField(field)));
    stream(excludes).of(values -> values.forEach(exclude -> requestBuilder.addExcludeWord(exclude)));
    try {
        requestBuilder.execute().getResponse().getItems().stream().forEach(item -> words.add(item.getText()));
    } catch (final SuggesterException e) {
        logger.warn("Failed to generate popular words.", e);
    }
    return words;
}
Aggregations