Use of org.codelibs.fess.es.config.exentity.CrawlingConfig in project fess by codelibs.
In class FessXpathTransformer, method putAdditionalData:
/**
 * Populates {@code dataMap} with the index fields derived from a crawled HTML response.
 *
 * <p>Statement order matters here: {@code crawlingInfoHelper.generateId(dataMap)} is called
 * on the partially filled map, so fields must be put before the id / parent id are generated.
 *
 * @param dataMap map that receives the index field values (mutated in place)
 * @param responseData crawler response (URL, session id, MIME type, body, ...)
 * @param document parsed DOM of the response, used for the XPath lookups
 * @throws ChildUrlsException when a canonical URL is found and it differs from the response URL;
 *         the exception carries a single request for that canonical URL
 */
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
// canonical: only evaluated when a canonical XPath is configured.
// NOTE(review): this fessConfig reference precedes the local declaration below, so it
// must resolve to a member declared outside this method (presumably a field) - confirm.
if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
final String canonicalUrl = getCanonicalUrl(responseData, document);
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) {
// Abort processing of this page and hand the canonical URL back as the only child URL.
final Set<RequestData> childUrlSet = new HashSet<>();
childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
throw new ChildUrlsException(childUrlSet, this.getClass().getName() + "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
}
}
// From here on the local fessConfig (obtained from ComponentUtil) is the one in scope.
final FessConfig fessConfig = ComponentUtil.getFessConfig();
final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
// sessionId is the canonical form; the crawling config below is looked up with the raw session id.
final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
String url = responseData.getUrl();
// The indexing target is resolved from the original URL, before path mapping rewrites it.
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
url = pathMappingHelper.replaceUrl(sessionId, url);
final String mimeType = responseData.getMimeType();
final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);
// URL encoding: prefer the encoding recorded on the current URL queue entry,
// otherwise fall back to the response character set.
String urlEncoding;
final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
if (urlQueue != null && urlQueue.getEncoding() != null) {
urlEncoding = urlQueue.getEncoding();
} else {
urlEncoding = responseData.getCharSet();
}
// cid: config id, only when the crawling config has one
final String configId = crawlingConfig.getConfigId();
if (configId != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
}
// expires: only when an expiry date was computed for this config
if (documentExpires != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
}
// lang: normalized value of the language XPath, skipped when normalization yields null
final String lang = systemHelper.normalizeLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
if (lang != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
}
// title
// NOTE(review): no title field is written in this block; the marker above has no code attached.
// content: prunedContent is declared outside this method
final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(responseData, body, dataMap));
// cache: enabled per config (field parameter equals TRUE) or globally, and only for supported MIME types
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
String charSet = responseData.getCharSet();
if (charSet == null) {
charSet = Constants.UTF_8;
}
try (final BufferedInputStream is = new BufferedInputStream(responseData.getResponseBody())) {
// cache: raw response body decoded with the response charset (UTF-8 fallback)
putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(is), charSet));
putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
} catch (final Exception e) {
// Best effort: a cache failure is logged and the remaining fields are still populated.
logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e);
}
} else {
// NOTE(review): this branch is also reached when the content length is <= 0,
// in which case the "too large" wording is misleading.
logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(), fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
}
}
// digest: value of the digest XPath when non-blank, otherwise derived by documentHelper
final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), false);
if (StringUtil.isNotBlank(digest)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
} else {
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
}
// segment: the canonical session id
putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
// host (from the path-mapped URL)
putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url));
// site
putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
// filename: skipped when blank
final String fileName = getFileName(url, urlEncoding);
if (StringUtil.isNotBlank(fileName)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
}
// url (path-mapped)
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
// created
final Date now = systemHelper.getCurrentTime();
putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
// anchor
putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
// mimetype
putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
if (fileTypeHelper != null) {
// filetype
putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
}
// content_length (stored as a decimal string)
putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
// last_modified
final Date lastModified = responseData.getLastModified();
if (lastModified != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
// timestamp: same as last_modified when the response provides one
putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
} else {
// timestamp: falls back to the creation time captured above
putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
}
// indexingTarget
putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
// boost
putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
// label: labels from the crawling config plus labels matched against the path-mapped URL
final Set<String> labelTypeSet = new HashSet<>();
for (final String labelType : crawlingConfig.getLabelTypeValues()) {
labelTypeSet.add(labelType);
}
labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
// role: permissions of the crawling config, copied into a list
final List<String> roleTypeList = new ArrayList<>();
stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
// id: generated from the fields put into dataMap so far
putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
// parentId: the URL field is temporarily replaced with the parent URL so that
// generateId(dataMap) yields the parent's id, then the real URL is restored.
String parentUrl = responseData.getParentUrl();
if (StringUtil.isNotBlank(parentUrl)) {
parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
// set again
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
}
// thumbnail: skipped when blank
final String thumbnailUrl = getThumbnailUrl(responseData, document);
if (StringUtil.isNotBlank(thumbnailUrl)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldThumbnail(), thumbnailUrl);
}
// from config: user-defined fields. XPATH entries whose key does not start with "default."
// are evaluated against the document; VALUE entries are used as given. Both are passed to
// putResultDataWithTemplate together with the SCRIPT parameter registered under the same key.
final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
xpathConfigMap.entrySet().stream().filter(e -> !e.getKey().startsWith("default.")).forEach(e -> {
final String key = e.getKey();
final String value = getSingleNodeValue(document, e.getValue(), true);
putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key));
});
crawlingConfig.getConfigParameterMap(ConfigName.VALUE).entrySet().stream().forEach(e -> {
final String key = e.getKey();
final String value = e.getValue();
putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key));
});
}
Use of org.codelibs.fess.es.config.exentity.CrawlingConfig in project fess by codelibs.
In class FessCrawlerThread, method processResponse:
/**
 * Delegates to the superclass and then, if the response status code is one of the
 * configured failure status codes, records the URL as a failure URL.
 *
 * @param urlQueue queue entry whose URL was requested
 * @param responseData response received for that entry
 */
@Override
protected void processResponse(final UrlQueue<?> urlQueue, final ResponseData responseData) {
    super.processResponse(urlQueue, responseData);

    final FessConfig config = ComponentUtil.getFessConfig();
    final boolean failureStatus = config.isCrawlerFailureUrlStatusCodes(responseData.getHttpStatusCode());
    if (!failureStatus) {
        // Nothing to record for statuses that are not configured as failures.
        return;
    }

    final String currentSessionId = crawlerContext.getSessionId();
    final CrawlingConfig sessionConfig = ComponentUtil.getCrawlingConfigHelper().get(currentSessionId);
    final String failedUrl = urlQueue.getUrl();
    final FailureUrlService service = ComponentUtil.getComponent(FailureUrlService.class);
    // The failure is stored under the ContentNotFoundException class name.
    final String errorName = ContentNotFoundException.class.getCanonicalName();
    service.store(sessionConfig, errorName, failedUrl, new ContentNotFoundException(failedUrl));
}
Use of org.codelibs.fess.es.config.exentity.CrawlingConfig in project fess by codelibs.
In class CrawlerLogHelper, method storeFailureUrl:
/**
 * Records a failed URL through {@link FailureUrlService}.
 *
 * <p>The queue parameter is declared as {@code UrlQueue<?>} rather than the raw type;
 * only {@code getUrl()} is read from it, so any parameterization is accepted.
 *
 * @param crawlerContext context supplying the session id used to look up the crawling config
 * @param urlQueue queue entry whose URL failed
 * @param errorName name under which the failure is stored
 * @param e the cause of the failure
 */
private void storeFailureUrl(final CrawlerContext crawlerContext, final UrlQueue<?> urlQueue, final String errorName, final Throwable e) {
    final CrawlingConfig crawlingConfig = getCrawlingConfig(crawlerContext.getSessionId());
    final String url = urlQueue.getUrl();
    final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
    failureUrlService.store(crawlingConfig, errorName, url, e);
}
Use of org.codelibs.fess.es.config.exentity.CrawlingConfig in project fess by codelibs.
In class GitBucketDataStoreImpl, method storeData:
/**
 * Crawls every repository returned by the GitBucket server and stores its file contents,
 * issues and wiki pages through the given callback.
 *
 * <p>Returns early (after logging a warning) when the root URL or auth token parameter is
 * empty, or when no repository is returned. A failure while processing one repository is
 * logged and does not stop the remaining repositories from being processed.
 *
 * @param dataConfig data store configuration; wrapped so that HTTP requests carry the auth token
 * @param callback receives the documents to index
 * @param paramMap data store parameters (root URL, token, read interval, ...)
 * @param scriptMap passed through to the store* helpers
 * @param defaultDataMap passed through to the store* helpers
 */
@Override
protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
    final String rootURL = getRootURL(paramMap);
    final String authToken = getAuthToken(paramMap);
    final long readInterval = getReadInterval(paramMap);

    // Non-emptiness Check for URL and Token
    if (rootURL.isEmpty() || authToken.isEmpty()) {
        logger.warn("parameter \"" + TOKEN_PARAM + "\" and \"" + GITBUCKET_URL_PARAM + "\" are required");
        return;
    }

    // Get List of Repositories
    final List<Map<String, Object>> repositoryList = getRepositoryList(rootURL, authToken);
    if (repositoryList.isEmpty()) {
        logger.warn("Token is invalid or no Repository");
        return;
    }

    // Get Labels
    final Map<String, String> pluginInfo = getFessPluginInfo(rootURL, authToken);
    final String sourceLabel = pluginInfo.get("source_label");
    final String issueLabel = pluginInfo.get("issue_label");
    final String wikiLabel = pluginInfo.get("wiki_label");

    // Wrap the config so that the crawler client sends the token and asks for raw content,
    // while keeping any request headers that were already configured.
    final CrawlingConfig crawlingConfig = new CrawlingConfigWrapper(dataConfig) {
        @Override
        public Map<String, Object> initializeClientFactory(final CrawlerClientFactory crawlerClientFactory) {
            // Named differently from the enclosing method's paramMap to avoid shadowing it.
            final Map<String, Object> clientParamMap = super.initializeClientFactory(crawlerClientFactory);
            final List<RequestHeader> headerList = new ArrayList<>();
            final RequestHeader[] headers = (RequestHeader[]) clientParamMap.get(HcHttpClient.REQUERT_HEADERS_PROPERTY);
            if (headers != null) {
                for (final RequestHeader header : headers) {
                    headerList.add(header);
                }
            }
            headerList.add(new RequestHeader("Authorization", "token " + authToken));
            headerList.add(new RequestHeader("Accept", "application/vnd.github.v3.raw"));
            clientParamMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY, headerList.toArray(new RequestHeader[headerList.size()]));
            return clientParamMap;
        }
    };

    // Crawl each repository
    for (final Map<String, Object> repository : repositoryList) {
        try {
            final String owner = (String) repository.get("owner");
            final String name = (String) repository.get("name");
            final String refStr = getGitRef(rootURL, authToken, owner, name, "master");
            final int issueCount = (int) repository.get("issue_count");
            final int pullCount = (int) repository.get("pull_count");
            final List<String> roleList = createRoleList(owner, repository);

            logger.info("Crawl " + owner + "/" + name);
            // crawl and store file contents recursively
            crawlFileContents(rootURL, authToken, owner, name, refStr, StringUtil.EMPTY, 0, readInterval, path -> {
                storeFileContent(rootURL, authToken, sourceLabel, owner, name, refStr, roleList, path, crawlingConfig, callback, paramMap, scriptMap, defaultDataMap);
                if (readInterval > 0) {
                    sleep(readInterval);
                }
            });

            logger.info("Crawl issues in " + owner + "/" + name);
            // store issues: ids run from 1 to issueCount + pullCount inclusive
            for (int issueId = 1; issueId <= issueCount + pullCount; issueId++) {
                // Integer.valueOf replaces the deprecated Integer(int) constructor.
                storeIssueById(rootURL, authToken, issueLabel, owner, name, Integer.valueOf(issueId), roleList, crawlingConfig, callback, paramMap, scriptMap, defaultDataMap);
                if (readInterval > 0) {
                    sleep(readInterval);
                }
            }

            logger.info("Crawl Wiki in " + owner + "/" + name);
            // crawl Wiki
            storeWikiContents(rootURL, authToken, wikiLabel, owner, name, roleList, crawlingConfig, callback, paramMap, scriptMap, defaultDataMap, readInterval);
        } catch (final Exception e) {
            // Best effort per repository: log and continue with the next one.
            logger.warn("Failed to access to " + repository, e);
        }
    }
}
Use of org.codelibs.fess.es.config.exentity.CrawlingConfig in project fess by codelibs.
In class FessCrawlerThread, method isContentUpdated:
/**
 * Decides whether the content behind {@code urlQueue} should be treated as updated.
 *
 * <p>When incremental crawling is disabled this always returns {@code true}. Otherwise the
 * indexed document for the URL is looked up and compared with a HEAD response.
 * {@code false} is returned only when the resource answers 404 (the indexed document is then
 * deleted) or when it answers 200 with a last-modified time that is not newer than the
 * indexed one. Every other path returns {@code true}.
 * NOTE(review): presumably {@code true} makes the caller fetch and index the content - confirm
 * against the superclass contract.
 *
 * @param client crawler client used to issue HEAD requests
 * @param urlQueue queue entry being checked; its last-modified value may be set here
 * @return {@code false} for the 404 and not-modified cases described above, {@code true} otherwise
 */
@Override
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
final long startTime = System.currentTimeMillis();
final FessConfig fessConfig = ComponentUtil.getFessConfig();
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
final SambaHelper sambaHelper = ComponentUtil.getSambaHelper();
final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
final FessEsClient fessEsClient = ComponentUtil.getFessEsClient();
final String url = urlQueue.getUrl();
// Shared across the SMB branch and the later HEAD check; closed in the finally block.
ResponseData responseData = null;
try {
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
// The document id is generated from a map holding the URL and the role list.
final Map<String, Object> dataMap = new HashMap<>();
dataMap.put(fessConfig.getIndexFieldUrl(), url);
final List<String> roleTypeList = new ArrayList<>();
stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
if (url.startsWith("smb://")) {
if (url.endsWith("/")) {
// directory
return true;
}
if (fessConfig.isSmbRoleFromFile()) {
// head method: the response metadata may carry access control entries,
// whose account ids are appended to the role list
responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
if (responseData == null) {
return true;
}
final ACE[] aces = (ACE[]) responseData.getMetaDataMap().get(SmbClient.SMB_ACCESS_CONTROL_ENTRIES);
if (aces != null) {
for (final ACE item : aces) {
final SID sid = item.getSID();
final String accountId = sambaHelper.getAccountId(sid);
if (accountId != null) {
roleTypeList.add(accountId);
}
}
if (logger.isDebugEnabled()) {
logger.debug("smbUrl:" + responseData.getUrl() + " roleType:" + roleTypeList.toString());
}
}
}
}
dataMap.put(fessConfig.getIndexFieldRole(), roleTypeList);
final String id = crawlingInfoHelper.generateId(dataMap);
if (logger.isDebugEnabled()) {
logger.debug("Searching indexed document: " + id);
}
final Map<String, Object> document = indexingHelper.getDocument(fessEsClient, id, new String[] { fessConfig.getIndexFieldId(), fessConfig.getIndexFieldLastModified(), fessConfig.getIndexFieldAnchor(), fessConfig.getIndexFieldSegment(), fessConfig.getIndexFieldExpires(), fessConfig.getIndexFieldClickCount(), fessConfig.getIndexFieldFavoriteCount() });
if (document == null) {
// No document found under this id: queue the URLs returned by getChildUrlSet and report updated.
storeChildUrlsToQueue(urlQueue, getChildUrlSet(fessEsClient, id));
return true;
}
final Date expires = DocumentUtil.getValue(document, fessConfig.getIndexFieldExpires(), Date.class);
if (expires != null && expires.getTime() < System.currentTimeMillis()) {
// Expired document: try to delete it (a failed delete is only logged) and report updated.
final Object idValue = document.get(fessConfig.getIndexFieldId());
if (idValue != null && !indexingHelper.deleteDocument(fessEsClient, idValue.toString())) {
logger.debug("Failed to delete expired document: " + url);
}
return true;
}
final Date lastModified = DocumentUtil.getValue(document, fessConfig.getIndexFieldLastModified(), Date.class);
if (lastModified == null) {
// Without an indexed last-modified value there is nothing to compare against.
return true;
}
urlQueue.setLastModified(lastModified.getTime());
log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);
if (responseData == null) {
// head method (skipped when the SMB branch above already issued one)
responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
if (responseData == null) {
return true;
}
}
final int httpStatusCode = responseData.getHttpStatusCode();
if (logger.isDebugEnabled()) {
logger.debug("Accessing document: " + url + ", status: " + httpStatusCode);
}
if (httpStatusCode == 404) {
// Resource is gone: queue the anchors stored in the indexed document,
// then delete the indexed document.
storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
if (!indexingHelper.deleteDocument(fessEsClient, id)) {
logger.debug("Failed to delete 404 document: " + url);
}
return false;
} else if (responseData.getLastModified() == null) {
return true;
} else if (responseData.getLastModified().getTime() <= lastModified.getTime() && httpStatusCode == 200) {
// Not modified: mark the HEAD response with the not-modified status, pass it to
// processResponse, queue the stored anchors and update the expires field.
log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
responseData.setExecutionTime(System.currentTimeMillis() - startTime);
responseData.setParentUrl(urlQueue.getParentUrl());
responseData.setSessionId(crawlerContext.getSessionId());
responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
processResponse(urlQueue, responseData);
storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
if (documentExpires != null && !indexingHelper.updateDocument(fessEsClient, id, fessConfig.getIndexFieldExpires(), documentExpires)) {
logger.debug("Failed to update " + fessConfig.getIndexFieldExpires() + " at " + url);
}
return false;
}
// Any other combination (e.g. a newer last-modified time, or a status other than 200/404)
// falls through to the final "return true".
} finally {
if (responseData != null) {
IOUtils.closeQuietly(responseData);
}
}
}
return true;
}
Aggregations