use of org.codelibs.fess.helper.CrawlingInfoHelper in project fess by codelibs.
the class FessXpathTransformer method putAdditionalData.
/**
 * Fills {@code dataMap} with the index fields derived from a crawled HTML response.
 *
 * <p>The method first checks for a canonical URL: when one is found and it differs from the
 * response URL, processing is aborted with a {@link ChildUrlsException} that carries the canonical
 * URL as the only child. Otherwise the fields are written via {@code putResultDataBody} in a fixed
 * order; the order matters because {@code crawlingInfoHelper.generateId(dataMap)} is invoked on the
 * partially filled map.</p>
 *
 * @param dataMap      the map receiving the index field values (mutated in place)
 * @param responseData the crawler response being transformed
 * @param document     the parsed DOM of the response, queried through the configured XPaths
 * @throws ChildUrlsException when a canonical URL different from the response URL is found
 */
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
// canonical: redirect the crawler to the canonical URL instead of indexing this page.
// NOTE: the local variable "fessConfig" is only declared further below, so this reference
// resolves to a member declared outside this method (not visible in this excerpt).
if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
final String canonicalUrl = getCanonicalUrl(responseData, document);
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) {
final Set<RequestData> childUrlSet = new HashSet<>();
childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
throw new ChildUrlsException(childUrlSet, this.getClass().getName() + "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
}
}
// From here on "fessConfig" is this local, obtained from ComponentUtil.
final FessConfig fessConfig = ComponentUtil.getFessConfig();
final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
// The crawling config is looked up with the raw session id of the response, not the canonical one.
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
String url = responseData.getUrl();
// The indexing target is resolved from the original URL, before path mapping rewrites it.
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
url = pathMappingHelper.replaceUrl(sessionId, url);
final String mimeType = responseData.getMimeType();
final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);
// URL encoding: prefer the encoding recorded on the current URL queue entry, otherwise
// fall back to the charset of the response.
String urlEncoding;
final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
if (urlQueue != null && urlQueue.getEncoding() != null) {
urlEncoding = urlQueue.getEncoding();
} else {
urlEncoding = responseData.getCharSet();
}
// cid: written only when the crawling config has an id
final String configId = crawlingConfig.getConfigId();
if (configId != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
}
// expires: written only when an expiry date was computed for this config
if (documentExpires != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
}
// lang: value of the language XPath, normalized by the system helper; skipped when null
final String lang = systemHelper.normalizeLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
if (lang != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
}
// title: no title field is written in this method
// content: text selected by the content XPath, post-processed by the document helper
final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(responseData, body, dataMap));
// cache: enabled either per config (field config value equals TRUE, case-insensitively) or
// globally, and only for mime types the configuration accepts for caching.
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
// Only bodies with a positive length up to the configured maximum are cached.
if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
String charSet = responseData.getCharSet();
if (charSet == null) {
charSet = Constants.UTF_8;
}
try (final BufferedInputStream is = new BufferedInputStream(responseData.getResponseBody())) {
// cache: the raw response body decoded with the response charset (UTF-8 when unknown)
putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(is), charSet));
putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
} catch (final Exception e) {
// Best effort: a failure to build the cache is logged and the document is still indexed.
logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e);
}
} else {
// NOTE(review): this branch is also reached when the content length is not positive,
// in which case the "too large" wording of the message does not describe the cause.
logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(), fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
}
}
// digest: value of the digest XPath when not blank, otherwise generated by the document helper
final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), false);
if (StringUtil.isNotBlank(digest)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
} else {
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
}
// segment: the canonical session id
putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
// host: derived from the path-mapped URL
putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url));
// site: derived from the path-mapped URL and the URL encoding chosen above
putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
// filename: written only when a non-blank file name can be derived from the URL
final String fileName = getFileName(url, urlEncoding);
if (StringUtil.isNotBlank(fileName)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
}
// url: the path-mapped URL
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
// created: current time; the same value is reused below as the timestamp fallback
final Date now = systemHelper.getCurrentTime();
putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
// anchor
putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
// mimetype
putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
if (fileTypeHelper != null) {
// filetype: looked up from the mime type
putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
}
// content_length: stored as a decimal string
putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
// last_modified: written only when the response provides it
final Date lastModified = responseData.getLastModified();
if (lastModified != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
// timestamp: the last-modified date when available
putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
} else {
// timestamp: falls back to the creation time
putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
}
// indexingTarget: stored under a constant key rather than a configurable field name
putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
// boost
putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
// label: labels from the crawling config plus the labels matched against the path-mapped URL
final Set<String> labelTypeSet = new HashSet<>();
for (final String labelType : crawlingConfig.getLabelTypeValues()) {
labelTypeSet.add(labelType);
}
labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
// role: the permissions of the crawling config, copied into a list
final List<String> roleTypeList = new ArrayList<>();
stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
// id: generated from the data map as filled so far
putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
// parentId: the url field is temporarily replaced by the (path-mapped) parent URL so that
// generateId(dataMap) yields the parent's id — presumably the id is derived from the url
// field; verify against CrawlingInfoHelper.
String parentUrl = responseData.getParentUrl();
if (StringUtil.isNotBlank(parentUrl)) {
parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
// restore the document's own URL after the temporary swap
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
}
// thumbnail: written only when a non-blank thumbnail URL is found
final String thumbnailUrl = getThumbnailUrl(responseData, document);
if (StringUtil.isNotBlank(thumbnailUrl)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldThumbnail(), thumbnailUrl);
}
// from config: user-defined fields. XPath entries whose key starts with "default." are
// skipped here; every other entry is evaluated against the document and stored together
// with the script registered under the same key (if any).
final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
xpathConfigMap.entrySet().stream().filter(e -> !e.getKey().startsWith("default.")).forEach(e -> {
final String key = e.getKey();
final String value = getSingleNodeValue(document, e.getValue(), true);
putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key));
});
// Fixed values from the VALUE config section are stored the same way, without XPath evaluation.
crawlingConfig.getConfigParameterMap(ConfigName.VALUE).entrySet().stream().forEach(e -> {
final String key = e.getKey();
final String value = e.getValue();
putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key));
});
}
use of org.codelibs.fess.helper.CrawlingInfoHelper in project fess by codelibs.
the class AbstractDataStoreImpl method store.
/**
 * Prepares the default field values for a data store crawl and delegates to {@code storeData}.
 *
 * <p>The handler parameters of {@code config} are merged into {@code initParamMap} in place, and
 * that same map instance is what {@code storeData} receives as its parameter map.</p>
 *
 * @param config       the data store configuration being crawled
 * @param callback     the callback that receives the produced documents
 * @param initParamMap the initial parameters; mutated by this method
 */
@Override
public void store(final DataConfig config, final IndexUpdateCallback callback, final Map<String, String> initParamMap) {
    final Map<String, String> handlerParams = config.getHandlerParameterMap();
    final Map<String, String> handlerScripts = config.getHandlerScriptMap();
    final CrawlingInfoHelper infoHelper = ComponentUtil.getCrawlingInfoHelper();
    final SystemHelper sysHelper = ComponentUtil.getSystemHelper();
    final Date expiresAt = infoHelper.getDocumentExpires(config);
    final FessConfig fessConfig = ComponentUtil.getFessConfig();

    // The caller's map is extended in place and then used as the parameter map.
    initParamMap.putAll(handlerParams);

    // Values every document produced by this data store starts with.
    final Map<String, Object> defaults = new HashMap<>();

    // cid (only when the config has one)
    final String cid = config.getConfigId();
    if (cid != null) {
        defaults.put(fessConfig.getIndexFieldConfigId(), cid);
    }

    // expires (only when an expiry date was computed)
    if (expiresAt != null) {
        defaults.put(fessConfig.getIndexFieldExpires(), expiresAt);
    }

    // segment: the session id taken from the merged parameters
    defaults.put(fessConfig.getIndexFieldSegment(), initParamMap.get(Constants.SESSION_ID));

    // created
    defaults.put(fessConfig.getIndexFieldCreated(), sysHelper.getCurrentTime());

    // boost, stored in its string form
    defaults.put(fessConfig.getIndexFieldBoost(), config.getBoost().toString());

    // label
    final List<String> labels = new ArrayList<>();
    for (final String label : config.getLabelTypeValues()) {
        labels.add(label);
    }
    defaults.put(fessConfig.getIndexFieldLabel(), labels);

    // role
    final List<String> roles = new ArrayList<>();
    stream(config.getPermissions()).of(stream -> stream.forEach(roles::add));
    defaults.put(fessConfig.getIndexFieldRole(), roles);

    // mimetype
    defaults.put(fessConfig.getIndexFieldMimetype(), mimeType);

    // title, content, cache, digest, host, site, url, anchor, content_length, last_modified
    // and id get no default value here.
    storeData(config, callback, initParamMap, handlerScripts, defaults);
}
use of org.codelibs.fess.helper.CrawlingInfoHelper in project fess by codelibs.
the class Crawler method doCrawl.
/**
 * Runs one crawl session: registers the path mappings, removes expired sessions, starts the
 * web/file-system crawler and the data crawler on their own threads as requested by
 * {@code options}, and waits for both to finish.
 *
 * <p>Start/end times, the completion status and the total execution time are recorded through the
 * crawling info helper; the status, end time and execution time are written in the
 * {@code finally} block, so they are recorded on failure as well.</p>
 *
 * @param options the crawl options (session id, name and the config id lists)
 * @return {@code Constants.EXIT_OK} on success, {@code Constants.EXIT_FAIL} when anything was thrown
 */
public int doCrawl(final Options options) {
    if (logger.isInfoEnabled()) {
        logger.info("Starting Crawler..");
    }

    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final long beganAt = System.currentTimeMillis();
    final CrawlingInfoHelper infoHelper = ComponentUtil.getCrawlingInfoHelper();

    boolean succeeded = false;
    try {
        writeTimeToSessionInfo(infoHelper, Constants.CRAWLER_START_TIME);

        // setup path mapping
        final List<String> processTypes = new ArrayList<>();
        processTypes.add(Constants.PROCESS_TYPE_CRAWLING);
        processTypes.add(Constants.PROCESS_TYPE_BOTH);
        pathMappingHelper.setPathMappingList(options.sessionId, pathMappingService.getPathMappingList(processTypes));

        // duplicate host: a failed initialization is logged and does not stop the crawl
        try {
            ComponentUtil.getDuplicateHostHelper().init();
        } catch (final Exception e) {
            logger.warn("Could not initialize duplicateHostHelper.", e);
        }

        // delete expired sessions
        crawlingInfoService.deleteSessionIdsBefore(options.sessionId, options.name, ComponentUtil.getSystemHelper().getCurrentTimeAsLong());

        final List<String> webIds = options.getWebConfigIdList();
        final List<String> fileIds = options.getFileConfigIdList();
        final List<String> dataIds = options.getDataConfigIdList();

        // With no id list given at all, every crawler runs.
        final boolean runAll = webIds == null && fileIds == null && dataIds == null;
        final boolean runWebFs = runAll || webIds != null || fileIds != null;
        final boolean runData = runAll || dataIds != null;

        Thread webFsThread = null;
        if (runWebFs) {
            final Runnable webFsTask = () -> {
                writeTimeToSessionInfo(infoHelper, Constants.WEB_FS_CRAWLER_START_TIME);
                webFsIndexHelper.crawl(options.sessionId, webIds, fileIds);
                writeTimeToSessionInfo(infoHelper, Constants.WEB_FS_CRAWLER_END_TIME);
            };
            webFsThread = new Thread(webFsTask, WEB_FS_CRAWLING_PROCESS);
            webFsThread.start();
        }

        Thread dataThread = null;
        if (runData) {
            final Runnable dataTask = () -> {
                writeTimeToSessionInfo(infoHelper, Constants.DATA_CRAWLER_START_TIME);
                dataIndexHelper.crawl(options.sessionId, dataIds);
                writeTimeToSessionInfo(infoHelper, Constants.DATA_CRAWLER_END_TIME);
            };
            dataThread = new Thread(dataTask, DATA_CRAWLING_PROCESS);
            dataThread.start();
        }

        // Either thread may be null here when the corresponding crawler was not requested.
        joinCrawlerThread(webFsThread);
        joinCrawlerThread(dataThread);

        if (logger.isInfoEnabled()) {
            logger.info("Finished Crawler");
        }
        succeeded = true;
        return Constants.EXIT_OK;
    } catch (final Throwable t) {
        logger.warn("An exception occurs on the crawl task.", t);
        return Constants.EXIT_FAIL;
    } finally {
        pathMappingHelper.removePathMappingList(options.sessionId);

        final String status;
        if (succeeded) {
            status = Constants.T.toString();
        } else {
            status = Constants.F.toString();
        }
        infoHelper.putToInfoMap(Constants.CRAWLER_STATUS, status);

        writeTimeToSessionInfo(infoHelper, Constants.CRAWLER_END_TIME);
        infoHelper.putToInfoMap(Constants.CRAWLER_EXEC_TIME, Long.toString(System.currentTimeMillis() - beganAt));
    }
}
use of org.codelibs.fess.helper.CrawlingInfoHelper in project fess by codelibs.
the class IndexUpdateCallbackImpl method store.
/**
 * Adds one document to the pending document list and sends the list to the index once a
 * threshold is reached.
 *
 * <p>The document must contain a url field. Its id is (re)generated from the data map, click and
 * favorite counts are added when enabled in the configuration, and a doc id is generated when the
 * map does not already contain one. When the document carries a content length, the list is sent
 * once the accumulated content size reaches {@code maxDocumentRequestSize}; otherwise it is sent
 * once the number of pending documents reaches the configured cache size.</p>
 *
 * @param paramMap the data store parameters (not read by this method)
 * @param dataMap  the document fields; mutated by this method
 * @throws DataStoreException when the url field is missing
 * @see org.codelibs.fess.ds.impl.IndexUpdateCallback#store(java.util.Map)
 */
@Override
public void store(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
    final long beganAt = System.currentTimeMillis();
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final FessEsClient esClient = ComponentUtil.getFessEsClient();

    if (logger.isDebugEnabled()) {
        logger.debug("Adding " + dataMap);
    }

    // required check
    if (dataMap.get(fessConfig.getIndexFieldUrl()) == null) {
        throw new DataStoreException("url is null. dataMap=" + dataMap);
    }

    final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
    final CrawlingInfoHelper infoHelper = ComponentUtil.getCrawlingInfoHelper();

    final String generatedId = infoHelper.generateId(dataMap);
    dataMap.put(fessConfig.getIndexFieldId(), generatedId);

    // The url is read again after the id has been generated.
    final String url = dataMap.get(fessConfig.getIndexFieldUrl()).toString();

    if (fessConfig.getIndexerClickCountEnabledAsBoolean()) {
        addClickCountField(dataMap, url, fessConfig.getIndexFieldClickCount());
    }
    if (fessConfig.getIndexerFavoriteCountEnabledAsBoolean()) {
        addFavoriteCountField(dataMap, url, fessConfig.getIndexFieldFavoriteCount());
    }

    if (!dataMap.containsKey(fessConfig.getIndexFieldDocId())) {
        final SystemHelper sysHelper = ComponentUtil.getSystemHelper();
        dataMap.put(fessConfig.getIndexFieldDocId(), sysHelper.generateDocId(dataMap));
    }

    synchronized (docList) {
        docList.add(dataMap);
        if (logger.isDebugEnabled()) {
            logger.debug("Added the document. " + "The number of a document cache is " + docList.size() + ".");
        }

        // Decide whether the pending documents have to be sent now.
        final Long length = DocumentUtil.getValue(dataMap, fessConfig.getIndexFieldContentLength(), Long.class);
        final boolean flush;
        if (length != null) {
            docList.addContentSize(length.longValue());
            flush = docList.getContentSize() >= maxDocumentRequestSize;
        } else {
            flush = docList.size() >= fessConfig.getIndexerDataMaxDocumentCacheSizeAsInteger().intValue();
        }
        if (flush) {
            indexingHelper.sendDocuments(esClient, docList);
        }

        executeTime += System.currentTimeMillis() - beganAt;
    }

    documentSize.getAndIncrement();
    if (logger.isDebugEnabled()) {
        logger.debug("The number of an added document is " + documentSize.get() + ".");
    }
}
use of org.codelibs.fess.helper.CrawlingInfoHelper in project fess by codelibs.
the class FessCrawlerThread method isContentUpdated.
/**
 * Decides whether the queued URL has to be crawled again.
 *
 * <p>Without incremental crawling the answer is always {@code true}. Otherwise the already
 * indexed document is looked up by the id generated from the URL and its roles, and compared with
 * a HEAD response:</p>
 * <ul>
 * <li>smb directory URLs (ending with "/"), documents that are not indexed, expired documents,
 * documents without a stored last-modified date, and missing HEAD responses all yield
 * {@code true};</li>
 * <li>a 404 response deletes the indexed document, queues its anchors and yields
 * {@code false};</li>
 * <li>a 200 response that is not newer than the indexed document is processed as "not modified",
 * its anchors are queued, the expiry date is refreshed, and {@code false} is returned;</li>
 * <li>everything else yields {@code true}.</li>
 * </ul>
 *
 * <p>Any response obtained here is closed before the method returns.</p>
 *
 * @param client   the client used to issue HEAD requests
 * @param urlQueue the queue entry being examined; its last-modified value may be updated
 * @return {@code true} when the content must be crawled, {@code false} when it can be skipped
 */
@Override
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
    if (!ComponentUtil.getFessConfig().isIncrementalCrawling()) {
        return true;
    }

    final long beganAt = System.currentTimeMillis();
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingConfigHelper configHelper = ComponentUtil.getCrawlingConfigHelper();
    final CrawlingInfoHelper infoHelper = ComponentUtil.getCrawlingInfoHelper();
    final SambaHelper sambaHelper = ComponentUtil.getSambaHelper();
    final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
    final FessEsClient esClient = ComponentUtil.getFessEsClient();
    final String url = urlQueue.getUrl();

    ResponseData responseData = null;
    try {
        final CrawlingConfig crawlingConfig = configHelper.get(crawlerContext.getSessionId());

        // The id of the indexed document is generated from the url and role fields.
        final Map<String, Object> idSource = new HashMap<>();
        idSource.put(fessConfig.getIndexFieldUrl(), url);
        final List<String> roles = new ArrayList<>();
        stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(roles::add));

        if (url.startsWith("smb://")) {
            if (url.endsWith("/")) {
                // directory
                return true;
            }
            if (fessConfig.isSmbRoleFromFile()) {
                // head method; the response is reused further below
                responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                if (responseData == null) {
                    return true;
                }
                final ACE[] aces = (ACE[]) responseData.getMetaDataMap().get(SmbClient.SMB_ACCESS_CONTROL_ENTRIES);
                if (aces != null) {
                    for (final ACE ace : aces) {
                        final SID sid = ace.getSID();
                        final String accountId = sambaHelper.getAccountId(sid);
                        if (accountId != null) {
                            roles.add(accountId);
                        }
                    }
                    if (logger.isDebugEnabled()) {
                        logger.debug("smbUrl:" + responseData.getUrl() + " roleType:" + roles.toString());
                    }
                }
            }
        }
        idSource.put(fessConfig.getIndexFieldRole(), roles);

        final String id = infoHelper.generateId(idSource);
        if (logger.isDebugEnabled()) {
            logger.debug("Searching indexed document: " + id);
        }

        final String[] fields = new String[] { fessConfig.getIndexFieldId(), fessConfig.getIndexFieldLastModified(),
                fessConfig.getIndexFieldAnchor(), fessConfig.getIndexFieldSegment(), fessConfig.getIndexFieldExpires(),
                fessConfig.getIndexFieldClickCount(), fessConfig.getIndexFieldFavoriteCount() };
        final Map<String, Object> indexed = indexingHelper.getDocument(esClient, id, fields);
        if (indexed == null) {
            // Not indexed yet: queue the known child urls and crawl.
            storeChildUrlsToQueue(urlQueue, getChildUrlSet(esClient, id));
            return true;
        }

        final Date expires = DocumentUtil.getValue(indexed, fessConfig.getIndexFieldExpires(), Date.class);
        if (expires != null && expires.getTime() < System.currentTimeMillis()) {
            // Expired: remove the stale document and crawl again.
            final Object indexedId = indexed.get(fessConfig.getIndexFieldId());
            if (indexedId != null && !indexingHelper.deleteDocument(esClient, indexedId.toString())) {
                logger.debug("Failed to delete expired document: " + url);
            }
            return true;
        }

        final Date lastModified = DocumentUtil.getValue(indexed, fessConfig.getIndexFieldLastModified(), Date.class);
        if (lastModified == null) {
            return true;
        }
        urlQueue.setLastModified(lastModified.getTime());
        log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);

        if (responseData == null) {
            // head method
            responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
            if (responseData == null) {
                return true;
            }
        }

        final int httpStatusCode = responseData.getHttpStatusCode();
        if (logger.isDebugEnabled()) {
            logger.debug("Accessing document: " + url + ", status: " + httpStatusCode);
        }

        if (httpStatusCode == 404) {
            storeChildUrlsToQueue(urlQueue, getAnchorSet(indexed.get(fessConfig.getIndexFieldAnchor())));
            if (!indexingHelper.deleteDocument(esClient, id)) {
                logger.debug("Failed to delete 404 document: " + url);
            }
            return false;
        }
        if (responseData.getLastModified() == null) {
            return true;
        }
        if (responseData.getLastModified().getTime() > lastModified.getTime() || httpStatusCode != 200) {
            return true;
        }

        // Not modified: report it, keep following the stored anchors and refresh the expiry date.
        log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
        responseData.setExecutionTime(System.currentTimeMillis() - beganAt);
        responseData.setParentUrl(urlQueue.getParentUrl());
        responseData.setSessionId(crawlerContext.getSessionId());
        responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
        processResponse(urlQueue, responseData);
        storeChildUrlsToQueue(urlQueue, getAnchorSet(indexed.get(fessConfig.getIndexFieldAnchor())));
        final Date documentExpires = infoHelper.getDocumentExpires(crawlingConfig);
        if (documentExpires != null
                && !indexingHelper.updateDocument(esClient, id, fessConfig.getIndexFieldExpires(), documentExpires)) {
            logger.debug("Failed to update " + fessConfig.getIndexFieldExpires() + " at " + url);
        }
        return false;
    } finally {
        if (responseData != null) {
            IOUtils.closeQuietly(responseData);
        }
    }
}
Aggregations