use of org.codelibs.fess.helper.IndexingHelper in project fess by codelibs.
the class FessCrawlerThread method getChildUrlSet.
/**
 * Fetches the child documents registered for the given id and turns each non-blank URL
 * into a GET request.
 *
 * @param searchEngineClient client handed to the indexing helper for the lookup
 * @param id identifier whose child documents are requested
 * @return the GET requests built from the child documents' URLs, or {@code null} when the
 *         lookup yields no document at all
 */
protected Set<RequestData> getChildUrlSet(final SearchEngineClient searchEngineClient, final String id) {
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
    final String urlField = fessConfig.getIndexFieldUrl();

    // Only the URL field is requested from the index.
    final List<Map<String, Object>> children = indexingHelper.getChildDocumentList(searchEngineClient, id, new String[] { urlField });
    if (children.isEmpty()) {
        // An empty lookup is reported as null rather than as an empty set.
        return null;
    }

    if (logger.isDebugEnabled()) {
        logger.debug("Found documents: {}", children);
    }

    final Set<RequestData> requests = new HashSet<>(children.size());
    for (final Map<String, Object> child : children) {
        final String childUrl = DocumentUtil.getValue(child, urlField, String.class);
        if (!StringUtil.isNotBlank(childUrl)) {
            // Documents without a usable URL contribute nothing.
            continue;
        }
        requests.add(RequestDataBuilder.newRequestData().get().url(childUrl).build());
    }
    return requests;
}
use of org.codelibs.fess.helper.IndexingHelper in project fess by codelibs.
the class IndexUpdateCallbackImpl method store.
/**
 * Completes the given document (id, optional click/favorite counts, doc id), appends it to
 * the shared document cache and sends the cache to the search engine when a threshold is hit.
 * <p>
 * If the document carries a content length, the accumulated content size is compared with
 * {@code maxDocumentRequestSize}; otherwise the number of cached documents is compared with the
 * configured maximum cache size.
 *
 * @param paramMap data store parameters (not read by this method)
 * @param dataMap document fields; must contain a non-null URL value and is modified in place
 * @throws DataStoreException if {@code dataMap} has no URL value
 * @see org.codelibs.fess.ds.impl.IndexUpdateCallback#store(java.util.Map)
 */
@Override
public void store(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
    final long begin = System.currentTimeMillis();
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final FessEsClient fessEsClient = ComponentUtil.getFessEsClient();

    if (logger.isDebugEnabled()) {
        logger.debug("Adding " + dataMap);
    }

    // required check
    if (dataMap.get(fessConfig.getIndexFieldUrl()) == null) {
        throw new DataStoreException("url is null. dataMap=" + dataMap);
    }

    final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();

    final String idField = fessConfig.getIndexFieldId();
    dataMap.put(idField, crawlingInfoHelper.generateId(dataMap));

    final String url = dataMap.get(fessConfig.getIndexFieldUrl()).toString();

    if (fessConfig.getIndexerClickCountEnabledAsBoolean()) {
        addClickCountField(dataMap, url, fessConfig.getIndexFieldClickCount());
    }

    if (fessConfig.getIndexerFavoriteCountEnabledAsBoolean()) {
        addFavoriteCountField(dataMap, url, fessConfig.getIndexFieldFavoriteCount());
    }

    // A doc id already present in the map is kept; otherwise one is generated.
    if (!dataMap.containsKey(fessConfig.getIndexFieldDocId())) {
        final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
        dataMap.put(fessConfig.getIndexFieldDocId(), systemHelper.generateDocId(dataMap));
    }

    synchronized (docList) {
        docList.add(dataMap);
        if (logger.isDebugEnabled()) {
            logger.debug("Added the document. " + "The number of a document cache is " + docList.size() + ".");
        }

        final Long contentLength = DocumentUtil.getValue(dataMap, fessConfig.getIndexFieldContentLength(), Long.class);
        final boolean thresholdReached;
        if (contentLength == null) {
            // No content length available: fall back to counting cached documents.
            thresholdReached = docList.size() >= fessConfig.getIndexerDataMaxDocumentCacheSizeAsInteger().intValue();
        } else {
            docList.addContentSize(contentLength.longValue());
            thresholdReached = docList.getContentSize() >= maxDocumentRequestSize;
        }
        if (thresholdReached) {
            indexingHelper.sendDocuments(fessEsClient, docList);
        }

        executeTime += System.currentTimeMillis() - begin;
    }

    documentSize.getAndIncrement();

    if (logger.isDebugEnabled()) {
        logger.debug("The number of an added document is " + documentSize.get() + ".");
    }
}
use of org.codelibs.fess.helper.IndexingHelper in project fess by codelibs.
the class FessCrawlerThread method isContentUpdated.
/**
 * Decides whether the URL held by {@code urlQueue} has to be crawled again.
 * <p>
 * When incremental crawling is disabled, this method returns {@code true} without any lookup.
 * Otherwise the indexed document is looked up by an id generated from the URL and the role list,
 * and its expires / last-modified values are compared against a HEAD response for the URL.
 * <p>
 * {@code false} is returned in two cases only: the HEAD status is 404 (deletion of the indexed
 * document is attempted), or the status is 200 and the response's last-modified time is not later
 * than the indexed one (the response is processed with the NOT_MODIFIED status). In both cases the
 * anchor set read from the indexed document is passed to {@code storeChildUrlsToQueue}.
 *
 * @param client crawler client used to send the HEAD request
 * @param urlQueue queue entry providing the URL; its last-modified value is set from the indexed document
 * @return {@code true} if the content should be crawled, {@code false} if it was fully handled here
 */
@Override
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
// startTime is only used for the execution time reported in the NOT_MODIFIED branch.
final long startTime = System.currentTimeMillis();
final FessConfig fessConfig = ComponentUtil.getFessConfig();
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
final String url = urlQueue.getUrl();
// Shared by the role lookup and the last-modified check; closed in the finally block.
ResponseData responseData = null;
try {
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
final Map<String, Object> dataMap = new HashMap<>();
dataMap.put(fessConfig.getIndexFieldUrl(), url);
// Start the role list from the permissions of the crawling config for this session.
final List<String> roleTypeList = new ArrayList<>();
stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
if (url.startsWith("smb:") || url.startsWith("smb1:") || url.startsWith("file:") || url.startsWith("ftp:")) {
if (url.endsWith("/")) {
// directory
return true;
}
final PermissionHelper permissionHelper = ComponentUtil.getPermissionHelper();
if (fessConfig.isSmbRoleFromFile() || fessConfig.isFileRoleFromFile() || fessConfig.isFtpRoleFromFile()) {
// head method
responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
if (responseData == null) {
return true;
}
// Add the roles the permission helper derives from the HEAD response.
roleTypeList.addAll(permissionHelper.getSmbRoleTypeList(responseData));
roleTypeList.addAll(permissionHelper.getFileRoleTypeList(responseData));
roleTypeList.addAll(permissionHelper.getFtpRoleTypeList(responseData));
}
}
// The id is generated from a map holding both the URL and the role list.
dataMap.put(fessConfig.getIndexFieldRole(), roleTypeList);
final String id = crawlingInfoHelper.generateId(dataMap);
if (logger.isDebugEnabled()) {
logger.debug("Searching indexed document: {}", id);
}
final Map<String, Object> document = indexingHelper.getDocument(searchEngineClient, id, new String[] { fessConfig.getIndexFieldId(), fessConfig.getIndexFieldLastModified(), fessConfig.getIndexFieldAnchor(), fessConfig.getIndexFieldSegment(), fessConfig.getIndexFieldExpires(), fessConfig.getIndexFieldClickCount(), fessConfig.getIndexFieldFavoriteCount() });
if (document == null) {
// Nothing indexed under this id: hand over the child URLs found for the id and crawl.
storeChildUrlsToQueue(urlQueue, getChildUrlSet(searchEngineClient, id));
return true;
}
final Date expires = DocumentUtil.getValue(document, fessConfig.getIndexFieldExpires(), Date.class);
if (expires != null && expires.getTime() < System.currentTimeMillis()) {
// The indexed document has expired: try to delete it, then crawl again.
final Object idValue = document.get(fessConfig.getIndexFieldId());
if (idValue != null && !indexingHelper.deleteDocument(searchEngineClient, idValue.toString())) {
logger.debug("Failed to delete expired document: {}", url);
}
return true;
}
final Date lastModified = DocumentUtil.getValue(document, fessConfig.getIndexFieldLastModified(), Date.class);
if (lastModified == null) {
// No last-modified value in the index to compare against.
return true;
}
urlQueue.setLastModified(lastModified.getTime());
log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);
// Reuse the HEAD response from the role lookup above when there is one.
if (responseData == null) {
// head method
responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
if (responseData == null) {
return true;
}
}
final int httpStatusCode = responseData.getHttpStatusCode();
if (logger.isDebugEnabled()) {
logger.debug("Accessing document: {}, status: {}", url, httpStatusCode);
}
if (httpStatusCode == 404) {
// The resource is gone: pass on the indexed anchors, try to delete the document, do not crawl.
storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
if (!indexingHelper.deleteDocument(searchEngineClient, id)) {
logger.debug("Failed to delete 404 document: {}", url);
}
return false;
}
if (responseData.getLastModified() == null) {
// The response carries no last-modified value, so no comparison is possible.
return true;
}
if (responseData.getLastModified().getTime() <= lastModified.getTime() && httpStatusCode == 200) {
// Not modified since it was indexed: process the response with the NOT_MODIFIED status instead of crawling.
log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
responseData.setExecutionTime(System.currentTimeMillis() - startTime);
responseData.setParentUrl(urlQueue.getParentUrl());
responseData.setSessionId(crawlerContext.getSessionId());
responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
processResponse(urlQueue, responseData);
storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
// Write the expires value obtained for this crawling config to the indexed document, if there is one.
final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
if (documentExpires != null && !indexingHelper.updateDocument(searchEngineClient, id, fessConfig.getIndexFieldExpires(), documentExpires)) {
logger.debug("Failed to update {} at {}", fessConfig.getIndexFieldExpires(), url);
}
return false;
}
} finally {
// Close the HEAD response on every exit path.
if (responseData != null) {
CloseableUtil.closeQuietly(responseData);
}
}
}
// Reached when incremental crawling is disabled or no branch above returned.
return true;
}
use of org.codelibs.fess.helper.IndexingHelper in project fess by codelibs.
the class BaseThumbnailGenerator method process.
/**
 * Loads the thumbnail and config id fields of the document with the given id, validates them
 * and passes them to {@code consumer}.
 * <p>
 * Any exception raised along the way is logged and turned into a {@code false} result; a
 * {@code ThumbnailGenerationException} without a cause is logged at debug level only.
 *
 * @param id id of the document to look up
 * @param consumer predicate invoked with the config id and the thumbnail value, in that order
 * @return the predicate's result, or {@code false} if the document is missing, a field is
 *         invalid, or an exception occurred
 */
protected boolean process(final String id, final BiPredicate<String, String> consumer) {
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
    final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
    try {
        final String thumbnailField = fessConfig.getIndexFieldThumbnail();
        final String configIdField = fessConfig.getIndexFieldConfigId();
        final Map<String, Object> document =
                indexingHelper.getDocument(searchEngineClient, id, new String[] { thumbnailField, configIdField });
        if (document == null) {
            throw new ThumbnailGenerationException("Document is not found: " + id);
        }

        final String thumbnailUrl = DocumentUtil.getValue(document, fessConfig.getIndexFieldThumbnail(), String.class);
        if (StringUtil.isBlank(thumbnailUrl)) {
            throw new ThumbnailGenerationException("Invalid thumbnail: " + thumbnailUrl);
        }

        final String configId = DocumentUtil.getValue(document, fessConfig.getIndexFieldConfigId(), String.class);
        final boolean usableConfigId = configId != null && configId.length() >= 2;
        if (!usableConfigId) {
            throw new ThumbnailGenerationException("Invalid configId: " + configId);
        }

        return consumer.test(configId, thumbnailUrl);
    } catch (final ThumbnailGenerationException e) {
        // Validation failures raised above have no cause and are reported quietly.
        if (e.getCause() != null) {
            logger.warn("Failed to process {}", id, e);
        } else {
            logger.debug(e.getMessage());
        }
    } catch (final Exception e) {
        logger.warn("Failed to process {}", id, e);
    }
    return false;
}
use of org.codelibs.fess.helper.IndexingHelper in project fess by codelibs.
the class FileListIndexUpdateCallbackImpl method deleteDocuments.
/**
 * Asks the indexing helper to delete the document for every URL collected in
 * {@code deleteUrlList}, then empties the list.
 */
protected void deleteDocuments() {
    final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
    final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();

    deleteUrlList.forEach(targetUrl -> indexingHelper.deleteDocumentByUrl(searchEngineClient, targetUrl));

    if (logger.isDebugEnabled()) {
        logger.debug("Deleted {}", deleteUrlList);
    }

    // The pending URLs have been handled; start the next batch from an empty list.
    deleteUrlList.clear();
}
Aggregations