Search in sources :

Example 1 with CrawlingInfoHelper

use of org.codelibs.fess.helper.CrawlingInfoHelper in project fess by codelibs.

the class FessXpathTransformer method putAdditionalData.

/**
 * Fills {@code dataMap} with the index fields for one crawled HTML document.
 *
 * <p>Values are taken from the crawl response ({@code responseData}), the parsed
 * DOM ({@code document}), the crawling configuration looked up by the response's
 * session id, and the application configuration. Fields are written through
 * {@code putResultDataBody} / {@code putResultDataWithTemplate}.
 *
 * <p>Statement order matters: the document id is generated from the state of
 * {@code dataMap} at the time {@code generateId} is called, and the url field is
 * temporarily overwritten while the parent id is generated.
 *
 * @param dataMap      map that receives the index field values
 * @param responseData crawl response for the document
 * @param document     parsed DOM of the response body
 * @throws ChildUrlsException when a canonical URL is found and it differs from
 *         the response URL; the exception carries the canonical URL as the only
 *         child URL and no field is written to {@code dataMap}
 */
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
    // canonical: if the page declares a canonical URL other than its own URL,
    // abort indexing of this response and hand the canonical URL back as a child URL.
    // NOTE(review): fessConfig is read here before the local variable of the same
    // name is declared below, so this presumably resolves to a field of the
    // enclosing class — confirm against the full class.
    if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
        final String canonicalUrl = getCanonicalUrl(responseData, document);
        if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) {
            final Set<RequestData> childUrlSet = new HashSet<>();
            childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
            throw new ChildUrlsException(childUrlSet, this.getClass().getName() + "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
        }
    }
    // Collect the helpers and per-session configuration used below.
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
    final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
    final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
    final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
    String url = responseData.getUrl();
    // The indexing target is resolved from the original URL, before path mapping is applied.
    final String indexingTarget = crawlingConfig.getIndexingTarget(url);
    url = pathMappingHelper.replaceUrl(sessionId, url);
    final String mimeType = responseData.getMimeType();
    final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
    final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);
    // Prefer the encoding recorded on the URL queue entry; fall back to the response charset.
    String urlEncoding;
    final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
    if (urlQueue != null && urlQueue.getEncoding() != null) {
        urlEncoding = urlQueue.getEncoding();
    } else {
        urlEncoding = responseData.getCharSet();
    }
    // cid: written only when the crawling config has an id
    final String configId = crawlingConfig.getConfigId();
    if (configId != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
    }
    // expires: written only when an expiry date was computed
    if (documentExpires != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
    }
    // lang: node value selected by the language XPath, passed through normalizeLang
    final String lang = systemHelper.normalizeLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
    if (lang != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
    }
    // title (no title field is written in this method)
    // content: node value selected by the content XPath; prunedContent is not
    // declared in this method (presumably a field of the enclosing class — confirm)
    final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
    putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(responseData, body, dataMap));
    // cache: enabled either per config (FIELD parameter named after the cache field is "true",
    // case-insensitive) or globally, and only for supported mime types
    if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
        // Only cache bodies with a positive length that does not exceed the configured maximum.
        if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
            // Decode the raw body with the response charset, defaulting to UTF-8.
            String charSet = responseData.getCharSet();
            if (charSet == null) {
                charSet = Constants.UTF_8;
            }
            try (final BufferedInputStream is = new BufferedInputStream(responseData.getResponseBody())) {
                // cache
                putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(is), charSet));
                putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
            } catch (final Exception e) {
                // Best effort: a cache failure is logged and the rest of the document is still built.
                logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e);
            }
        } else {
            // Note: this branch is also taken when the content length is <= 0,
            // although the message only mentions the "too large" case.
            logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(), fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
        }
    }
    // digest: use the node selected by the digest XPath when it is not blank,
    // otherwise let documentHelper derive one from the body
    final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), false);
    if (StringUtil.isNotBlank(digest)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
    } else {
        putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
    }
    // segment: the session id returned by getCanonicalSessionId
    putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
    // host (derived from the path-mapped URL)
    putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url));
    // site
    putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
    // filename: written only when not blank
    final String fileName = getFileName(url, urlEncoding);
    if (StringUtil.isNotBlank(fileName)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
    }
    // url (path-mapped)
    putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    // created
    final Date now = systemHelper.getCurrentTime();
    putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
    // anchor
    putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
    // mimetype
    putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
    if (fileTypeHelper != null) {
        // filetype
        putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
    }
    // content_length (stored as a decimal string)
    putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
    // last_modified: when the response has no last-modified date, only the
    // timestamp field is written and it falls back to the creation time
    final Date lastModified = responseData.getLastModified();
    if (lastModified != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
    } else {
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
    }
    // indexingTarget
    putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
    // boost
    putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
    // label: labelType — labels from the crawling config plus labels matched against the URL
    final Set<String> labelTypeSet = new HashSet<>();
    for (final String labelType : crawlingConfig.getLabelTypeValues()) {
        labelTypeSet.add(labelType);
    }
    labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
    putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
    // role: roleType — the permissions of the crawling config
    final List<String> roleTypeList = new ArrayList<>();
    stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
    putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
    // id: generated from the fields written to dataMap so far
    putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
    // parentId: temporarily replace the url field with the (path-mapped) parent URL,
    // generate an id from that state, then restore the document's own URL.
    // Presumably generateId derives the id from the url field — confirm.
    String parentUrl = responseData.getParentUrl();
    if (StringUtil.isNotBlank(parentUrl)) {
        parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
        // set again
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    }
    // thumbnail: written only when not blank
    final String thumbnailUrl = getThumbnailUrl(responseData, document);
    if (StringUtil.isNotBlank(thumbnailUrl)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldThumbnail(), thumbnailUrl);
    }
    // from config: extra fields defined by the crawling config.
    // XPATH entries (except keys starting with "default.") are evaluated against the document;
    // VALUE entries are used as-is. Both go through the SCRIPT template registered for the same key, if any.
    final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
    xpathConfigMap.entrySet().stream().filter(e -> !e.getKey().startsWith("default.")).forEach(e -> {
        final String key = e.getKey();
        final String value = getSingleNodeValue(document, e.getValue(), true);
        putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key));
    });
    crawlingConfig.getConfigParameterMap(ConfigName.VALUE).entrySet().stream().forEach(e -> {
        final String key = e.getKey();
        final String value = e.getValue();
        putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key));
    });
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) Constants(org.codelibs.fess.Constants) BufferedInputStream(java.io.BufferedInputStream) URL(java.net.URL) Date(java.util.Date) LoggerFactory(org.slf4j.LoggerFactory) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ConfigName(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Locale(java.util.Locale) DOMParser(org.cyberneko.html.parsers.DOMParser) Document(org.w3c.dom.Document) Map(java.util.Map) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) InputStreamUtil(org.codelibs.core.io.InputStreamUtil) ResultData(org.codelibs.fess.crawler.entity.ResultData) Set(java.util.Set) List(java.util.List) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) RequestData(org.codelibs.fess.crawler.entity.RequestData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) PostConstruct(javax.annotation.PostConstruct) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) XObject(org.apache.xpath.objects.XObject) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) TransformerException(javax.xml.transform.TransformerException) HashMap(java.util.HashMap) SerializeUtil(org.codelibs.core.io.SerializeUtil) CrawlingParameterUtil(org.codelibs.fess.crawler.util.CrawlingParameterUtil) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) XpathTransformer(org.codelibs.fess.crawler.transformer.impl.XpathTransformer) Node(org.w3c.dom.Node) 
PrunedTag(org.codelibs.fess.util.PrunedTag) NamedNodeMap(org.w3c.dom.NamedNodeMap) InputSource(org.xml.sax.InputSource) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) Logger(org.slf4j.Logger) NodeList(org.w3c.dom.NodeList) MalformedURLException(java.net.MalformedURLException) StringUtil(org.codelibs.core.lang.StringUtil) DuplicateHostHelper(org.codelibs.fess.helper.DuplicateHostHelper) ValueHolder(org.codelibs.core.misc.ValueHolder) SystemHelper(org.codelibs.fess.helper.SystemHelper) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Collections(java.util.Collections) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Date(java.util.Date) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) TransformerException(javax.xml.transform.TransformerException) MalformedURLException(java.net.MalformedURLException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) BufferedInputStream(java.io.BufferedInputStream) RequestData(org.codelibs.fess.crawler.entity.RequestData) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) HashSet(java.util.HashSet)

Example 2 with CrawlingInfoHelper

use of org.codelibs.fess.helper.CrawlingInfoHelper in project fess by codelibs.

the class AbstractDataStoreImpl method store.

/**
 * Prepares the default field values for documents produced by this data store
 * and delegates the actual work to {@code storeData}.
 *
 * <p>The handler parameters of {@code config} are merged into
 * {@code initParamMap} in place, and that same map is passed on as the
 * parameter map.
 *
 * @param config       data store configuration
 * @param callback     callback passed through to {@code storeData}
 * @param initParamMap initial parameters; modified by this call
 */
@Override
public void store(final DataConfig config, final IndexUpdateCallback callback, final Map<String, String> initParamMap) {
    final Map<String, String> handlerParams = config.getHandlerParameterMap();
    final Map<String, String> handlerScripts = config.getHandlerScriptMap();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final Date expires = crawlingInfoHelper.getDocumentExpires(config);
    final FessConfig fessConfig = ComponentUtil.getFessConfig();

    // Handler parameters are added to (and may overwrite entries of) the caller's map.
    initParamMap.putAll(handlerParams);

    // Field values shared by every document of this data store.
    final Map<String, Object> defaults = new HashMap<>();

    // cid
    final String cid = config.getConfigId();
    if (cid != null) {
        defaults.put(fessConfig.getIndexFieldConfigId(), cid);
    }

    // expires
    if (expires != null) {
        defaults.put(fessConfig.getIndexFieldExpires(), expires);
    }

    // segment
    defaults.put(fessConfig.getIndexFieldSegment(), initParamMap.get(Constants.SESSION_ID));

    // created
    defaults.put(fessConfig.getIndexFieldCreated(), systemHelper.getCurrentTime());

    // boost
    defaults.put(fessConfig.getIndexFieldBoost(), config.getBoost().toString());

    // label
    final List<String> labels = new ArrayList<>();
    for (final String label : config.getLabelTypeValues()) {
        labels.add(label);
    }
    defaults.put(fessConfig.getIndexFieldLabel(), labels);

    // role
    final List<String> roles = new ArrayList<>();
    stream(config.getPermissions()).of(permissions -> permissions.forEach(roles::add));
    defaults.put(fessConfig.getIndexFieldRole(), roles);

    // mimetype
    defaults.put(fessConfig.getIndexFieldMimetype(), mimeType);

    // title, content, cache, digest, host, site, url, anchor, content_length,
    // last_modified and id are not given default values here.
    storeData(config, callback, initParamMap, handlerScripts, defaults);
}
Also used : DataConfig(org.codelibs.fess.es.config.exentity.DataConfig) Constants(org.codelibs.fess.Constants) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) Logger(org.slf4j.Logger) Date(java.util.Date) StringUtil(org.codelibs.core.lang.StringUtil) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) IndexUpdateCallback(org.codelibs.fess.ds.IndexUpdateCallback) ArrayList(java.util.ArrayList) DataStore(org.codelibs.fess.ds.DataStore) List(java.util.List) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) GroovyUtil(org.codelibs.fess.util.GroovyUtil) Map(java.util.Map) SystemHelper(org.codelibs.fess.helper.SystemHelper) HashMap(java.util.HashMap) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) ArrayList(java.util.ArrayList) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Date(java.util.Date)

Example 3 with CrawlingInfoHelper

use of org.codelibs.fess.helper.CrawlingInfoHelper in project fess by codelibs.

the class Crawler method doCrawl.

/**
 * Runs one crawl session described by {@code options}.
 *
 * <p>Sets up path mappings, initializes the duplicate-host helper (failures are
 * only logged), deletes expired sessions, then runs the web/file-system crawler
 * and the data crawler on separate threads and waits for both. Start/end times,
 * completion status and total execution time are recorded through
 * {@code crawlingInfoHelper}; the bookkeeping in the {@code finally} block runs
 * whether or not the crawl succeeded.
 *
 * @param options crawl options (session id, name and optional config id lists)
 * @return {@code Constants.EXIT_OK} on success, {@code Constants.EXIT_FAIL} if
 *         any {@code Throwable} was raised
 */
public int doCrawl(final Options options) {
    if (logger.isInfoEnabled()) {
        logger.info("Starting Crawler..");
    }
    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final long startedAt = System.currentTimeMillis();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    boolean completed = false;
    try {
        writeTimeToSessionInfo(crawlingInfoHelper, Constants.CRAWLER_START_TIME);

        // setup path mapping
        final List<String> processTypes = new ArrayList<>();
        processTypes.add(Constants.PROCESS_TYPE_CRAWLING);
        processTypes.add(Constants.PROCESS_TYPE_BOTH);
        pathMappingHelper.setPathMappingList(options.sessionId, pathMappingService.getPathMappingList(processTypes));

        // duplicate host: a failure here must not stop the crawl
        try {
            ComponentUtil.getDuplicateHostHelper().init();
        } catch (final Exception e) {
            logger.warn("Could not initialize duplicateHostHelper.", e);
        }

        // delete expired sessions
        crawlingInfoService.deleteSessionIdsBefore(options.sessionId, options.name, ComponentUtil.getSystemHelper().getCurrentTimeAsLong());

        final List<String> webIds = options.getWebConfigIdList();
        final List<String> fileIds = options.getFileConfigIdList();
        final List<String> dataIds = options.getDataConfigIdList();

        // With no id list given at all, every crawler runs.
        final boolean nothingSelected = webIds == null && fileIds == null && dataIds == null;
        final boolean runWebFs = nothingSelected || webIds != null || fileIds != null;
        final boolean runData = nothingSelected || dataIds != null;

        Thread webFsThread = null;
        if (runWebFs) {
            final Runnable webFsTask = () -> {
                writeTimeToSessionInfo(crawlingInfoHelper, Constants.WEB_FS_CRAWLER_START_TIME);
                webFsIndexHelper.crawl(options.sessionId, webIds, fileIds);
                writeTimeToSessionInfo(crawlingInfoHelper, Constants.WEB_FS_CRAWLER_END_TIME);
            };
            webFsThread = new Thread(webFsTask, WEB_FS_CRAWLING_PROCESS);
            webFsThread.start();
        }

        Thread dataThread = null;
        if (runData) {
            final Runnable dataTask = () -> {
                writeTimeToSessionInfo(crawlingInfoHelper, Constants.DATA_CRAWLER_START_TIME);
                dataIndexHelper.crawl(options.sessionId, dataIds);
                writeTimeToSessionInfo(crawlingInfoHelper, Constants.DATA_CRAWLER_END_TIME);
            };
            dataThread = new Thread(dataTask, DATA_CRAWLING_PROCESS);
            dataThread.start();
        }

        // Either thread may be null here when its crawler was not selected.
        joinCrawlerThread(webFsThread);
        joinCrawlerThread(dataThread);

        if (logger.isInfoEnabled()) {
            logger.info("Finished Crawler");
        }
        completed = true;
        return Constants.EXIT_OK;
    } catch (final Throwable t) {
        logger.warn("An exception occurs on the crawl task.", t);
        return Constants.EXIT_FAIL;
    } finally {
        pathMappingHelper.removePathMappingList(options.sessionId);
        final String status;
        if (completed) {
            status = Constants.T.toString();
        } else {
            status = Constants.F.toString();
        }
        crawlingInfoHelper.putToInfoMap(Constants.CRAWLER_STATUS, status);
        writeTimeToSessionInfo(crawlingInfoHelper, Constants.CRAWLER_END_TIME);
        crawlingInfoHelper.putToInfoMap(Constants.CRAWLER_EXEC_TIME, Long.toString(System.currentTimeMillis() - startedAt));
    }
}
Also used : CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) ArrayList(java.util.ArrayList) DuplicateHostHelper(org.codelibs.fess.helper.DuplicateHostHelper) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) IOException(java.io.IOException) CmdLineException(org.kohsuke.args4j.CmdLineException)

Example 4 with CrawlingInfoHelper

use of org.codelibs.fess.helper.CrawlingInfoHelper in project fess by codelibs.

the class IndexUpdateCallbackImpl method store.

/*
 * Completes one document (id, optional click/favorite counts, doc id), appends
 * it to the shared docList buffer and sends the buffer to the index once it
 * has grown large enough.
 *
 * The flush threshold is the accumulated content size when the document
 * carries a content length, otherwise the number of buffered documents.
 *
 * Throws DataStoreException when dataMap has no url value.
 *
 * (non-Javadoc)
 * @see org.codelibs.fess.ds.impl.IndexUpdateCallback#store(java.util.Map)
 */
@Override
public void store(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
    final long begin = System.currentTimeMillis();
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final FessEsClient fessEsClient = ComponentUtil.getFessEsClient();

    if (logger.isDebugEnabled()) {
        logger.debug("Adding " + dataMap);
    }

    // required check
    if (dataMap.get(fessConfig.getIndexFieldUrl()) == null) {
        throw new DataStoreException("url is null. dataMap=" + dataMap);
    }

    final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    dataMap.put(fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));

    final String docUrl = dataMap.get(fessConfig.getIndexFieldUrl()).toString();

    if (fessConfig.getIndexerClickCountEnabledAsBoolean()) {
        addClickCountField(dataMap, docUrl, fessConfig.getIndexFieldClickCount());
    }

    if (fessConfig.getIndexerFavoriteCountEnabledAsBoolean()) {
        addFavoriteCountField(dataMap, docUrl, fessConfig.getIndexFieldFavoriteCount());
    }

    // Keep a doc id supplied by the caller; generate one only when it is missing.
    if (!dataMap.containsKey(fessConfig.getIndexFieldDocId())) {
        final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
        dataMap.put(fessConfig.getIndexFieldDocId(), systemHelper.generateDocId(dataMap));
    }

    synchronized (docList) {
        docList.add(dataMap);
        if (logger.isDebugEnabled()) {
            logger.debug("Added the document. " + "The number of a document cache is " + docList.size() + ".");
        }

        final Long length = DocumentUtil.getValue(dataMap, fessConfig.getIndexFieldContentLength(), Long.class);
        if (length == null) {
            // No content length available: flush by document count.
            if (docList.size() >= fessConfig.getIndexerDataMaxDocumentCacheSizeAsInteger().intValue()) {
                indexingHelper.sendDocuments(fessEsClient, docList);
            }
        } else {
            // Flush by accumulated content size.
            docList.addContentSize(length.longValue());
            if (docList.getContentSize() >= maxDocumentRequestSize) {
                indexingHelper.sendDocuments(fessEsClient, docList);
            }
        }

        executeTime += System.currentTimeMillis() - begin;
    }

    documentSize.getAndIncrement();

    if (logger.isDebugEnabled()) {
        logger.debug("The number of an added document is " + documentSize.get() + ".");
    }
}
Also used : DataStoreException(org.codelibs.fess.exception.DataStoreException) SystemHelper(org.codelibs.fess.helper.SystemHelper) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) FessEsClient(org.codelibs.fess.es.client.FessEsClient) AtomicLong(java.util.concurrent.atomic.AtomicLong) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig)

Example 5 with CrawlingInfoHelper

use of org.codelibs.fess.helper.CrawlingInfoHelper in project fess by codelibs.

the class FessCrawlerThread method isContentUpdated.

/**
 * Decides whether the queued URL has to be crawled (again).
 *
 * <p>When incremental crawling is disabled this always returns {@code true}.
 * Otherwise the already indexed document for the URL is looked up and compared
 * with the result of a HEAD request:
 * <ul>
 * <li>{@code true} — SMB directory URL, no HEAD response, no indexed document,
 *     expired document, missing last-modified date on either side, or the
 *     response does not qualify as "not modified";</li>
 * <li>{@code false} — the HEAD request returned 404 (the indexed document is
 *     deleted), or the response's last-modified time is not newer than the
 *     indexed one and the status is 200 (the response is processed with the
 *     not-modified status and the document's expiry is refreshed).</li>
 * </ul>
 * Any {@code responseData} obtained here is closed before returning.
 *
 * @param client   client used to issue the HEAD request
 * @param urlQueue queue entry being checked; its last-modified value is set
 *                 when an indexed document with a last-modified date exists
 * @return {@code true} if the URL should be crawled, {@code false} otherwise
 */
@Override
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
    if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
        final long startTime = System.currentTimeMillis();
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
        final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
        final SambaHelper sambaHelper = ComponentUtil.getSambaHelper();
        final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
        final FessEsClient fessEsClient = ComponentUtil.getFessEsClient();
        final String url = urlQueue.getUrl();
        ResponseData responseData = null;
        try {
            final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
            // Build a minimal data map (url + roles) that is passed to generateId below.
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.put(fessConfig.getIndexFieldUrl(), url);
            final List<String> roleTypeList = new ArrayList<>();
            stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
            if (url.startsWith("smb://")) {
                if (url.endsWith("/")) {
                    // directory
                    return true;
                }
                if (fessConfig.isSmbRoleFromFile()) {
                    // head method: the access control entries in the response metadata
                    // contribute additional roles
                    responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                    if (responseData == null) {
                        return true;
                    }
                    final ACE[] aces = (ACE[]) responseData.getMetaDataMap().get(SmbClient.SMB_ACCESS_CONTROL_ENTRIES);
                    if (aces != null) {
                        for (final ACE item : aces) {
                            final SID sid = item.getSID();
                            final String accountId = sambaHelper.getAccountId(sid);
                            if (accountId != null) {
                                roleTypeList.add(accountId);
                            }
                        }
                        if (logger.isDebugEnabled()) {
                            logger.debug("smbUrl:" + responseData.getUrl() + " roleType:" + roleTypeList.toString());
                        }
                    }
                }
            }
            dataMap.put(fessConfig.getIndexFieldRole(), roleTypeList);
            final String id = crawlingInfoHelper.generateId(dataMap);
            if (logger.isDebugEnabled()) {
                logger.debug("Searching indexed document: " + id);
            }
            // Fetch only the fields needed for the checks below.
            final Map<String, Object> document = indexingHelper.getDocument(fessEsClient, id, new String[] { fessConfig.getIndexFieldId(), fessConfig.getIndexFieldLastModified(), fessConfig.getIndexFieldAnchor(), fessConfig.getIndexFieldSegment(), fessConfig.getIndexFieldExpires(), fessConfig.getIndexFieldClickCount(), fessConfig.getIndexFieldFavoriteCount() });
            if (document == null) {
                // Not indexed yet: queue the child URLs returned by getChildUrlSet and crawl.
                storeChildUrlsToQueue(urlQueue, getChildUrlSet(fessEsClient, id));
                return true;
            }
            // An expired document is deleted from the index (a failed delete is only logged) and crawled again.
            final Date expires = DocumentUtil.getValue(document, fessConfig.getIndexFieldExpires(), Date.class);
            if (expires != null && expires.getTime() < System.currentTimeMillis()) {
                final Object idValue = document.get(fessConfig.getIndexFieldId());
                if (idValue != null && !indexingHelper.deleteDocument(fessEsClient, idValue.toString())) {
                    logger.debug("Failed to delete expired document: " + url);
                }
                return true;
            }
            // Without an indexed last-modified date there is nothing to compare against.
            final Date lastModified = DocumentUtil.getValue(document, fessConfig.getIndexFieldLastModified(), Date.class);
            if (lastModified == null) {
                return true;
            }
            urlQueue.setLastModified(lastModified.getTime());
            log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);
            // Reuse the HEAD response from the SMB branch if there is one.
            if (responseData == null) {
                // head method
                responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                if (responseData == null) {
                    return true;
                }
            }
            final int httpStatusCode = responseData.getHttpStatusCode();
            if (logger.isDebugEnabled()) {
                logger.debug("Accessing document: " + url + ", status: " + httpStatusCode);
            }
            if (httpStatusCode == 404) {
                // Gone: queue the anchors stored with the indexed document, then remove it from the index.
                storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
                if (!indexingHelper.deleteDocument(fessEsClient, id)) {
                    logger.debug("Failed to delete 404 document: " + url);
                }
                return false;
            } else if (responseData.getLastModified() == null) {
                return true;
            } else if (responseData.getLastModified().getTime() <= lastModified.getTime() && httpStatusCode == 200) {
                // Not modified: process the response with the not-modified status,
                // queue the stored anchors and refresh the document's expiry.
                log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
                responseData.setExecutionTime(System.currentTimeMillis() - startTime);
                responseData.setParentUrl(urlQueue.getParentUrl());
                responseData.setSessionId(crawlerContext.getSessionId());
                responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
                processResponse(urlQueue, responseData);
                storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
                final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
                if (documentExpires != null && !indexingHelper.updateDocument(fessEsClient, id, fessConfig.getIndexFieldExpires(), documentExpires)) {
                    logger.debug("Failed to update " + fessConfig.getIndexFieldExpires() + " at " + url);
                }
                return false;
            }
            // Any other case (e.g. newer last-modified or a non-200 status) falls through to "crawl".
        } finally {
            // Close the HEAD response on every exit path.
            if (responseData != null) {
                IOUtils.closeQuietly(responseData);
            }
        }
    }
    return true;
}
Also used : DocumentUtil(org.codelibs.fess.util.DocumentUtil) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) Date(java.util.Date) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) HashSet(java.util.HashSet) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Map(java.util.Map) LinkedHashSet(java.util.LinkedHashSet) SambaHelper(org.codelibs.fess.helper.SambaHelper) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) LogType(org.codelibs.fess.crawler.log.LogType) Logger(org.slf4j.Logger) FessEsClient(org.codelibs.fess.es.client.FessEsClient) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) StringUtil(org.codelibs.core.lang.StringUtil) SID(jcifs.smb.SID) Set(java.util.Set) ContentNotFoundException(org.codelibs.fess.exception.ContentNotFoundException) Collectors(java.util.stream.Collectors) IOUtils(org.apache.commons.io.IOUtils) List(java.util.List) ACE(jcifs.smb.ACE) RequestData(org.codelibs.fess.crawler.entity.RequestData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) SmbClient(org.codelibs.fess.crawler.client.smb.SmbClient) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) ACE(jcifs.smb.ACE) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) HashMap(java.util.HashMap) SambaHelper(org.codelibs.fess.helper.SambaHelper) FessEsClient(org.codelibs.fess.es.client.FessEsClient) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) ArrayList(java.util.ArrayList) 
FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Date(java.util.Date) SID(jcifs.smb.SID) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper)

Aggregations

CrawlingInfoHelper (org.codelibs.fess.helper.CrawlingInfoHelper)7 ArrayList (java.util.ArrayList)5 Date (java.util.Date)5 HashMap (java.util.HashMap)5 Map (java.util.Map)5 FessConfig (org.codelibs.fess.mylasta.direction.FessConfig)5 List (java.util.List)4 StringUtil (org.codelibs.core.lang.StringUtil)4 StreamUtil.stream (org.codelibs.core.stream.StreamUtil.stream)4 SystemHelper (org.codelibs.fess.helper.SystemHelper)4 ComponentUtil (org.codelibs.fess.util.ComponentUtil)4 Logger (org.slf4j.Logger)4 LoggerFactory (org.slf4j.LoggerFactory)4 HashSet (java.util.HashSet)3 Set (java.util.Set)3 ResponseData (org.codelibs.fess.crawler.entity.ResponseData)3 UrlQueue (org.codelibs.fess.crawler.entity.UrlQueue)3 IOException (java.io.IOException)2 ACE (jcifs.smb.ACE)2 SID (jcifs.smb.SID)2