Search in sources :

Example 1 with CrawlingConfig

use of org.codelibs.fess.es.config.exentity.CrawlingConfig in project fess by codelibs.

the class FessXpathTransformer method putAdditionalData.

/**
 * Populates {@code dataMap} with the index fields derived from a crawled HTML response.
 *
 * <p>If a canonical-URL XPath is configured and the page reports a canonical URL that differs
 * from the response URL, nothing is written to {@code dataMap}; a {@link ChildUrlsException}
 * carrying the canonical URL is thrown instead.</p>
 *
 * <p>Statement order matters: the document id is generated from {@code dataMap} after the
 * other fields have been written, and the URL field is temporarily replaced while the
 * parent id is generated.</p>
 *
 * @param dataMap map that receives the field values
 * @param responseData crawler response the values are read from
 * @param document parsed DOM the XPath lookups run against
 * @throws ChildUrlsException when a canonical URL is found that differs from the response URL
 */
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
    // canonical: hand the canonical URL back to the crawler instead of indexing this page
    // NOTE(review): this `fessConfig` is resolved outside the method, because the local of the
    // same name is only declared further down — presumably an instance field; confirm.
    if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
        final String canonicalUrl = getCanonicalUrl(responseData, document);
        if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) {
            final Set<RequestData> childUrlSet = new HashSet<>();
            childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
            throw new ChildUrlsException(childUrlSet, this.getClass().getName() + "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
        }
    }
    // From here on `fessConfig` refers to this local, which shadows the outer one used above.
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    // The canonical session id is used for path mapping and the segment field ...
    final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    // ... while the crawling config is looked up with the session id exactly as the response carries it.
    final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
    final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
    final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
    final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
    String url = responseData.getUrl();
    // The indexing target is computed from the original URL, before path mapping rewrites it.
    final String indexingTarget = crawlingConfig.getIndexingTarget(url);
    url = pathMappingHelper.replaceUrl(sessionId, url);
    final String mimeType = responseData.getMimeType();
    final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
    final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);
    // URL encoding: prefer the encoding recorded on the URL queue entry, else the response charset.
    String urlEncoding;
    final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
    if (urlQueue != null && urlQueue.getEncoding() != null) {
        urlEncoding = urlQueue.getEncoding();
    } else {
        urlEncoding = responseData.getCharSet();
    }
    // cid: id of the crawling config, written only when present
    final String configId = crawlingConfig.getConfigId();
    if (configId != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
    }
    // expires: written only when the config yields an expiry date
    if (documentExpires != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
    }
    // lang: value read via the language XPath, then normalized
    final String lang = systemHelper.normalizeLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
    if (lang != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
    }
    // title
    // NOTE(review): no title handling follows this marker in the visible code.
    // content
    // NOTE(review): `prunedContent` is not declared in this method — presumably a field; confirm.
    final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
    putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(responseData, body, dataMap));
    // Caching is attempted when either the per-config field setting or the global flag enables it,
    // and the MIME type is one of the supported cache types.
    if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
        if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
            // Decode the raw body with the response charset, falling back to UTF-8 when none is reported.
            String charSet = responseData.getCharSet();
            if (charSet == null) {
                charSet = Constants.UTF_8;
            }
            try (final BufferedInputStream is = new BufferedInputStream(responseData.getResponseBody())) {
                // cache
                putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(is), charSet));
                putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
            } catch (final Exception e) {
                // Best effort: a failed cache write is logged and the remaining fields are still populated.
                logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e);
            }
        } else {
            // NOTE(review): this branch is also taken when the content length is <= 0,
            // in which case the "too large" wording of the message does not apply.
            logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(), fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
        }
    }
    // digest: use the XPath value when it is non-blank, otherwise derive one from the body
    final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), false);
    if (StringUtil.isNotBlank(digest)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
    } else {
        putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
    }
    // segment: the canonical session id
    putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
    // host (derived from the path-mapped URL)
    putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url));
    // site
    putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
    // filename: written only when non-blank
    final String fileName = getFileName(url, urlEncoding);
    if (StringUtil.isNotBlank(fileName)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
    }
    // url (path-mapped)
    putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    // created: current time, also reused below as the timestamp fallback
    final Date now = systemHelper.getCurrentTime();
    putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
    // anchor
    putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
    // mimetype
    putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
    if (fileTypeHelper != null) {
        // filetype
        putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
    }
    // content_length (stored as a decimal string)
    putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
    // last_modified
    final Date lastModified = responseData.getLastModified();
    if (lastModified != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
        // timestamp: mirrors last_modified when the response provides one
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
    } else {
        // timestamp: falls back to the creation time
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
    }
    // indexingTarget
    putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
    // boost
    putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
    // label: labels from the crawling config plus those the label helper matches for the URL
    final Set<String> labelTypeSet = new HashSet<>();
    for (final String labelType : crawlingConfig.getLabelTypeValues()) {
        labelTypeSet.add(labelType);
    }
    labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
    putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
    // role: the permissions of the crawling config
    final List<String> roleTypeList = new ArrayList<>();
    stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
    putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
    // id: generated from dataMap as populated so far — keep this after the writes above
    putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
    // parentId: temporarily store the parent URL in the URL field so that generateId
    // produces the parent's id, then restore this document's own URL.
    String parentUrl = responseData.getParentUrl();
    if (StringUtil.isNotBlank(parentUrl)) {
        parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
        // set again
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    }
    // thumbnail: written only when non-blank
    final String thumbnailUrl = getThumbnailUrl(responseData, document);
    if (StringUtil.isNotBlank(thumbnailUrl)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldThumbnail(), thumbnailUrl);
    }
    // from config: per-config XPath fields (keys starting with "default." are skipped) ...
    final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
    xpathConfigMap.entrySet().stream().filter(e -> !e.getKey().startsWith("default.")).forEach(e -> {
        final String key = e.getKey();
        final String value = getSingleNodeValue(document, e.getValue(), true);
        putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key));
    });
    // ... and fixed values from the config; both are passed along with the script registered under the same key.
    crawlingConfig.getConfigParameterMap(ConfigName.VALUE).entrySet().stream().forEach(e -> {
        final String key = e.getKey();
        final String value = e.getValue();
        putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key));
    });
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) Constants(org.codelibs.fess.Constants) BufferedInputStream(java.io.BufferedInputStream) URL(java.net.URL) Date(java.util.Date) LoggerFactory(org.slf4j.LoggerFactory) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ConfigName(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Locale(java.util.Locale) DOMParser(org.cyberneko.html.parsers.DOMParser) Document(org.w3c.dom.Document) Map(java.util.Map) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) InputStreamUtil(org.codelibs.core.io.InputStreamUtil) ResultData(org.codelibs.fess.crawler.entity.ResultData) Set(java.util.Set) List(java.util.List) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) RequestData(org.codelibs.fess.crawler.entity.RequestData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) PostConstruct(javax.annotation.PostConstruct) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) XObject(org.apache.xpath.objects.XObject) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) TransformerException(javax.xml.transform.TransformerException) HashMap(java.util.HashMap) SerializeUtil(org.codelibs.core.io.SerializeUtil) CrawlingParameterUtil(org.codelibs.fess.crawler.util.CrawlingParameterUtil) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) XpathTransformer(org.codelibs.fess.crawler.transformer.impl.XpathTransformer) Node(org.w3c.dom.Node) 
PrunedTag(org.codelibs.fess.util.PrunedTag) NamedNodeMap(org.w3c.dom.NamedNodeMap) InputSource(org.xml.sax.InputSource) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) Logger(org.slf4j.Logger) NodeList(org.w3c.dom.NodeList) MalformedURLException(java.net.MalformedURLException) StringUtil(org.codelibs.core.lang.StringUtil) DuplicateHostHelper(org.codelibs.fess.helper.DuplicateHostHelper) ValueHolder(org.codelibs.core.misc.ValueHolder) SystemHelper(org.codelibs.fess.helper.SystemHelper) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Collections(java.util.Collections) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Date(java.util.Date) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) TransformerException(javax.xml.transform.TransformerException) MalformedURLException(java.net.MalformedURLException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) BufferedInputStream(java.io.BufferedInputStream) RequestData(org.codelibs.fess.crawler.entity.RequestData) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) HashSet(java.util.HashSet)

Example 2 with CrawlingConfig

use of org.codelibs.fess.es.config.exentity.CrawlingConfig in project fess by codelibs.

the class FessCrawlerThread method processResponse.

/**
 * Delegates to the parent implementation and then records the URL as a failure when the
 * response status code is one of the configured failure status codes.
 *
 * @param urlQueue queue entry whose URL is recorded on failure
 * @param responseData response whose HTTP status code is checked
 */
@Override
protected void processResponse(final UrlQueue<?> urlQueue, final ResponseData responseData) {
    super.processResponse(urlQueue, responseData);

    final FessConfig config = ComponentUtil.getFessConfig();
    final boolean failureStatus = config.isCrawlerFailureUrlStatusCodes(responseData.getHttpStatusCode());
    if (!failureStatus) {
        // Nothing to record for status codes that are not configured as failures.
        return;
    }

    final CrawlingConfig sessionConfig = ComponentUtil.getCrawlingConfigHelper().get(crawlerContext.getSessionId());
    final String failedUrl = urlQueue.getUrl();
    final FailureUrlService service = ComponentUtil.getComponent(FailureUrlService.class);
    final String errorName = ContentNotFoundException.class.getCanonicalName();
    service.store(sessionConfig, errorName, failedUrl, new ContentNotFoundException(failedUrl));
}
Also used : CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ContentNotFoundException(org.codelibs.fess.exception.ContentNotFoundException) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig)

Example 3 with CrawlingConfig

use of org.codelibs.fess.es.config.exentity.CrawlingConfig in project fess by codelibs.

the class CrawlerLogHelper method storeFailureUrl.

/**
 * Stores the URL of the given queue entry as a failure URL for the crawling config of the
 * crawler context's session.
 *
 * @param crawlerContext context supplying the session id used to look up the crawling config
 * @param urlQueue queue entry whose URL failed
 * @param errorName name under which the failure is stored
 * @param e cause of the failure, passed through to the failure URL service
 */
private void storeFailureUrl(final CrawlerContext crawlerContext, final UrlQueue<?> urlQueue, final String errorName, final Throwable e) {
    // UrlQueue<?> rather than the raw type: only getUrl() is needed, so any parameterization is accepted.
    final CrawlingConfig crawlingConfig = getCrawlingConfig(crawlerContext.getSessionId());
    final String url = urlQueue.getUrl();
    final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
    failureUrlService.store(crawlingConfig, errorName, url, e);
}
Also used : CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService)

Example 4 with CrawlingConfig

use of org.codelibs.fess.es.config.exentity.CrawlingConfig in project fess by codelibs.

the class GitBucketDataStoreImpl method storeData.

/**
 * Crawls every repository returned by the GitBucket server: file contents, then issues and
 * pull requests, then Wiki pages.
 *
 * <p>Returns without crawling (after logging a warning) when the root URL or the auth token is
 * empty, or when the repository list is empty. A failure while processing one repository is
 * logged and the loop continues with the next repository.</p>
 *
 * @param dataConfig data store configuration wrapped to add the authentication headers
 * @param callback callback passed through to the store helpers
 * @param paramMap data store parameters (root URL, token and read interval are read from it)
 * @param scriptMap scripts passed through to the store helpers
 * @param defaultDataMap default field values passed through to the store helpers
 */
@Override
protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
    final String rootURL = getRootURL(paramMap);
    final String authToken = getAuthToken(paramMap);
    final long readInterval = getReadInterval(paramMap);
    // Non-emptiness Check for URL and Token
    if (rootURL.isEmpty() || authToken.isEmpty()) {
        logger.warn("parameter \"" + TOKEN_PARAM + "\" and \"" + GITBUCKET_URL_PARAM + "\" are required");
        return;
    }
    // Get List of Repositories
    final List<Map<String, Object>> repositoryList = getRepositoryList(rootURL, authToken);
    if (repositoryList.isEmpty()) {
        logger.warn("Token is invalid or no Repository");
        return;
    }
    // Get Labels
    final Map<String, String> pluginInfo = getFessPluginInfo(rootURL, authToken);
    final String sourceLabel = pluginInfo.get("source_label");
    final String issueLabel = pluginInfo.get("issue_label");
    final String wikiLabel = pluginInfo.get("wiki_label");
    // Wrap the config so that every crawler client is created with the token and raw-content headers.
    final CrawlingConfig crawlingConfig = new CrawlingConfigWrapper(dataConfig) {

        @Override
        public Map<String, Object> initializeClientFactory(final CrawlerClientFactory crawlerClientFactory) {
            // Named differently from the enclosing method's paramMap, which has another type.
            final Map<String, Object> clientParamMap = super.initializeClientFactory(crawlerClientFactory);
            final List<RequestHeader> headerList = new ArrayList<>();
            // Keep any headers that are already configured, then append ours.
            final RequestHeader[] headers = (RequestHeader[]) clientParamMap.get(HcHttpClient.REQUERT_HEADERS_PROPERTY);
            if (headers != null) {
                for (final RequestHeader header : headers) {
                    headerList.add(header);
                }
            }
            headerList.add(new RequestHeader("Authorization", "token " + authToken));
            headerList.add(new RequestHeader("Accept", "application/vnd.github.v3.raw"));
            clientParamMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY, headerList.toArray(new RequestHeader[headerList.size()]));
            return clientParamMap;
        }
    };
    // Crawl each repository
    for (final Map<String, Object> repository : repositoryList) {
        try {
            final String owner = (String) repository.get("owner");
            final String name = (String) repository.get("name");
            final String refStr = getGitRef(rootURL, authToken, owner, name, "master");
            // A missing or non-integer count throws here and is handled by the catch below.
            final int issueCount = (int) repository.get("issue_count");
            final int pullCount = (int) repository.get("pull_count");
            final List<String> roleList = createRoleList(owner, repository);
            logger.info("Crawl " + owner + "/" + name);
            // crawl and store file contents recursively
            crawlFileContents(rootURL, authToken, owner, name, refStr, StringUtil.EMPTY, 0, readInterval, path -> {
                storeFileContent(rootURL, authToken, sourceLabel, owner, name, refStr, roleList, path, crawlingConfig, callback, paramMap, scriptMap, defaultDataMap);
                if (readInterval > 0) {
                    sleep(readInterval);
                }
            });
            logger.info("Crawl issues in " + owner + "/" + name);
            // store issues: ids 1..(issueCount + pullCount) are visited, one id per issue or pull request
            for (int issueId = 1; issueId <= issueCount + pullCount; issueId++) {
                // Integer.valueOf replaces the deprecated boxing constructor new Integer(int).
                storeIssueById(rootURL, authToken, issueLabel, owner, name, Integer.valueOf(issueId), roleList, crawlingConfig, callback, paramMap, scriptMap, defaultDataMap);
                if (readInterval > 0) {
                    sleep(readInterval);
                }
            }
            logger.info("Crawl Wiki in " + owner + "/" + name);
            // crawl Wiki
            storeWikiContents(rootURL, authToken, wikiLabel, owner, name, roleList, crawlingConfig, callback, paramMap, scriptMap, defaultDataMap, readInterval);
        } catch (final Exception e) {
            // Best effort: one failing repository must not stop the remaining ones.
            logger.warn("Failed to access to " + repository, e);
        }
    }
}
Also used : CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) CrawlingConfigWrapper(org.codelibs.fess.es.config.exentity.CrawlingConfigWrapper) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ArrayList(java.util.ArrayList) URISyntaxException(java.net.URISyntaxException) RequestHeader(org.codelibs.fess.crawler.client.http.RequestHeader) HashMap(java.util.HashMap) Map(java.util.Map)

Example 5 with CrawlingConfig

use of org.codelibs.fess.es.config.exentity.CrawlingConfig in project fess by codelibs.

the class FessCrawlerThread method isContentUpdated.

/**
 * Decides whether the queued URL has to be crawled again.
 *
 * <p>When incremental crawling is disabled this always returns {@code true}. Otherwise the
 * previously indexed document is looked up by the id generated from the URL and role list,
 * and its expiry and last-modified values are compared with a HEAD response for the URL.</p>
 *
 * @param client crawler client used to issue the HEAD request
 * @param urlQueue queue entry being checked; its last-modified value is set when the indexed
 *        document has one
 * @return {@code false} when the HEAD response is a 404, or is a 200 whose last-modified time
 *         is not newer than the indexed one; {@code true} in every other case
 */
@Override
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
    if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
        // Start of the check; used below as the execution time of a not-modified response.
        final long startTime = System.currentTimeMillis();
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
        final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
        final SambaHelper sambaHelper = ComponentUtil.getSambaHelper();
        final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
        final FessEsClient fessEsClient = ComponentUtil.getFessEsClient();
        final String url = urlQueue.getUrl();
        // Set by whichever HEAD request runs first; closed in the finally block.
        ResponseData responseData = null;
        try {
            final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
            // Build the URL + role data that the document id is generated from.
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.put(fessConfig.getIndexFieldUrl(), url);
            final List<String> roleTypeList = new ArrayList<>();
            stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
            if (url.startsWith("smb://")) {
                if (url.endsWith("/")) {
                    // directory
                    return true;
                }
                if (fessConfig.isSmbRoleFromFile()) {
                    // head method: the access control entries in the response metadata contribute roles
                    responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                    if (responseData == null) {
                        return true;
                    }
                    final ACE[] aces = (ACE[]) responseData.getMetaDataMap().get(SmbClient.SMB_ACCESS_CONTROL_ENTRIES);
                    if (aces != null) {
                        for (final ACE item : aces) {
                            final SID sid = item.getSID();
                            final String accountId = sambaHelper.getAccountId(sid);
                            if (accountId != null) {
                                roleTypeList.add(accountId);
                            }
                        }
                        if (logger.isDebugEnabled()) {
                            logger.debug("smbUrl:" + responseData.getUrl() + " roleType:" + roleTypeList.toString());
                        }
                    }
                }
            }
            dataMap.put(fessConfig.getIndexFieldRole(), roleTypeList);
            final String id = crawlingInfoHelper.generateId(dataMap);
            if (logger.isDebugEnabled()) {
                logger.debug("Searching indexed document: " + id);
            }
            // Fetch only the listed fields of the indexed document.
            final Map<String, Object> document = indexingHelper.getDocument(fessEsClient, id, new String[] { fessConfig.getIndexFieldId(), fessConfig.getIndexFieldLastModified(), fessConfig.getIndexFieldAnchor(), fessConfig.getIndexFieldSegment(), fessConfig.getIndexFieldExpires(), fessConfig.getIndexFieldClickCount(), fessConfig.getIndexFieldFavoriteCount() });
            if (document == null) {
                // No document under this id: queue the child URLs found for the id and crawl the URL.
                storeChildUrlsToQueue(urlQueue, getChildUrlSet(fessEsClient, id));
                return true;
            }
            // Expired documents are deleted from the index and crawled again.
            final Date expires = DocumentUtil.getValue(document, fessConfig.getIndexFieldExpires(), Date.class);
            if (expires != null && expires.getTime() < System.currentTimeMillis()) {
                final Object idValue = document.get(fessConfig.getIndexFieldId());
                if (idValue != null && !indexingHelper.deleteDocument(fessEsClient, idValue.toString())) {
                    logger.debug("Failed to delete expired document: " + url);
                }
                return true;
            }
            // Without an indexed last-modified value there is nothing to compare against.
            final Date lastModified = DocumentUtil.getValue(document, fessConfig.getIndexFieldLastModified(), Date.class);
            if (lastModified == null) {
                return true;
            }
            urlQueue.setLastModified(lastModified.getTime());
            log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);
            // Reuse the response from the SMB branch when there is one; otherwise issue the HEAD request now.
            if (responseData == null) {
                // head method
                responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                if (responseData == null) {
                    return true;
                }
            }
            final int httpStatusCode = responseData.getHttpStatusCode();
            if (logger.isDebugEnabled()) {
                logger.debug("Accessing document: " + url + ", status: " + httpStatusCode);
            }
            if (httpStatusCode == 404) {
                // Gone: queue the anchors stored with the indexed document, delete the document, skip the crawl.
                storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
                if (!indexingHelper.deleteDocument(fessEsClient, id)) {
                    logger.debug("Failed to delete 404 document: " + url);
                }
                return false;
            } else if (responseData.getLastModified() == null) {
                // The response carries no last-modified value, so crawl the URL.
                return true;
            } else if (responseData.getLastModified().getTime() <= lastModified.getTime() && httpStatusCode == 200) {
                // Not modified: report the response with the not-modified status, queue the
                // indexed anchors, and refresh the expiry field of the indexed document.
                log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
                responseData.setExecutionTime(System.currentTimeMillis() - startTime);
                responseData.setParentUrl(urlQueue.getParentUrl());
                responseData.setSessionId(crawlerContext.getSessionId());
                responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
                processResponse(urlQueue, responseData);
                storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
                final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
                if (documentExpires != null && !indexingHelper.updateDocument(fessEsClient, id, fessConfig.getIndexFieldExpires(), documentExpires)) {
                    logger.debug("Failed to update " + fessConfig.getIndexFieldExpires() + " at " + url);
                }
                return false;
            }
            // Any other combination falls through to the final return true.
        } finally {
            // Close the HEAD response on every exit path.
            if (responseData != null) {
                IOUtils.closeQuietly(responseData);
            }
        }
    }
    return true;
}
Also used : DocumentUtil(org.codelibs.fess.util.DocumentUtil) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) Date(java.util.Date) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) HashSet(java.util.HashSet) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Map(java.util.Map) LinkedHashSet(java.util.LinkedHashSet) SambaHelper(org.codelibs.fess.helper.SambaHelper) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) LogType(org.codelibs.fess.crawler.log.LogType) Logger(org.slf4j.Logger) FessEsClient(org.codelibs.fess.es.client.FessEsClient) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) StringUtil(org.codelibs.core.lang.StringUtil) SID(jcifs.smb.SID) Set(java.util.Set) ContentNotFoundException(org.codelibs.fess.exception.ContentNotFoundException) Collectors(java.util.stream.Collectors) IOUtils(org.apache.commons.io.IOUtils) List(java.util.List) ACE(jcifs.smb.ACE) RequestData(org.codelibs.fess.crawler.entity.RequestData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) SmbClient(org.codelibs.fess.crawler.client.smb.SmbClient) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) ACE(jcifs.smb.ACE) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) HashMap(java.util.HashMap) SambaHelper(org.codelibs.fess.helper.SambaHelper) FessEsClient(org.codelibs.fess.es.client.FessEsClient) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) ArrayList(java.util.ArrayList) 
FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Date(java.util.Date) SID(jcifs.smb.SID) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper)

Aggregations

CrawlingConfig (org.codelibs.fess.es.config.exentity.CrawlingConfig)8 FessConfig (org.codelibs.fess.mylasta.direction.FessConfig)6 ArrayList (java.util.ArrayList)5 HashMap (java.util.HashMap)4 List (java.util.List)4 Map (java.util.Map)4 StringUtil (org.codelibs.core.lang.StringUtil)4 Date (java.util.Date)3 HashSet (java.util.HashSet)3 Set (java.util.Set)3 StreamUtil.stream (org.codelibs.core.stream.StreamUtil.stream)3 Constants (org.codelibs.fess.Constants)3 ResponseData (org.codelibs.fess.crawler.entity.ResponseData)3 UrlQueue (org.codelibs.fess.crawler.entity.UrlQueue)3 CrawlingConfigHelper (org.codelibs.fess.helper.CrawlingConfigHelper)3 CrawlingInfoHelper (org.codelibs.fess.helper.CrawlingInfoHelper)3 ComponentUtil (org.codelibs.fess.util.ComponentUtil)3 Collections (java.util.Collections)2 ACE (jcifs.smb.ACE)2 SID (jcifs.smb.SID)2