Search in sources:

Example 1 with CrawlingConfigHelper

Use of org.codelibs.fess.helper.CrawlingConfigHelper in project fess by codelibs.

The class BaseThumbnailGenerator, method process.

/**
 * Fetches the content behind a thumbnail target and hands the response to {@code consumer}.
 * <p>
 * Delegates to the two-argument callback form of {@code process}; the callback receives the
 * crawling config id and the target URL. Redirect locations reported by the client are followed
 * for at most {@code maxRedirectCount} requests.
 *
 * @param id identifier passed through to the delegating {@code process} overload
 * @param consumer predicate applied to the final (non-redirect) response; its result is returned
 * @return the value returned by the delegating {@code process} overload
 * @throws ThumbnailGenerationException if no crawling config or crawler client is available,
 *         the response URL is blank, the request fails, or the redirect limit is exhausted
 */
protected boolean process(final String id, final Predicate<ResponseData> consumer) {
    return process(id, (configId, url) -> {
        final CrawlingConfig crawlingConfig = ComponentUtil.getCrawlingConfigHelper().getCrawlingConfig(configId);
        if (crawlingConfig == null) {
            throw new ThumbnailGenerationException("No CrawlingConfig: " + configId);
        }
        if (logger.isInfoEnabled()) {
            logger.info("Generating Thumbnail: {}", url);
        }
        final CrawlerClient crawlerClient =
                crawlingConfig.initializeClientFactory(() -> ComponentUtil.getComponent(CrawlerClientFactory.class)).getClient(url);
        if (crawlerClient == null) {
            throw new ThumbnailGenerationException("No CrawlerClient: " + configId + ", url: " + url);
        }
        // The URL to request next; replaced whenever the response reports a redirect location.
        String targetUrl = url;
        for (int attempt = 0; attempt < maxRedirectCount; attempt++) {
            try (final ResponseData response = crawlerClient.execute(RequestDataBuilder.newRequestData().get().url(targetUrl).build())) {
                if (StringUtil.isNotBlank(response.getRedirectLocation())) {
                    // Redirected: remember the new location and issue another request.
                    targetUrl = response.getRedirectLocation();
                } else {
                    if (StringUtil.isBlank(response.getUrl())) {
                        throw new ThumbnailGenerationException(
                                "Failed to process a thumbnail content: " + url + " (Response URL is empty)");
                    }
                    return consumer.test(response);
                }
            } catch (final CrawlingAccessException e) {
                // The cause (and thus the stack trace) is only attached when debug logging is on.
                if (logger.isDebugEnabled()) {
                    throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url, e);
                }
                throw new ThumbnailGenerationException(e.getMessage());
            } catch (final Exception e) {
                throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url, e);
            }
        }
        // Every permitted request ended in another redirect.
        throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url + " (Redirect Loop)");
    });
}
Also used : CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ThumbnailGenerationException(org.codelibs.fess.exception.ThumbnailGenerationException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) ThumbnailGenerationException(org.codelibs.fess.exception.ThumbnailGenerationException)

Example 2 with CrawlingConfigHelper

Use of org.codelibs.fess.helper.CrawlingConfigHelper in project fess by codelibs.

The class FessXpathTransformer, method putAdditionalData.

/**
 * Fills {@code dataMap} with the index fields for a crawled HTML document.
 * <p>
 * Values come from the response, from XPath lookups against {@code document}, and from the
 * crawling configuration looked up by the response's session id. If the document declares a
 * canonical URL that differs from the response URL and passes the validity checks, no fields
 * are written; a {@link ChildUrlsException} carrying the canonical URL is thrown instead.
 * <p>
 * The order of the writes matters: the document id and the parent id are generated from the
 * state of {@code dataMap} at the point where {@code generateId} is called.
 *
 * @param dataMap map receiving the index field values
 * @param responseData crawler response for the document
 * @param document parsed DOM of the response body
 * @throws ChildUrlsException if a valid canonical URL different from the response URL is found
 */
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
    // canonical: hand the canonical URL back as the only child URL instead of indexing this page
    final String canonicalUrl = getCanonicalUrl(responseData, document);
    if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl) && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
        final Set<RequestData> childUrlSet = new HashSet<>();
        childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
        logger.info("CANONICAL: {} -> {}", responseData.getUrl(), canonicalUrl);
        throw new ChildUrlsException(childUrlSet, this.getClass().getName() + "#putAdditionalData");
    }
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    // The config is looked up with the raw session id, not the canonical one computed above.
    final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
    final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
    final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
    String url = responseData.getUrl();
    // The indexing target is resolved from the original URL, before path mapping rewrites it.
    final String indexingTarget = crawlingConfig.getIndexingTarget(url);
    url = pathMappingHelper.replaceUrl(sessionId, url);
    final String mimeType = responseData.getMimeType();
    final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
    final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);
    // Prefer the encoding recorded on the URL queue entry; fall back to the response charset.
    String urlEncoding;
    final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
    if (urlQueue != null && urlQueue.getEncoding() != null) {
        urlEncoding = urlQueue.getEncoding();
    } else {
        urlEncoding = responseData.getCharSet();
    }
    // cid
    final String configId = crawlingConfig.getConfigId();
    if (configId != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
    }
    // expires
    if (documentExpires != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
    }
    // lang
    final String lang = systemHelper.normalizeHtmlLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
    if (lang != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
    }
    // title (no title field is written by this method)
    // content
    final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
    putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(crawlingConfig, responseData, body, dataMap));
    // Caching applies when the cache field is enabled in the field config or caching is enabled
    // globally, and only for mime types the configuration reports as supported.
    if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
        if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
            String charSet = responseData.getCharSet();
            if (charSet == null) {
                charSet = Constants.UTF_8;
            }
            try (final BufferedInputStream is = new BufferedInputStream(responseData.getResponseBody())) {
                // cache: the raw response body decoded with the response charset
                putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(is), charSet));
                putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
            } catch (final Exception e) {
                // A cache failure is logged and otherwise ignored; the remaining fields are still written.
                logger.warn("Failed to write a cache: {}:{}", sessionId, responseData, e);
            }
        } else {
            // NOTE(review): this branch is also taken when the content length is not positive,
            // although the message only mentions the "too large" case.
            logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(), fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
        }
    }
    // digest: use the XPath-selected digest when present, otherwise derive one from the body
    final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), false);
    if (StringUtil.isNotBlank(digest)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
    } else {
        putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
    }
    // segment
    putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
    // host
    putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url));
    // site
    putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
    // filename
    final String fileName = getFileName(url, urlEncoding);
    if (StringUtil.isNotBlank(fileName)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
    }
    // url
    putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    // created
    final Date now = systemHelper.getCurrentTime();
    putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
    // anchor
    putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
    // mimetype
    putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
    if (fileTypeHelper != null) {
        // filetype
        putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
    }
    // content_length
    putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
    // last_modified
    final Date lastModified = responseData.getLastModified();
    if (lastModified != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
    } else {
        // timestamp: no last-modified value, so the creation time is used
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
    }
    // indexingTarget
    putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
    // boost
    putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
    // label: labelType
    putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeHelper.getMatchedLabelValueSet(url));
    // role: roleType (copied from the permissions of the crawling config)
    final List<String> roleTypeList = new ArrayList<>();
    stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
    putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
    // virtualHosts (blank entries are dropped)
    putResultDataBody(dataMap, fessConfig.getIndexFieldVirtualHost(), stream(crawlingConfig.getVirtualHosts()).get(stream -> stream.filter(StringUtil::isNotBlank).collect(Collectors.toList())));
    // id: generated from the fields written to dataMap so far
    putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
    // parentId
    String parentUrl = responseData.getParentUrl();
    if (StringUtil.isNotBlank(parentUrl)) {
        parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
        // Temporarily store the parent URL in the URL field so generateId yields the parent's id
        // (presumably generateId reads the URL field of dataMap — confirm against CrawlingInfoHelper).
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
        // set again: restore this document's own URL
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    }
    // thumbnail
    final String thumbnailUrl = getThumbnailUrl(responseData, document);
    if (StringUtil.isNotBlank(thumbnailUrl)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldThumbnail(), thumbnailUrl);
    }
    // from config
    final String scriptType = crawlingConfig.getScriptType();
    final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
    // XPath entries whose key starts with "default." are skipped here; every other entry is
    // evaluated against the document and stored under its key.
    xpathConfigMap.entrySet().stream().filter(e -> !e.getKey().startsWith("default.")).forEach(e -> {
        final String key = e.getKey();
        final String value = getSingleNodeValue(document, e.getValue(), true);
        putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
    });
    // VALUE entries are stored as configured, with the script template registered for the same key.
    crawlingConfig.getConfigParameterMap(ConfigName.VALUE).entrySet().stream().forEach(e -> {
        final String key = e.getKey();
        final String value = e.getValue();
        putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
    });
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) Constants(org.codelibs.fess.Constants) BufferedInputStream(java.io.BufferedInputStream) URL(java.net.URL) Date(java.util.Date) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ConfigName(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Locale(java.util.Locale) Document(org.w3c.dom.Document) Map(java.util.Map) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) Config(org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.Config) InputStreamUtil(org.codelibs.core.io.InputStreamUtil) ResultData(org.codelibs.fess.crawler.entity.ResultData) Set(java.util.Set) Collectors(java.util.stream.Collectors) List(java.util.List) Logger(org.apache.logging.log4j.Logger) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) RequestData(org.codelibs.fess.crawler.entity.RequestData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) PostConstruct(javax.annotation.PostConstruct) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) XObject(org.apache.xpath.objects.XObject) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) TransformerException(javax.xml.transform.TransformerException) HashMap(java.util.HashMap) SerializeUtil(org.codelibs.core.io.SerializeUtil) CrawlingParameterUtil(org.codelibs.fess.crawler.util.CrawlingParameterUtil) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) 
XpathTransformer(org.codelibs.fess.crawler.transformer.impl.XpathTransformer) Node(org.w3c.dom.Node) PrunedTag(org.codelibs.fess.util.PrunedTag) NamedNodeMap(org.w3c.dom.NamedNodeMap) InputSource(org.xml.sax.InputSource) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) XPath(org.codelibs.fess.es.config.exentity.CrawlingConfig.Param.XPath) NodeList(org.w3c.dom.NodeList) MalformedURLException(java.net.MalformedURLException) StringUtil(org.codelibs.core.lang.StringUtil) DuplicateHostHelper(org.codelibs.fess.helper.DuplicateHostHelper) ValueHolder(org.codelibs.core.misc.ValueHolder) SystemHelper(org.codelibs.fess.helper.SystemHelper) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) DOMParser(org.codelibs.nekohtml.parsers.DOMParser) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Collections(java.util.Collections) LogManager(org.apache.logging.log4j.LogManager) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Date(java.util.Date) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) TransformerException(javax.xml.transform.TransformerException) MalformedURLException(java.net.MalformedURLException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) BufferedInputStream(java.io.BufferedInputStream) RequestData(org.codelibs.fess.crawler.entity.RequestData) 
CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) HashSet(java.util.HashSet)

Example 3 with CrawlingConfigHelper

Use of org.codelibs.fess.helper.CrawlingConfigHelper in project fess by codelibs.

The class AbstractFessFileTransformer, method generateData.

/**
 * Extracts text and metadata from the response body and builds the index field map for a file.
 * <p>
 * The extractor chosen for the response produces the content and the metadata values. Metadata
 * is collected into a local map, optionally appended to the indexed content, and optionally
 * mapped onto named fields. The remaining fields come from the response and from the crawling
 * configuration looked up by the response's session id.
 * <p>
 * The order of the writes matters: the document id and the parent id are generated from the
 * state of the map at the point where {@code generateId} is called.
 *
 * @param responseData crawler response whose body is extracted
 * @return the populated field map, or {@code null} when the extracted content is blank and
 *         ignoring empty content is enabled in the configuration
 * @throws CrawlingAccessException (log level WARN) if reading or extracting the body fails
 */
protected Map<String, Object> generateData(final ResponseData responseData) {
    final Extractor extractor = getExtractor(responseData);
    // Parameters passed to the extractor: resource name, content type and encoding.
    final Map<String, String> params = new HashMap<>();
    params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    final String mimeType = responseData.getMimeType();
    params.put(HttpHeaders.CONTENT_TYPE, mimeType);
    params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
    // Accumulates the metadata values that are to be included in the indexed content.
    final StringBuilder contentMetaBuf = new StringBuilder(1000);
    final Map<String, Object> dataMap = new HashMap<>();
    // Raw metadata values by key; read again for the META config entries at the end.
    final Map<String, Object> metaDataMap = new HashMap<>();
    String content;
    try (final InputStream in = responseData.getResponseBody()) {
        final ExtractData extractData = getExtractData(extractor, in, params);
        content = extractData.getContent();
        // NOTE(review): fessConfig here (and inside the lambda below) is resolved outside this
        // method — presumably a field — because the local of the same name is declared later.
        if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
            return null;
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("ExtractData: " + extractData);
        }
        // meta
        //
        extractData.getKeySet().stream().filter(//
        k -> extractData.getValues(k) != null).forEach(key -> {
            final String[] values = extractData.getValues(key);
            metaDataMap.put(key, values);
            // Append the joined values to the meta content when the key is configured for inclusion.
            if (fessConfig.isCrawlerMetadataContentIncluded(key)) {
                final String joinedValue = StringUtils.join(values, ' ');
                if (StringUtil.isNotBlank(joinedValue)) {
                    if (contentMetaBuf.length() > 0) {
                        contentMetaBuf.append(' ');
                    }
                    contentMetaBuf.append(joinedValue.trim());
                }
            }
            // Optional name mapping: first = target field name, second = mapping type.
            final Pair<String, String> mapping = fessConfig.getCrawlerMetadataNameMapping(key);
            if (mapping != null) {
                if (Constants.MAPPING_TYPE_ARRAY.equalsIgnoreCase(mapping.getSecond())) {
                    dataMap.put(mapping.getFirst(), values);
                } else if (Constants.MAPPING_TYPE_STRING.equalsIgnoreCase(mapping.getSecond())) {
                    final String joinedValue = StringUtils.join(values, ' ');
                    dataMap.put(mapping.getFirst(), joinedValue.trim());
                } else if (values.length == 1) {
                    // Numeric mappings are only applied to single-valued metadata; multi-valued
                    // metadata with a non-array, non-string mapping type is silently skipped.
                    try {
                        if (Constants.MAPPING_TYPE_LONG.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), Long.parseLong(values[0]));
                        } else if (Constants.MAPPING_TYPE_DOUBLE.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), Double.parseDouble(values[0]));
                        } else {
                            logger.warn("Unknown mapping type: {}={}", key, mapping);
                        }
                    } catch (final NumberFormatException e) {
                        // An unparsable number is logged and the mapping is skipped.
                        logger.warn("Failed to parse " + values[0], e);
                    }
                }
            }
        });
    } catch (final Exception e) {
        // Any failure while reading or extracting is reported as a WARN-level access exception.
        final CrawlingAccessException rcae = new CrawlingAccessException("Could not get a text from " + responseData.getUrl(), e);
        rcae.setLogLevel(CrawlingAccessException.WARN);
        throw rcae;
    }
    if (content == null) {
        content = StringUtil.EMPTY;
    }
    final String contentMeta = contentMetaBuf.toString().trim();
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    // The config is looked up with the raw session id, not the canonical one computed above.
    final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
    final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
    final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
    String url = responseData.getUrl();
    // The indexing target is resolved from the original URL, before path mapping rewrites it.
    final String indexingTarget = crawlingConfig.getIndexingTarget(url);
    url = pathMappingHelper.replaceUrl(sessionId, url);
    final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
    // Prefer the encoding recorded on the URL queue entry; fall back to the response charset.
    String urlEncoding;
    final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
    if (urlQueue != null && urlQueue.getEncoding() != null) {
        urlEncoding = urlQueue.getEncoding();
    } else {
        urlEncoding = responseData.getCharSet();
    }
    // cid
    final String configId = crawlingConfig.getConfigId();
    if (configId != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
    }
    //  expires
    if (documentExpires != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
    }
    // segment
    putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
    // content: extracted body and/or collected metadata, depending on configuration
    final StringBuilder buf = new StringBuilder(content.length() + 1000);
    if (fessConfig.isCrawlerDocumentFileAppendBodyContent()) {
        buf.append(content);
    }
    if (fessConfig.isCrawlerDocumentFileAppendMetaContent()) {
        if (buf.length() > 0) {
            buf.append(' ');
        }
        buf.append(contentMeta);
    }
    final String bodyBase = buf.toString().trim();
    final String body = documentHelper.getContent(responseData, bodyBase, dataMap);
    putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
    // Caching applies when the cache field is enabled in the field config or caching is enabled
    // globally, and only for mime types the configuration reports as supported.
    if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
        if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
            // Collapse runs of horizontal whitespace into one space; line breaks are left alone.
            final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " ");
            // text cache
            putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache);
            putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
        }
    }
    // digest
    putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, bodyBase, dataMap, fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
    // title: only set when the metadata mapping above has not already supplied one
    final String fileName = getFileName(url, urlEncoding);
    if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
        if (url.endsWith("/")) {
            // URL ends with "/": use a digest of the body, or the configured no-title label.
            if (StringUtil.isNotBlank(content)) {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
            } else {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
            }
        } else {
            // Otherwise use the file name, falling back to a name decoded from the URL.
            if (StringUtil.isBlank(fileName)) {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), decodeUrlAsName(url, url.startsWith("file:")));
            } else {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fileName);
            }
        }
    }
    // host
    putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHostOnFile(url));
    // site
    putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding));
    // filename
    if (StringUtil.isNotBlank(fileName)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
    }
    // url
    putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    // created
    final Date now = systemHelper.getCurrentTime();
    putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
    // TODO anchor (currently always written as an empty string)
    putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), StringUtil.EMPTY);
    // mimetype
    putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
    if (fileTypeHelper != null) {
        // filetype
        putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
    }
    // content_length
    putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
    // last_modified
    final Date lastModified = responseData.getLastModified();
    if (lastModified != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
    } else {
        // timestamp: no last-modified value, so the creation time is used
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
    }
    // indexingTarget
    putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
    //  boost
    putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
    // label: labelType (labels from the crawling config plus labels matched for the URL)
    final Set<String> labelTypeSet = new HashSet<>();
    for (final String labelType : crawlingConfig.getLabelTypeValues()) {
        labelTypeSet.add(labelType);
    }
    final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
    labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
    putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
    // role: roleType (roles from getRoleTypes plus the permissions of the crawling config)
    final List<String> roleTypeList = getRoleTypes(responseData);
    stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
    putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
    // lang
    if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentFileDefaultLang())) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), fessConfig.getCrawlerDocumentFileDefaultLang());
    }
    // id: generated from the fields written to dataMap so far
    putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
    // parentId
    String parentUrl = responseData.getParentUrl();
    if (StringUtil.isNotBlank(parentUrl)) {
        parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
        // Temporarily store the parent URL in the URL field so generateId yields the parent's id
        // (presumably generateId reads the URL field of dataMap — confirm against CrawlingInfoHelper).
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
        // set again: restore this document's own URL
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    }
    // from config
    final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
    final Map<String, String> metaConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.META);
    // META entries: the value is a comma-separated list of metadata keys; each one is looked up
    // in the metadata collected during extraction and stored under the entry's key.
    for (final Map.Entry<String, String> entry : metaConfigMap.entrySet()) {
        final String key = entry.getKey();
        final String[] values = entry.getValue().split(",");
        for (final String value : values) {
            putResultDataWithTemplate(dataMap, key, metaDataMap.get(value), scriptConfigMap.get(key));
        }
    }
    // VALUE entries are stored as configured, with the script template registered for the same key.
    final Map<String, String> valueConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.VALUE);
    for (final Map.Entry<String, String> entry : valueConfigMap.entrySet()) {
        final String key = entry.getKey();
        putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key));
    }
    return dataMap;
}
Also used : CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) Constants(org.codelibs.fess.Constants) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) URLDecoder(java.net.URLDecoder) Date(java.util.Date) LoggerFactory(org.slf4j.LoggerFactory) Pair(org.codelibs.core.misc.Pair) HashMap(java.util.HashMap) TikaMetadataKeys(org.apache.tika.metadata.TikaMetadataKeys) SerializeUtil(org.codelibs.core.io.SerializeUtil) CrawlingParameterUtil(org.codelibs.fess.crawler.util.CrawlingParameterUtil) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) StringUtils(org.apache.commons.lang3.StringUtils) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ConfigName(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) Map(java.util.Map) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) AbstractTransformer(org.codelibs.fess.crawler.transformer.impl.AbstractTransformer) SambaHelper(org.codelibs.fess.helper.SambaHelper) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) Logger(org.slf4j.Logger) ResultData(org.codelibs.fess.crawler.entity.ResultData) Extractor(org.codelibs.fess.crawler.extractor.Extractor) StringUtil(org.codelibs.core.lang.StringUtil) SID(jcifs.smb.SID) Set(java.util.Set) List(java.util.List) ACE(jcifs.smb.ACE) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) HttpHeaders(org.apache.tika.metadata.HttpHeaders) 
SmbClient(org.codelibs.fess.crawler.client.smb.SmbClient) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) InputStream(java.io.InputStream) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) HashMap(java.util.HashMap) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) Extractor(org.codelibs.fess.crawler.extractor.Extractor) HashSet(java.util.HashSet) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) InputStream(java.io.InputStream) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) Date(java.util.Date) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) HashMap(java.util.HashMap) Map(java.util.Map)

Example 4 with CrawlingConfigHelper

Use of org.codelibs.fess.helper.CrawlingConfigHelper in project fess by codelibs.

The class FessCrawlerThread, method isContentUpdated.

/**
 * Decides whether the URL in the given queue entry has to be processed again, by comparing
 * the already-indexed document with the result of a HEAD request.
 *
 * <p>When incremental crawling is disabled in the Fess config, the method returns
 * {@code true} immediately. Otherwise it:
 * <ol>
 * <li>builds the document ID from the URL and the role list (config permissions plus,
 * for smb/smb1/file/ftp URLs with role-from-file enabled, roles taken from a HEAD response);</li>
 * <li>looks the document up in the index; if it is missing, expired, or has no
 * last-modified value, returns {@code true};</li>
 * <li>issues a HEAD request (unless already done in step 1); on HTTP 404 it deletes the
 * indexed document and returns {@code false};</li>
 * <li>if the response is not newer than the indexed document and the status is 200,
 * records the URL as not modified, re-queues the stored anchors, refreshes the expiry
 * field and returns {@code false}.</li>
 * </ol>
 * Any response obtained along the way is closed in the {@code finally} block.
 *
 * @param client crawler client used to execute the HEAD request
 * @param urlQueue queue entry holding the URL to check; its last-modified value is set
 *        from the indexed document when one is found
 * @return {@code false} when the indexed document was found to be gone (404) or unchanged;
 *         {@code true} in every other case (presumably meaning the caller should crawl
 *         the URL; confirm against the superclass contract)
 */
@Override
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
    if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
        // Captured up front so the execution time reported for a not-modified response
        // covers the whole check.
        final long startTime = System.currentTimeMillis();
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
        final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
        final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
        final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
        final String url = urlQueue.getUrl();
        // Stays null until a HEAD request has been made; closed in the finally block.
        ResponseData responseData = null;
        try {
            // NOTE(review): crawlingConfig is dereferenced below without a null check;
            // confirm get() never returns null for an active session.
            final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
            // dataMap holds the fields (URL and roles) from which the document ID is generated.
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.put(fessConfig.getIndexFieldUrl(), url);
            final List<String> roleTypeList = new ArrayList<>();
            // Start with the permissions configured on the crawling config.
            stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
            if (url.startsWith("smb:") || url.startsWith("smb1:") || url.startsWith("file:") || url.startsWith("ftp:")) {
                if (url.endsWith("/")) {
                    // A trailing slash is treated as a directory: always reported as updated.
                    return true;
                }
                final PermissionHelper permissionHelper = ComponentUtil.getPermissionHelper();
                if (fessConfig.isSmbRoleFromFile() || fessConfig.isFileRoleFromFile() || fessConfig.isFtpRoleFromFile()) {
                    // Roles come from the resource itself, so the HEAD request is needed
                    // before the document ID can be computed.
                    responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                    if (responseData == null) {
                        return true;
                    }
                    roleTypeList.addAll(permissionHelper.getSmbRoleTypeList(responseData));
                    roleTypeList.addAll(permissionHelper.getFileRoleTypeList(responseData));
                    roleTypeList.addAll(permissionHelper.getFtpRoleTypeList(responseData));
                }
            }
            dataMap.put(fessConfig.getIndexFieldRole(), roleTypeList);
            final String id = crawlingInfoHelper.generateId(dataMap);
            if (logger.isDebugEnabled()) {
                logger.debug("Searching indexed document: {}", id);
            }
            // Fetch only the fields listed here from the indexed document.
            final Map<String, Object> document = indexingHelper.getDocument(searchEngineClient, id, new String[] { fessConfig.getIndexFieldId(), fessConfig.getIndexFieldLastModified(), fessConfig.getIndexFieldAnchor(), fessConfig.getIndexFieldSegment(), fessConfig.getIndexFieldExpires(), fessConfig.getIndexFieldClickCount(), fessConfig.getIndexFieldFavoriteCount() });
            if (document == null) {
                // Not indexed under this ID: queue whatever child URLs are known for it
                // and report the URL as updated.
                storeChildUrlsToQueue(urlQueue, getChildUrlSet(searchEngineClient, id));
                return true;
            }
            final Date expires = DocumentUtil.getValue(document, fessConfig.getIndexFieldExpires(), Date.class);
            if (expires != null && expires.getTime() < System.currentTimeMillis()) {
                // Expired document: delete it from the index (a failed delete is only
                // logged at debug level) and report the URL as updated.
                final Object idValue = document.get(fessConfig.getIndexFieldId());
                if (idValue != null && !indexingHelper.deleteDocument(searchEngineClient, idValue.toString())) {
                    logger.debug("Failed to delete expired document: {}", url);
                }
                return true;
            }
            final Date lastModified = DocumentUtil.getValue(document, fessConfig.getIndexFieldLastModified(), Date.class);
            if (lastModified == null) {
                // Nothing to compare against.
                return true;
            }
            urlQueue.setLastModified(lastModified.getTime());
            log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);
            if (responseData == null) {
                // No HEAD request was made in the role-from-file branch above; make it now.
                responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                if (responseData == null) {
                    return true;
                }
            }
            final int httpStatusCode = responseData.getHttpStatusCode();
            if (logger.isDebugEnabled()) {
                logger.debug("Accessing document: {}, status: {}", url, httpStatusCode);
            }
            if (httpStatusCode == 404) {
                // Resource is gone: re-queue the anchors stored with the document, then
                // remove the document from the index.
                storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
                if (!indexingHelper.deleteDocument(searchEngineClient, id)) {
                    logger.debug("Failed to delete 404 document: {}", url);
                }
                return false;
            }
            if (responseData.getLastModified() == null) {
                // Response carries no last-modified value, so a change cannot be ruled out.
                return true;
            }
            if (responseData.getLastModified().getTime() <= lastModified.getTime() && httpStatusCode == 200) {
                // Not newer than the indexed copy: record a not-modified response instead
                // of reporting the content as updated.
                log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
                responseData.setExecutionTime(System.currentTimeMillis() - startTime);
                responseData.setParentUrl(urlQueue.getParentUrl());
                responseData.setSessionId(crawlerContext.getSessionId());
                responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
                processResponse(urlQueue, responseData);
                // Re-queue the anchors stored with the indexed document.
                storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
                // Refresh the expiry field of the indexed document when the crawling
                // config yields an expiry date.
                final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
                if (documentExpires != null && !indexingHelper.updateDocument(searchEngineClient, id, fessConfig.getIndexFieldExpires(), documentExpires)) {
                    logger.debug("Failed to update {} at {}", fessConfig.getIndexFieldExpires(), url);
                }
                return false;
            }
        } finally {
            // Runs on every exit path above, including the early returns.
            if (responseData != null) {
                CloseableUtil.closeQuietly(responseData);
            }
        }
    }
    // Incremental crawling disabled, or the response was newer / not a plain 200.
    return true;
}
Also used : DocumentUtil(org.codelibs.fess.util.DocumentUtil) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) Date(java.util.Date) HashMap(java.util.HashMap) SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient) PermissionHelper(org.codelibs.fess.helper.PermissionHelper) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) HashSet(java.util.HashSet) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Map(java.util.Map) LinkedHashSet(java.util.LinkedHashSet) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) LogType(org.codelibs.fess.crawler.log.LogType) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) StringUtil(org.codelibs.core.lang.StringUtil) Set(java.util.Set) ContentNotFoundException(org.codelibs.fess.exception.ContentNotFoundException) DuplicateHostHelper(org.codelibs.fess.helper.DuplicateHostHelper) Collectors(java.util.stream.Collectors) CloseableUtil(org.codelibs.core.io.CloseableUtil) List(java.util.List) Logger(org.apache.logging.log4j.Logger) RequestData(org.codelibs.fess.crawler.entity.RequestData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) LogManager(org.apache.logging.log4j.LogManager) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) HashMap(java.util.HashMap) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient) ArrayList(java.util.ArrayList) 
FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Date(java.util.Date) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) PermissionHelper(org.codelibs.fess.helper.PermissionHelper)

Example 5 with CrawlingConfigHelper

use of org.codelibs.fess.helper.CrawlingConfigHelper in project fess by codelibs.

the class FessXpathTransformer method getConfigPrameterMap.

/**
 * Returns the parameter map of the given config name, taken from the crawling config
 * registered for the session ID of the response.
 *
 * @param responseData response whose session ID selects the crawling config
 * @param config name of the config section whose parameters are requested
 * @return the map returned by the crawling config for {@code config}
 */
protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
    // Single call chain: the helper is resolved first, then the session ID is read,
    // then the parameter map is fetched from the resulting crawling config.
    return ComponentUtil.getCrawlingConfigHelper()
            .get(responseData.getSessionId())
            .getConfigParameterMap(config);
}
Also used : CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig)

Aggregations

CrawlingConfig (org.codelibs.fess.es.config.exentity.CrawlingConfig)5 CrawlingConfigHelper (org.codelibs.fess.helper.CrawlingConfigHelper)5 ResponseData (org.codelibs.fess.crawler.entity.ResponseData)4 ArrayList (java.util.ArrayList)3 Date (java.util.Date)3 HashMap (java.util.HashMap)3 HashSet (java.util.HashSet)3 List (java.util.List)3 Map (java.util.Map)3 Set (java.util.Set)3 StringUtil (org.codelibs.core.lang.StringUtil)3 StreamUtil.stream (org.codelibs.core.stream.StreamUtil.stream)3 UrlQueue (org.codelibs.fess.crawler.entity.UrlQueue)3 Collectors (java.util.stream.Collectors)2 LogManager (org.apache.logging.log4j.LogManager)2 Logger (org.apache.logging.log4j.Logger)2 SerializeUtil (org.codelibs.core.io.SerializeUtil)2 Constants (org.codelibs.fess.Constants)2 RequestDataBuilder (org.codelibs.fess.crawler.builder.RequestDataBuilder)2 CrawlerClient (org.codelibs.fess.crawler.client.CrawlerClient)2