Search in sources :

Example 1 with DocumentHelper

Use of org.codelibs.fess.helper.DocumentHelper in project fess by codelibs.

In class FessXpathTransformer, method putAdditionalData:

/**
 * Populates {@code dataMap} with the index fields derived from a crawled response and its
 * parsed {@code Document}.
 *
 * <p>The method first checks for a canonical URL; when one is found and it differs from the
 * response URL, processing is aborted by throwing {@code ChildUrlsException} carrying that
 * canonical URL. Otherwise fields such as config id, expiry, language, content, cache, digest,
 * host/site/filename, URL, timestamps, labels, roles, id, parent id and thumbnail are written
 * through {@code putResultDataBody}, followed by per-config XPath and fixed-value fields written
 * through {@code putResultDataWithTemplate}.
 *
 * <p>Statement order matters: {@code crawlingInfoHelper.generateId(dataMap)} is invoked on the
 * map as populated so far, and the URL field is temporarily replaced while the parent id is
 * generated.
 *
 * @param dataMap      map that receives the index fields; mutated in place
 * @param responseData crawler response providing URL, charset, MIME type, body, etc.
 * @param document     parsed document that the XPath expressions are evaluated against
 * @throws ChildUrlsException if a canonical URL is present and differs from the response URL
 */
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
    // canonical
    // NOTE: at this point "fessConfig" resolves to a member declared outside this method (not
    // visible in this excerpt); the local variable of the same name is only declared further down.
    if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentHtmlCanonicalXpath())) {
        final String canonicalUrl = getCanonicalUrl(responseData, document);
        if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl())) {
            // Hand the canonical URL back as the only child URL instead of indexing this response.
            final Set<RequestData> childUrlSet = new HashSet<>();
            childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
            throw new ChildUrlsException(childUrlSet, this.getClass().getName() + "#putAdditionalData(Map<String, Object>, ResponseData, Document)");
        }
    }
    // From here on "fessConfig" is this local, which shadows the outer member used above.
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    // sessionId is the canonical form; the crawling config lookup below uses the raw session id.
    final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
    final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
    final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
    final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
    String url = responseData.getUrl();
    // The indexing target is resolved from the original URL, before path mapping rewrites it.
    final String indexingTarget = crawlingConfig.getIndexingTarget(url);
    url = pathMappingHelper.replaceUrl(sessionId, url);
    final String mimeType = responseData.getMimeType();
    final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
    final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);
    // URL encoding: prefer the encoding recorded on the current URL queue entry, otherwise fall
    // back to the response charset.
    String urlEncoding;
    final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
    if (urlQueue != null && urlQueue.getEncoding() != null) {
        urlEncoding = urlQueue.getEncoding();
    } else {
        urlEncoding = responseData.getCharSet();
    }
    // cid
    final String configId = crawlingConfig.getConfigId();
    if (configId != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
    }
    //  expires
    if (documentExpires != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
    }
    // lang
    final String lang = systemHelper.normalizeLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
    if (lang != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
    }
    // title (no title field is written in this method)
    // content
    // "prunedContent" is not declared in this method; it comes from outside this excerpt.
    final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
    putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(responseData, body, dataMap));
    // Caching is attempted when it is enabled either per config (cache field set to TRUE) or
    // globally, and only for MIME types reported as supported.
    if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
        if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
            // Decode the raw body with the response charset, defaulting to UTF-8 when none is set.
            String charSet = responseData.getCharSet();
            if (charSet == null) {
                charSet = Constants.UTF_8;
            }
            try (final BufferedInputStream is = new BufferedInputStream(responseData.getResponseBody())) {
                // cache
                putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(is), charSet));
                putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
            } catch (final Exception e) {
                // Best effort: a cache failure is logged and the remaining fields are still set.
                logger.warn("Failed to write a cache: " + sessionId + ":" + responseData, e);
            }
        } else {
            // NOTE(review): this branch is also reached when the content length is <= 0, in which
            // case the "too large" wording of the message does not describe the actual reason.
            logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(), fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
        }
    }
    // digest
    // Use the value found via the digest XPath when it is non-blank; otherwise let the document
    // helper build a digest from the body.
    final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), false);
    if (StringUtil.isNotBlank(digest)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
    } else {
        putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
    }
    // segment
    putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
    // host
    putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url));
    // site
    putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
    // filename
    final String fileName = getFileName(url, urlEncoding);
    if (StringUtil.isNotBlank(fileName)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
    }
    // url
    putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    // created
    final Date now = systemHelper.getCurrentTime();
    putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
    // anchor
    putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
    // mimetype
    putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
    if (fileTypeHelper != null) {
        // filetype
        putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
    }
    // content_length
    putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
    // last_modified
    final Date lastModified = responseData.getLastModified();
    if (lastModified != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
    } else {
        // timestamp: no last-modified value, so the crawl time is used instead
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
    }
    // indexingTarget
    putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
    //  boost
    putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
    // label: labelType
    // Union of the labels configured on the crawling config and the labels matched for the
    // (path-mapped) URL.
    final Set<String> labelTypeSet = new HashSet<>();
    for (final String labelType : crawlingConfig.getLabelTypeValues()) {
        labelTypeSet.add(labelType);
    }
    labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
    putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
    // role: roleType
    final List<String> roleTypeList = new ArrayList<>();
    stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
    putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
    // id
    // generateId receives the map as populated up to this point.
    putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
    // parentId
    String parentUrl = responseData.getParentUrl();
    if (StringUtil.isNotBlank(parentUrl)) {
        parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
        // Temporarily store the parent URL in the URL field so that generateId(dataMap) yields the
        // parent's id (presumably the id depends on the URL field - confirm in CrawlingInfoHelper).
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
        // set again
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    }
    // thumbnail
    final String thumbnailUrl = getThumbnailUrl(responseData, document);
    if (StringUtil.isNotBlank(thumbnailUrl)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldThumbnail(), thumbnailUrl);
    }
    // from config
    final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
    // XPath config entries: every key that does not start with "default." is evaluated against
    // the document and stored together with the script template registered for the same key.
    xpathConfigMap.entrySet().stream().filter(e -> !e.getKey().startsWith("default.")).forEach(e -> {
        final String key = e.getKey();
        final String value = getSingleNodeValue(document, e.getValue(), true);
        putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key));
    });
    // Fixed-value config entries: stored as-is, again with the script template for the same key.
    crawlingConfig.getConfigParameterMap(ConfigName.VALUE).entrySet().stream().forEach(e -> {
        final String key = e.getKey();
        final String value = e.getValue();
        putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key));
    });
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) Constants(org.codelibs.fess.Constants) BufferedInputStream(java.io.BufferedInputStream) URL(java.net.URL) Date(java.util.Date) LoggerFactory(org.slf4j.LoggerFactory) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ConfigName(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Locale(java.util.Locale) DOMParser(org.cyberneko.html.parsers.DOMParser) Document(org.w3c.dom.Document) Map(java.util.Map) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) InputStreamUtil(org.codelibs.core.io.InputStreamUtil) ResultData(org.codelibs.fess.crawler.entity.ResultData) Set(java.util.Set) List(java.util.List) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) RequestData(org.codelibs.fess.crawler.entity.RequestData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) PostConstruct(javax.annotation.PostConstruct) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) XObject(org.apache.xpath.objects.XObject) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) TransformerException(javax.xml.transform.TransformerException) HashMap(java.util.HashMap) SerializeUtil(org.codelibs.core.io.SerializeUtil) CrawlingParameterUtil(org.codelibs.fess.crawler.util.CrawlingParameterUtil) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) XpathTransformer(org.codelibs.fess.crawler.transformer.impl.XpathTransformer) Node(org.w3c.dom.Node) 
PrunedTag(org.codelibs.fess.util.PrunedTag) NamedNodeMap(org.w3c.dom.NamedNodeMap) InputSource(org.xml.sax.InputSource) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) Logger(org.slf4j.Logger) NodeList(org.w3c.dom.NodeList) MalformedURLException(java.net.MalformedURLException) StringUtil(org.codelibs.core.lang.StringUtil) DuplicateHostHelper(org.codelibs.fess.helper.DuplicateHostHelper) ValueHolder(org.codelibs.core.misc.ValueHolder) SystemHelper(org.codelibs.fess.helper.SystemHelper) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Collections(java.util.Collections) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Date(java.util.Date) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) TransformerException(javax.xml.transform.TransformerException) MalformedURLException(java.net.MalformedURLException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) BufferedInputStream(java.io.BufferedInputStream) RequestData(org.codelibs.fess.crawler.entity.RequestData) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) HashSet(java.util.HashSet)

Example 2 with DocumentHelper

Use of org.codelibs.fess.helper.DocumentHelper in project fess by codelibs.

In class AbstractFessFileTransformer, method generateData:

/**
 * Extracts text and metadata from a crawled response and builds the map of index fields for it.
 *
 * <p>The response body is passed to the extractor obtained from {@code getExtractor}. Extracted
 * metadata is kept in a local map, optionally appended to the indexed content, and optionally
 * copied into the result map according to the configured metadata name mappings. The remaining
 * fields (config id, expiry, segment, content, cache, digest, title, host/site/filename, URL,
 * timestamps, labels, roles, language, id, parent id and config-driven fields) are then written
 * through {@code putResultDataBody} / {@code putResultDataWithTemplate}.
 *
 * <p>Statement order matters: the title is only set when no earlier step put a title into the
 * map, {@code crawlingInfoHelper.generateId(dataMap)} is invoked on the map as populated so far,
 * and the URL field is temporarily replaced while the parent id is generated.
 *
 * @param responseData crawler response providing URL, charset, MIME type, body, etc.
 * @return the populated field map, or {@code null} when empty content is configured to be
 *         ignored and the extracted content is blank
 * @throws CrawlingAccessException (log level WARN) wrapping any exception raised while reading
 *         or extracting the response body
 */
protected Map<String, Object> generateData(final ResponseData responseData) {
    final Extractor extractor = getExtractor(responseData);
    // Hints passed to the extractor: resource name, content type and content encoding.
    final Map<String, String> params = new HashMap<>();
    params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    final String mimeType = responseData.getMimeType();
    params.put(HttpHeaders.CONTENT_TYPE, mimeType);
    params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
    // Accumulates the metadata values that are to be included in the indexed content.
    final StringBuilder contentMetaBuf = new StringBuilder(1000);
    final Map<String, Object> dataMap = new HashMap<>();
    // Raw extracted metadata (key -> String[]), consulted again for the META config at the end.
    final Map<String, Object> metaDataMap = new HashMap<>();
    String content;
    try (final InputStream in = responseData.getResponseBody()) {
        final ExtractData extractData = getExtractData(extractor, in, params);
        content = extractData.getContent();
        // NOTE: inside this try block (including the lambda below) "fessConfig" resolves to a
        // member declared outside this method (not visible in this excerpt); the local variable of
        // the same name is only declared after the try block.
        if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
            return null;
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("ExtractData: " + extractData);
        }
        // meta
        //
        extractData.getKeySet().stream().filter(//
        k -> extractData.getValues(k) != null).forEach(key -> {
            final String[] values = extractData.getValues(key);
            metaDataMap.put(key, values);
            // Optionally fold this metadata entry into the content, space-separated.
            if (fessConfig.isCrawlerMetadataContentIncluded(key)) {
                final String joinedValue = StringUtils.join(values, ' ');
                if (StringUtil.isNotBlank(joinedValue)) {
                    if (contentMetaBuf.length() > 0) {
                        contentMetaBuf.append(' ');
                    }
                    contentMetaBuf.append(joinedValue.trim());
                }
            }
            // Optional mapping of this metadata key to a result field: first = target field name,
            // second = mapping type (array / string / long / double).
            final Pair<String, String> mapping = fessConfig.getCrawlerMetadataNameMapping(key);
            if (mapping != null) {
                if (Constants.MAPPING_TYPE_ARRAY.equalsIgnoreCase(mapping.getSecond())) {
                    dataMap.put(mapping.getFirst(), values);
                } else if (Constants.MAPPING_TYPE_STRING.equalsIgnoreCase(mapping.getSecond())) {
                    final String joinedValue = StringUtils.join(values, ' ');
                    dataMap.put(mapping.getFirst(), joinedValue.trim());
                } else if (values.length == 1) {
                    // Numeric mappings are only applied to single-valued metadata; entries with
                    // zero or several values fall through without being mapped or logged.
                    try {
                        if (Constants.MAPPING_TYPE_LONG.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), Long.parseLong(values[0]));
                        } else if (Constants.MAPPING_TYPE_DOUBLE.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), Double.parseDouble(values[0]));
                        } else {
                            logger.warn("Unknown mapping type: {}={}", key, mapping);
                        }
                    } catch (final NumberFormatException e) {
                        // An unparsable number is logged and the field is left unset.
                        logger.warn("Failed to parse " + values[0], e);
                    }
                }
            }
        });
    } catch (final Exception e) {
        // Any failure while reading/extracting is reported as a WARN-level access exception.
        final CrawlingAccessException rcae = new CrawlingAccessException("Could not get a text from " + responseData.getUrl(), e);
        rcae.setLogLevel(CrawlingAccessException.WARN);
        throw rcae;
    }
    if (content == null) {
        content = StringUtil.EMPTY;
    }
    final String contentMeta = contentMetaBuf.toString().trim();
    // From here on "fessConfig" is this local, which shadows the outer member used above.
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    // sessionId is the canonical form; the crawling config lookup below uses the raw session id.
    final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
    final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
    final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
    String url = responseData.getUrl();
    // The indexing target is resolved from the original URL, before path mapping rewrites it.
    final String indexingTarget = crawlingConfig.getIndexingTarget(url);
    url = pathMappingHelper.replaceUrl(sessionId, url);
    final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
    // URL encoding: prefer the encoding recorded on the current URL queue entry, otherwise fall
    // back to the response charset.
    String urlEncoding;
    final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
    if (urlQueue != null && urlQueue.getEncoding() != null) {
        urlEncoding = urlQueue.getEncoding();
    } else {
        urlEncoding = responseData.getCharSet();
    }
    // cid
    final String configId = crawlingConfig.getConfigId();
    if (configId != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
    }
    //  expires
    if (documentExpires != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
    }
    // segment
    putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
    // content
    // bodyBase = extracted text and/or collected metadata text, each included only when the
    // corresponding config flag is on, separated by a single space.
    final StringBuilder buf = new StringBuilder(content.length() + 1000);
    if (fessConfig.isCrawlerDocumentFileAppendBodyContent()) {
        buf.append(content);
    }
    if (fessConfig.isCrawlerDocumentFileAppendMetaContent()) {
        if (buf.length() > 0) {
            buf.append(' ');
        }
        buf.append(contentMeta);
    }
    final String bodyBase = buf.toString().trim();
    final String body = documentHelper.getContent(responseData, bodyBase, dataMap);
    putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
    // Caching is attempted when it is enabled either per config (cache field set to TRUE) or
    // globally, and only for MIME types reported as supported.
    if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
        if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
            // The cache holds the extracted text (not the raw body) with each run of spaces, tabs,
            // vertical tabs and form feeds collapsed to one space; line breaks are not in the
            // character class and are therefore kept.
            final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " ");
            // text cache
            putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache);
            putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
        }
    }
    // digest
    putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, bodyBase, dataMap, fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
    // title
    // Only set when no title is in the map yet (dataMap entries may already have been written by
    // the metadata name mappings above).
    final String fileName = getFileName(url, urlEncoding);
    if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
        if (url.endsWith("/")) {
            // URL ending with "/": use a digest of the body as title, or the configured
            // "no title" label when the extracted content is blank.
            if (StringUtil.isNotBlank(content)) {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
            } else {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
            }
        } else {
            // Otherwise use the file name, falling back to a name decoded from the URL.
            if (StringUtil.isBlank(fileName)) {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), decodeUrlAsName(url, url.startsWith("file:")));
            } else {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fileName);
            }
        }
    }
    // host
    putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHostOnFile(url));
    // site
    putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding));
    // filename
    if (StringUtil.isNotBlank(fileName)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
    }
    // url
    putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    // created
    final Date now = systemHelper.getCurrentTime();
    putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
    // TODO anchor
    // The anchor field is currently always written as an empty string.
    putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), StringUtil.EMPTY);
    // mimetype
    putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
    if (fileTypeHelper != null) {
        // filetype
        putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
    }
    // content_length
    putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
    // last_modified
    final Date lastModified = responseData.getLastModified();
    if (lastModified != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
    } else {
        // timestamp: no last-modified value, so the crawl time is used instead
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
    }
    // indexingTarget
    putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
    //  boost
    putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
    // label: labelType
    // Union of the labels configured on the crawling config and the labels matched for the
    // (path-mapped) URL.
    final Set<String> labelTypeSet = new HashSet<>();
    for (final String labelType : crawlingConfig.getLabelTypeValues()) {
        labelTypeSet.add(labelType);
    }
    final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
    labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
    putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
    // role: roleType
    // Starts from the roles returned by getRoleTypes and appends the config permissions.
    // NOTE(review): this requires getRoleTypes to return a mutable list - confirm.
    final List<String> roleTypeList = getRoleTypes(responseData);
    stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
    putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
    // lang
    if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentFileDefaultLang())) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), fessConfig.getCrawlerDocumentFileDefaultLang());
    }
    // id
    // generateId receives the map as populated up to this point.
    putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
    // parentId
    String parentUrl = responseData.getParentUrl();
    if (StringUtil.isNotBlank(parentUrl)) {
        parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
        // Temporarily store the parent URL in the URL field so that generateId(dataMap) yields the
        // parent's id (presumably the id depends on the URL field - confirm in CrawlingInfoHelper).
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
        // set again
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    }
    // from config
    final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
    // META config: each value is a comma-separated list of extracted-metadata names to copy into
    // the field named by the key. Names are used exactly as split (no trimming), and a name with
    // no extracted metadata yields a null value from metaDataMap.
    final Map<String, String> metaConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.META);
    for (final Map.Entry<String, String> entry : metaConfigMap.entrySet()) {
        final String key = entry.getKey();
        final String[] values = entry.getValue().split(",");
        for (final String value : values) {
            putResultDataWithTemplate(dataMap, key, metaDataMap.get(value), scriptConfigMap.get(key));
        }
    }
    // VALUE config: fixed values stored as-is, with the script template registered for the key.
    final Map<String, String> valueConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.VALUE);
    for (final Map.Entry<String, String> entry : valueConfigMap.entrySet()) {
        final String key = entry.getKey();
        putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key));
    }
    return dataMap;
}
Also used : CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) Constants(org.codelibs.fess.Constants) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) URLDecoder(java.net.URLDecoder) Date(java.util.Date) LoggerFactory(org.slf4j.LoggerFactory) Pair(org.codelibs.core.misc.Pair) HashMap(java.util.HashMap) TikaMetadataKeys(org.apache.tika.metadata.TikaMetadataKeys) SerializeUtil(org.codelibs.core.io.SerializeUtil) CrawlingParameterUtil(org.codelibs.fess.crawler.util.CrawlingParameterUtil) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) StringUtils(org.apache.commons.lang3.StringUtils) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ConfigName(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) Map(java.util.Map) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) AbstractTransformer(org.codelibs.fess.crawler.transformer.impl.AbstractTransformer) SambaHelper(org.codelibs.fess.helper.SambaHelper) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) Logger(org.slf4j.Logger) ResultData(org.codelibs.fess.crawler.entity.ResultData) Extractor(org.codelibs.fess.crawler.extractor.Extractor) StringUtil(org.codelibs.core.lang.StringUtil) SID(jcifs.smb.SID) Set(java.util.Set) List(java.util.List) ACE(jcifs.smb.ACE) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) HttpHeaders(org.apache.tika.metadata.HttpHeaders) 
SmbClient(org.codelibs.fess.crawler.client.smb.SmbClient) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) InputStream(java.io.InputStream) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) HashMap(java.util.HashMap) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) Extractor(org.codelibs.fess.crawler.extractor.Extractor) HashSet(java.util.HashSet) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) InputStream(java.io.InputStream) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) Date(java.util.Date) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

ArrayList (java.util.ArrayList)2 Date (java.util.Date)2 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 List (java.util.List)2 Map (java.util.Map)2 Set (java.util.Set)2 SerializeUtil (org.codelibs.core.io.SerializeUtil)2 StringUtil (org.codelibs.core.lang.StringUtil)2 StreamUtil.stream (org.codelibs.core.stream.StreamUtil.stream)2 Constants (org.codelibs.fess.Constants)2 AccessResultData (org.codelibs.fess.crawler.entity.AccessResultData)2 ResponseData (org.codelibs.fess.crawler.entity.ResponseData)2 ResultData (org.codelibs.fess.crawler.entity.ResultData)2 UrlQueue (org.codelibs.fess.crawler.entity.UrlQueue)2 CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException)2 CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException)2 CrawlingParameterUtil (org.codelibs.fess.crawler.util.CrawlingParameterUtil)2 CrawlingConfig (org.codelibs.fess.es.config.exentity.CrawlingConfig)2 ConfigName (org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName)2