Search in sources :

Example 6 with CrawlingConfig

use of org.codelibs.fess.es.config.exentity.CrawlingConfig in project fess by codelibs.

The method generateData of the class AbstractFessFileTransformer.

/**
 * Builds the field map for one crawled file response.
 *
 * <p>The response body is passed to the extractor to obtain the text content and
 * its metadata. The resulting map is then filled with the extracted content,
 * values mapped from metadata, crawling-config values (config id, expiry, boost,
 * labels, permissions), URL-derived values (title, host, site, filename),
 * timestamps, the generated id / parent id, and finally the values configured
 * through the META and VALUE config parameters.</p>
 *
 * @param responseData the crawled response whose body, URL, MIME type, charset,
 *        session id, content length, last-modified date and parent URL are read
 * @return the populated field map, or {@code null} when the extracted content is
 *         blank and {@code isCrawlerDocumentFileIgnoreEmptyContent()} is enabled
 * @throws CrawlingAccessException (log level WARN) wrapping any exception raised
 *         while reading the body, extracting, or processing metadata
 */
protected Map<String, Object> generateData(final ResponseData responseData) {
    final Extractor extractor = getExtractor(responseData);
    // Parameters handed to the extractor: resource name, MIME type and charset of the response.
    final Map<String, String> params = new HashMap<>();
    params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    final String mimeType = responseData.getMimeType();
    params.put(HttpHeaders.CONTENT_TYPE, mimeType);
    params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
    // Collects metadata values that are flagged to be included in the content.
    final StringBuilder contentMetaBuf = new StringBuilder(1000);
    // dataMap is the method result; metaDataMap keeps every raw metadata value array
    // so the META config parameters can reference them at the end of the method.
    final Map<String, Object> dataMap = new HashMap<>();
    final Map<String, Object> metaDataMap = new HashMap<>();
    String content;
    try (final InputStream in = responseData.getResponseBody()) {
        final ExtractData extractData = getExtractData(extractor, in, params);
        content = extractData.getContent();
        // Optionally skip documents without any text; the caller receives null.
        if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
            return null;
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("ExtractData: " + extractData);
        }
        // meta
        // Visit every metadata key for which the extractor returned a value array.
        extractData.getKeySet().stream().filter(//
        k -> extractData.getValues(k) != null).forEach(key -> {
            final String[] values = extractData.getValues(key);
            metaDataMap.put(key, values);
            // Metadata marked as content-included is appended (space separated) to contentMetaBuf.
            if (fessConfig.isCrawlerMetadataContentIncluded(key)) {
                final String joinedValue = StringUtils.join(values, ' ');
                if (StringUtil.isNotBlank(joinedValue)) {
                    if (contentMetaBuf.length() > 0) {
                        contentMetaBuf.append(' ');
                    }
                    contentMetaBuf.append(joinedValue.trim());
                }
            }
            // Optional metadata-name mapping: the pair's first element is used as the
            // dataMap key and its second element selects how the values are converted.
            final Pair<String, String> mapping = fessConfig.getCrawlerMetadataNameMapping(key);
            if (mapping != null) {
                if (Constants.MAPPING_TYPE_ARRAY.equalsIgnoreCase(mapping.getSecond())) {
                    dataMap.put(mapping.getFirst(), values);
                } else if (Constants.MAPPING_TYPE_STRING.equalsIgnoreCase(mapping.getSecond())) {
                    final String joinedValue = StringUtils.join(values, ' ');
                    dataMap.put(mapping.getFirst(), joinedValue.trim());
                } else if (values.length == 1) {
                    // Long/double mappings are applied to single-valued metadata only;
                    // for any other value count nothing is stored and nothing is logged.
                    try {
                        if (Constants.MAPPING_TYPE_LONG.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), Long.parseLong(values[0]));
                        } else if (Constants.MAPPING_TYPE_DOUBLE.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), Double.parseDouble(values[0]));
                        } else {
                            logger.warn("Unknown mapping type: {}={}", key, mapping);
                        }
                    } catch (final NumberFormatException e) {
                        // An unparsable number is logged and the mapped field is left unset.
                        logger.warn("Failed to parse " + values[0], e);
                    }
                }
            }
        });
    } catch (final Exception e) {
        // Every failure inside the try block (checked or unchecked) is rethrown
        // as a WARN-level CrawlingAccessException carrying the original cause.
        final CrawlingAccessException rcae = new CrawlingAccessException("Could not get a text from " + responseData.getUrl(), e);
        rcae.setLogLevel(CrawlingAccessException.WARN);
        throw rcae;
    }
    if (content == null) {
        content = StringUtil.EMPTY;
    }
    final String contentMeta = contentMetaBuf.toString().trim();
    // NOTE(review): the uses of fessConfig above this line cannot refer to this local;
    // presumably they resolve to a field of the enclosing class with the same name,
    // which this declaration shadows from here on - confirm.
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    // The crawling config is looked up with the raw session id, not the canonical one.
    final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
    final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
    final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
    String url = responseData.getUrl();
    // The indexing target is resolved from the original URL, before path mapping rewrites it.
    final String indexingTarget = crawlingConfig.getIndexingTarget(url);
    url = pathMappingHelper.replaceUrl(sessionId, url);
    final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
    // Prefer the encoding of the current URL queue entry; otherwise use the response charset.
    String urlEncoding;
    final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
    if (urlQueue != null && urlQueue.getEncoding() != null) {
        urlEncoding = urlQueue.getEncoding();
    } else {
        urlEncoding = responseData.getCharSet();
    }
    // cid
    final String configId = crawlingConfig.getConfigId();
    if (configId != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
    }
    //  expires
    if (documentExpires != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
    }
    // segment
    putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
    // content: extracted text and/or the collected metadata text, depending on the two config flags
    final StringBuilder buf = new StringBuilder(content.length() + 1000);
    if (fessConfig.isCrawlerDocumentFileAppendBodyContent()) {
        buf.append(content);
    }
    if (fessConfig.isCrawlerDocumentFileAppendMetaContent()) {
        if (buf.length() > 0) {
            buf.append(' ');
        }
        buf.append(contentMeta);
    }
    final String bodyBase = buf.toString().trim();
    final String body = documentHelper.getContent(responseData, bodyBase, dataMap);
    putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
    // The text cache is written only when caching is switched on (per-config FIELD parameter
    // or globally), the MIME type is supported, and the content length is positive and
    // not larger than the configured maximum.
    if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
        if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
            // Collapse runs of spaces, tabs, vertical tabs and form feeds into one space;
            // line breaks are not part of the character class and are kept.
            final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " ");
            // text cache
            putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache);
            putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
        }
    }
    // digest
    putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, bodyBase, dataMap, fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
    // title: only derived here when dataMap holds no title yet
    final String fileName = getFileName(url, urlEncoding);
    if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
        if (url.endsWith("/")) {
            // URL ends with "/": use a digest of the body, or the configured no-title label when the content is blank.
            if (StringUtil.isNotBlank(content)) {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
            } else {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
            }
        } else {
            // Otherwise use the file name, falling back to the decoded URL when the file name is blank.
            if (StringUtil.isBlank(fileName)) {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), decodeUrlAsName(url, url.startsWith("file:")));
            } else {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fileName);
            }
        }
    }
    // host
    putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHostOnFile(url));
    // site
    putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding));
    // filename
    if (StringUtil.isNotBlank(fileName)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
    }
    // url (the path-mapped URL)
    putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    // created
    final Date now = systemHelper.getCurrentTime();
    putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
    // TODO anchor
    putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), StringUtil.EMPTY);
    // mimetype
    putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
    if (fileTypeHelper != null) {
        // filetype
        putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
    }
    // content_length (stored as a decimal string)
    putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
    // last_modified
    final Date lastModified = responseData.getLastModified();
    if (lastModified != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
    } else {
        // timestamp: no last-modified date available, use the crawl time instead
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
    }
    // indexingTarget
    putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
    //  boost
    putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
    // label: labelType - labels of the crawling config plus labels matched by the URL
    final Set<String> labelTypeSet = new HashSet<>();
    for (final String labelType : crawlingConfig.getLabelTypeValues()) {
        labelTypeSet.add(labelType);
    }
    final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
    labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
    putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
    // role: roleType - roles from getRoleTypes() plus the permissions of the crawling config
    final List<String> roleTypeList = getRoleTypes(responseData);
    stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
    putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
    // lang
    if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentFileDefaultLang())) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), fessConfig.getCrawlerDocumentFileDefaultLang());
    }
    // id: generateId receives dataMap as populated up to this point
    putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
    // parentId: the URL field is temporarily replaced by the (path-mapped) parent URL so that
    // generateId can be reused for the parent, then the document's own URL is restored.
    String parentUrl = responseData.getParentUrl();
    if (StringUtil.isNotBlank(parentUrl)) {
        parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
        // set again
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    }
    // from config
    final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
    // META parameters: each entry value is a comma-separated list of metadata names; the raw
    // values of every listed name are written under the entry key, together with the
    // SCRIPT parameter registered for that key (may be null when none is configured).
    final Map<String, String> metaConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.META);
    for (final Map.Entry<String, String> entry : metaConfigMap.entrySet()) {
        final String key = entry.getKey();
        final String[] values = entry.getValue().split(",");
        for (final String value : values) {
            putResultDataWithTemplate(dataMap, key, metaDataMap.get(value), scriptConfigMap.get(key));
        }
    }
    // VALUE parameters: the configured value itself is written under the entry key.
    final Map<String, String> valueConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.VALUE);
    for (final Map.Entry<String, String> entry : valueConfigMap.entrySet()) {
        final String key = entry.getKey();
        putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key));
    }
    return dataMap;
}
Also used : CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) Constants(org.codelibs.fess.Constants) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) URLDecoder(java.net.URLDecoder) Date(java.util.Date) LoggerFactory(org.slf4j.LoggerFactory) Pair(org.codelibs.core.misc.Pair) HashMap(java.util.HashMap) TikaMetadataKeys(org.apache.tika.metadata.TikaMetadataKeys) SerializeUtil(org.codelibs.core.io.SerializeUtil) CrawlingParameterUtil(org.codelibs.fess.crawler.util.CrawlingParameterUtil) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) StringUtils(org.apache.commons.lang3.StringUtils) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ConfigName(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) Map(java.util.Map) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) AbstractTransformer(org.codelibs.fess.crawler.transformer.impl.AbstractTransformer) SambaHelper(org.codelibs.fess.helper.SambaHelper) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) Logger(org.slf4j.Logger) ResultData(org.codelibs.fess.crawler.entity.ResultData) Extractor(org.codelibs.fess.crawler.extractor.Extractor) StringUtil(org.codelibs.core.lang.StringUtil) SID(jcifs.smb.SID) Set(java.util.Set) List(java.util.List) ACE(jcifs.smb.ACE) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) HttpHeaders(org.apache.tika.metadata.HttpHeaders) 
SmbClient(org.codelibs.fess.crawler.client.smb.SmbClient) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) InputStream(java.io.InputStream) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) HashMap(java.util.HashMap) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) Extractor(org.codelibs.fess.crawler.extractor.Extractor) HashSet(java.util.HashSet) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) InputStream(java.io.InputStream) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) Date(java.util.Date) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) HashMap(java.util.HashMap) Map(java.util.Map)

Example 7 with CrawlingConfig

use of org.codelibs.fess.es.config.exentity.CrawlingConfig in project fess by codelibs.

The method store of the class FailureUrlService.

/**
 * Records a crawling failure for a URL.
 *
 * <p>An existing failure entry matching the URL (and the config id, when a
 * crawling config is given) has its error count incremented; otherwise a new
 * entry with an error count of 1 is created. The entry is then stamped with the
 * error name, the stack trace abbreviated to 4000 characters, the current time
 * and the current thread name, and is inserted or updated.</p>
 *
 * @param crawlingConfig the crawling config the URL belongs to; may be {@code null}
 * @param errorName the name stored as the error name of the entry
 * @param url the URL that failed
 * @param e the failure; nothing is stored for a {@link ContainerNotAvailableException}
 */
public void store(final CrawlingConfig crawlingConfig, final String errorName, final String url, final Throwable e) {
    // Nothing is recorded for this exception type.
    if (e instanceof ContainerNotAvailableException) {
        return;
    }

    final FailureUrlBhv failureUrlBhv = ComponentUtil.getComponent(FailureUrlBhv.class);

    // Reuse the stored entry for this URL/config when there is one, otherwise start a new one.
    final FailureUrl target = failureUrlBhv.selectEntity(cb -> {
        cb.query().setUrl_Equal(url);
        if (crawlingConfig != null) {
            cb.query().setConfigId_Equal(crawlingConfig.getConfigId());
        }
    }).map(existing -> {
        existing.setErrorCount(existing.getErrorCount() + 1);
        return existing;
    }).orElseGet(() -> {
        final FailureUrl created = new FailureUrl();
        created.setErrorCount(1);
        created.setUrl(url);
        if (crawlingConfig != null) {
            created.setConfigId(crawlingConfig.getConfigId());
        }
        return created;
    });

    target.setErrorName(errorName);
    final String abbreviatedTrace = StringUtils.abbreviate(getStackTrace(e), 4000);
    target.setErrorLog(abbreviatedTrace);
    target.setLastAccessTime(ComponentUtil.getSystemHelper().getCurrentTimeAsLong());
    target.setThreadName(Thread.currentThread().getName());

    failureUrlBhv.insertOrUpdate(target, op -> op.setRefreshPolicy(Constants.TRUE));
}
Also used : ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) PrintWriter(java.io.PrintWriter) BeanUtil(org.codelibs.core.beans.util.BeanUtil) Constants(org.codelibs.fess.Constants) FailureUrlBhv(org.codelibs.fess.es.config.exbhv.FailureUrlBhv) ListResultBean(org.dbflute.cbean.result.ListResultBean) OptionalEntity(org.dbflute.optional.OptionalEntity) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) Resource(javax.annotation.Resource) StringUtil(org.codelibs.core.lang.StringUtil) StringUtils(org.apache.commons.lang3.StringUtils) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) List(java.util.List) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) FailureUrlCB(org.codelibs.fess.es.config.cbean.FailureUrlCB) PagingResultBean(org.dbflute.cbean.result.PagingResultBean) ComponentUtil(org.codelibs.fess.util.ComponentUtil) SystemHelper(org.codelibs.fess.helper.SystemHelper) StringBuilderWriter(org.apache.commons.io.output.StringBuilderWriter) FailureUrlPager(org.codelibs.fess.app.pager.FailureUrlPager) Pattern(java.util.regex.Pattern) Collections(java.util.Collections) FailureUrl(org.codelibs.fess.es.config.exentity.FailureUrl) FailureUrlBhv(org.codelibs.fess.es.config.exbhv.FailureUrlBhv) FailureUrl(org.codelibs.fess.es.config.exentity.FailureUrl)

Example 8 with CrawlingConfig

use of org.codelibs.fess.es.config.exentity.CrawlingConfig in project fess by codelibs.

The method asContentResponse of the class ViewHelper.

/**
 * Creates a stream response that delivers the content of an indexed document.
 *
 * <p>The config id stored in the document selects the web, file or data
 * crawling config. That config initializes the crawler client factory, and the
 * client obtained for the document URL is handed to {@code writeContent}.</p>
 *
 * @param doc the indexed document; its config-id and URL fields are read
 * @return the response produced by {@code writeContent}
 * @throws FessSystemException if the config id is missing or shorter than two
 *         characters, if no crawling config is resolved, or if no crawler
 *         client is available for the URL
 */
public StreamResponse asContentResponse(final Map<String, Object> doc) {
    if (logger.isDebugEnabled()) {
        logger.debug("writing the content of: " + doc);
    }

    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingConfigHelper configHelper = ComponentUtil.getCrawlingConfigHelper();

    final String configId = DocumentUtil.getValue(doc, fessConfig.getIndexFieldConfigId(), String.class);
    if (configId == null) {
        throw new FessSystemException("configId is null.");
    } else if (configId.length() < 2) {
        throw new FessSystemException("Invalid configId: " + configId);
    }

    final ConfigType configType = configHelper.getConfigType(configId);
    if (logger.isDebugEnabled()) {
        logger.debug("configType: " + configType + ", configId: " + configId);
    }

    // Pick the service that matches the config type; an unmatched type leaves the config unresolved.
    final CrawlingConfig config;
    if (configType == ConfigType.WEB) {
        config = ComponentUtil.getComponent(WebConfigService.class).getWebConfig(configHelper.getId(configId)).get();
    } else if (configType == ConfigType.FILE) {
        config = ComponentUtil.getComponent(FileConfigService.class).getFileConfig(configHelper.getId(configId)).get();
    } else if (configType == ConfigType.DATA) {
        config = ComponentUtil.getComponent(DataConfigService.class).getDataConfig(configHelper.getId(configId)).get();
    } else {
        config = null;
    }
    if (config == null) {
        throw new FessSystemException("No crawlingConfig: " + configId);
    }

    final String url = DocumentUtil.getValue(doc, fessConfig.getIndexFieldUrl(), String.class);
    final CrawlerClientFactory clientFactory = ComponentUtil.getComponent(CrawlerClientFactory.class);
    config.initializeClientFactory(clientFactory);

    final CrawlerClient client = clientFactory.getClient(url);
    if (client != null) {
        return writeContent(configId, url, client);
    }
    throw new FessSystemException("No CrawlerClient: " + configId + ", url: " + url);
}
Also used : DataConfigService(org.codelibs.fess.app.service.DataConfigService) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) WebConfigService(org.codelibs.fess.app.service.WebConfigService) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) ConfigType(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigType) FessSystemException(org.codelibs.fess.exception.FessSystemException) FileConfigService(org.codelibs.fess.app.service.FileConfigService)

Aggregations

CrawlingConfig (org.codelibs.fess.es.config.exentity.CrawlingConfig)8 FessConfig (org.codelibs.fess.mylasta.direction.FessConfig)6 ArrayList (java.util.ArrayList)5 HashMap (java.util.HashMap)4 List (java.util.List)4 Map (java.util.Map)4 StringUtil (org.codelibs.core.lang.StringUtil)4 Date (java.util.Date)3 HashSet (java.util.HashSet)3 Set (java.util.Set)3 StreamUtil.stream (org.codelibs.core.stream.StreamUtil.stream)3 Constants (org.codelibs.fess.Constants)3 ResponseData (org.codelibs.fess.crawler.entity.ResponseData)3 UrlQueue (org.codelibs.fess.crawler.entity.UrlQueue)3 CrawlingConfigHelper (org.codelibs.fess.helper.CrawlingConfigHelper)3 CrawlingInfoHelper (org.codelibs.fess.helper.CrawlingInfoHelper)3 ComponentUtil (org.codelibs.fess.util.ComponentUtil)3 Collections (java.util.Collections)2 ACE (jcifs.smb.ACE)2 SID (jcifs.smb.SID)2