Search in sources :

Example 6 with CrawlingInfoHelper

use of org.codelibs.fess.helper.CrawlingInfoHelper in project fess by codelibs.

In the class AbstractFessFileTransformer, the method generateData:

/**
 * Builds the map of index fields for a crawled file.
 *
 * <p>The response body is passed to the extractor to obtain the text content and the
 * metadata values. The result map is then filled step by step (config id, expiry, segment,
 * content, cache, digest, title, host, site, filename, url, timestamps, labels, roles,
 * language, id, parent id) and finally with the values configured on the crawling config.
 * Several helpers receive {@code dataMap} while it is still being filled, so the order of
 * the statements below matters.</p>
 *
 * @param responseData the crawler response whose body, URL, MIME type, charset, session id,
 *        content length, last-modified date and parent URL are read here
 * @return the populated field map, or {@code null} when the extracted content is blank and
 *         the "ignore empty content" setting is enabled
 * @throws CrawlingAccessException (log level WARN) wrapping any exception raised while the
 *         body is read or its metadata is processed
 */
protected Map<String, Object> generateData(final ResponseData responseData) {
    final Extractor extractor = getExtractor(responseData);
    // Hints handed to the extractor: resource name, content type and content encoding.
    final Map<String, String> params = new HashMap<>();
    params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    final String mimeType = responseData.getMimeType();
    params.put(HttpHeaders.CONTENT_TYPE, mimeType);
    params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
    // Collects the metadata values that are flagged to be part of the searchable content.
    final StringBuilder contentMetaBuf = new StringBuilder(1000);
    // dataMap is the returned result; metaDataMap keeps the raw metadata for the
    // "from config" META lookup near the end of the method.
    final Map<String, Object> dataMap = new HashMap<>();
    final Map<String, Object> metaDataMap = new HashMap<>();
    String content;
    try (final InputStream in = responseData.getResponseBody()) {
        final ExtractData extractData = getExtractData(extractor, in, params);
        content = extractData.getContent();
        // NOTE(review): the local `fessConfig` is only declared further down, so this use
        // (and the one inside the lambda) must resolve to a member declared outside this
        // method - not visible here; confirm.
        if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
            // Blank content is skipped entirely: the caller gets null, not an empty map.
            return null;
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("ExtractData: " + extractData);
        }
        // meta: visit every metadata key that has a non-null value array
        //
        extractData.getKeySet().stream().filter(//
        k -> extractData.getValues(k) != null).forEach(key -> {
            final String[] values = extractData.getValues(key);
            metaDataMap.put(key, values);
            // Keys flagged by the config contribute their space-joined values to the
            // meta content buffer (blank results are skipped).
            if (fessConfig.isCrawlerMetadataContentIncluded(key)) {
                final String joinedValue = StringUtils.join(values, ' ');
                if (StringUtil.isNotBlank(joinedValue)) {
                    if (contentMetaBuf.length() > 0) {
                        contentMetaBuf.append(' ');
                    }
                    contentMetaBuf.append(joinedValue.trim());
                }
            }
            // Optional mapping of the metadata key to an index field: first = field name,
            // second = mapping type (array, string, long or double).
            final Pair<String, String> mapping = fessConfig.getCrawlerMetadataNameMapping(key);
            if (mapping != null) {
                if (Constants.MAPPING_TYPE_ARRAY.equalsIgnoreCase(mapping.getSecond())) {
                    dataMap.put(mapping.getFirst(), values);
                } else if (Constants.MAPPING_TYPE_STRING.equalsIgnoreCase(mapping.getSecond())) {
                    final String joinedValue = StringUtils.join(values, ' ');
                    dataMap.put(mapping.getFirst(), joinedValue.trim());
                } else if (values.length == 1) {
                    // Numeric mappings are only applied to single-valued metadata; with
                    // any other number of values nothing is stored and nothing is logged.
                    try {
                        if (Constants.MAPPING_TYPE_LONG.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), Long.parseLong(values[0]));
                        } else if (Constants.MAPPING_TYPE_DOUBLE.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), Double.parseDouble(values[0]));
                        } else {
                            logger.warn("Unknown mapping type: {}={}", key, mapping);
                        }
                    } catch (final NumberFormatException e) {
                        // An unparsable number is logged and the field is left unset.
                        logger.warn("Failed to parse " + values[0], e);
                    }
                }
            }
        });
    } catch (final Exception e) {
        // Any failure inside the try block (including the metadata lambda) is reported as a
        // WARN-level access exception that names the URL.
        final CrawlingAccessException rcae = new CrawlingAccessException("Could not get a text from " + responseData.getUrl(), e);
        rcae.setLogLevel(CrawlingAccessException.WARN);
        throw rcae;
    }
    // From here on content is never null.
    if (content == null) {
        content = StringUtil.EMPTY;
    }
    final String contentMeta = contentMetaBuf.toString().trim();
    // This local hides whatever `fessConfig` referred to above for the rest of the method.
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    // The crawling config is looked up with the raw session id, not the canonical one.
    final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
    final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
    final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
    String url = responseData.getUrl();
    // The indexing target is resolved from the original URL, before path mapping rewrites it.
    final String indexingTarget = crawlingConfig.getIndexingTarget(url);
    url = pathMappingHelper.replaceUrl(sessionId, url);
    final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
    // Prefer the encoding recorded on the current URL queue entry; otherwise use the
    // response charset.
    String urlEncoding;
    final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
    if (urlQueue != null && urlQueue.getEncoding() != null) {
        urlEncoding = urlQueue.getEncoding();
    } else {
        urlEncoding = responseData.getCharSet();
    }
    // cid: config id, stored only when present
    final String configId = crawlingConfig.getConfigId();
    if (configId != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
    }
    // expires: stored only when an expiry date was computed
    if (documentExpires != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
    }
    // segment: the canonical session id
    putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
    // content: body text and/or meta content, depending on the two append settings
    final StringBuilder buf = new StringBuilder(content.length() + 1000);
    if (fessConfig.isCrawlerDocumentFileAppendBodyContent()) {
        buf.append(content);
    }
    if (fessConfig.isCrawlerDocumentFileAppendMetaContent()) {
        if (buf.length() > 0) {
            buf.append(' ');
        }
        buf.append(contentMeta);
    }
    final String bodyBase = buf.toString().trim();
    // documentHelper receives dataMap as filled so far.
    final String body = documentHelper.getContent(responseData, bodyBase, dataMap);
    putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
    // cache: enabled per config field or globally, and only for supported MIME types
    if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
        // Only non-empty responses up to the configured maximum size are cached.
        if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
            // Runs of spaces, tabs, vertical tabs and form feeds collapse to one space;
            // line breaks are not in the character class and are kept.
            final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " ");
            // text cache
            putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache);
            putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
        }
    }
    // digest: derived from bodyBase (not from the processed body)
    putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, bodyBase, dataMap, fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
    // title: only filled in when the metadata mapping above did not already set it
    final String fileName = getFileName(url, urlEncoding);
    if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
        if (url.endsWith("/")) {
            // URL ending in "/": use a digest of the body, or the configured "no title"
            // label when the content is blank.
            if (StringUtil.isNotBlank(content)) {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
            } else {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
            }
        } else {
            // Otherwise use the file name, falling back to the decoded URL when it is blank.
            if (StringUtil.isBlank(fileName)) {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), decodeUrlAsName(url, url.startsWith("file:")));
            } else {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fileName);
            }
        }
    }
    // host
    putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHostOnFile(url));
    // site
    putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding));
    // filename: stored only when not blank
    if (StringUtil.isNotBlank(fileName)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
    }
    // url: the path-mapped URL
    putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    // created: current time as reported by the system helper
    final Date now = systemHelper.getCurrentTime();
    putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
    // TODO anchor
    putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), StringUtil.EMPTY);
    // mimetype
    putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
    if (fileTypeHelper != null) {
        // filetype
        putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
    }
    // content_length: stored as a decimal string
    putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
    // last_modified: when known it also becomes the timestamp; otherwise the timestamp is "now"
    final Date lastModified = responseData.getLastModified();
    if (lastModified != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
    } else {
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
    }
    // indexingTarget
    putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
    // boost
    putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
    // label: labels from the crawling config plus labels matched against the URL
    final Set<String> labelTypeSet = new HashSet<>();
    for (final String labelType : crawlingConfig.getLabelTypeValues()) {
        labelTypeSet.add(labelType);
    }
    final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
    labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
    putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
    // role: roles from getRoleTypes(responseData) plus the permissions of the crawling config
    final List<String> roleTypeList = getRoleTypes(responseData);
    stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
    putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
    // lang: stored only when a default language is configured
    if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentFileDefaultLang())) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), fessConfig.getCrawlerDocumentFileDefaultLang());
    }
    // id: generated from dataMap as filled so far
    putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
    // parentId: the URL field is temporarily replaced with the path-mapped parent URL so
    // that generateId(dataMap) is computed for the parent, then restored afterwards.
    String parentUrl = responseData.getParentUrl();
    if (StringUtil.isNotBlank(parentUrl)) {
        parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
        // set again
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    }
    // from config: META entries map a field name to a comma-separated list of metadata
    // keys; SCRIPT entries supply the optional template for that same field name.
    final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
    final Map<String, String> metaConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.META);
    for (final Map.Entry<String, String> entry : metaConfigMap.entrySet()) {
        final String key = entry.getKey();
        // The list is split on "," without trimming the individual names.
        final String[] values = entry.getValue().split(",");
        for (final String value : values) {
            // metaDataMap.get(value) is null when the file had no such metadata key.
            putResultDataWithTemplate(dataMap, key, metaDataMap.get(value), scriptConfigMap.get(key));
        }
    }
    // VALUE entries map a field name directly to a configured value.
    final Map<String, String> valueConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.VALUE);
    for (final Map.Entry<String, String> entry : valueConfigMap.entrySet()) {
        final String key = entry.getKey();
        putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key));
    }
    return dataMap;
}
Also used : CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) Constants(org.codelibs.fess.Constants) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) URLDecoder(java.net.URLDecoder) Date(java.util.Date) LoggerFactory(org.slf4j.LoggerFactory) Pair(org.codelibs.core.misc.Pair) HashMap(java.util.HashMap) TikaMetadataKeys(org.apache.tika.metadata.TikaMetadataKeys) SerializeUtil(org.codelibs.core.io.SerializeUtil) CrawlingParameterUtil(org.codelibs.fess.crawler.util.CrawlingParameterUtil) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) StringUtils(org.apache.commons.lang3.StringUtils) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ConfigName(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) Map(java.util.Map) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) AbstractTransformer(org.codelibs.fess.crawler.transformer.impl.AbstractTransformer) SambaHelper(org.codelibs.fess.helper.SambaHelper) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) Logger(org.slf4j.Logger) ResultData(org.codelibs.fess.crawler.entity.ResultData) Extractor(org.codelibs.fess.crawler.extractor.Extractor) StringUtil(org.codelibs.core.lang.StringUtil) SID(jcifs.smb.SID) Set(java.util.Set) List(java.util.List) ACE(jcifs.smb.ACE) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) HttpHeaders(org.apache.tika.metadata.HttpHeaders) 
SmbClient(org.codelibs.fess.crawler.client.smb.SmbClient) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) InputStream(java.io.InputStream) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) HashMap(java.util.HashMap) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) Extractor(org.codelibs.fess.crawler.extractor.Extractor) HashSet(java.util.HashSet) ExtractData(org.codelibs.fess.crawler.entity.ExtractData) DocumentHelper(org.codelibs.fess.helper.DocumentHelper) InputStream(java.io.InputStream) LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) Date(java.util.Date) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) SystemHelper(org.codelibs.fess.helper.SystemHelper) FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper) HashMap(java.util.HashMap) Map(java.util.Map)

Example 7 with CrawlingInfoHelper

use of org.codelibs.fess.helper.CrawlingInfoHelper in project fess by codelibs.

In the class Crawler, the method process:

/**
 * Runs one crawl for the given options and records the crawling information around it.
 *
 * <p>Steps, in order: assign a timestamp-based session id when none was given, load the
 * system properties, store the crawling information for the session and update its
 * parameters, run the crawl, and finally store the crawling information again, log the
 * collected info map and send the notification mail. Failures while storing the crawling
 * information or sending the mail are logged and do not abort the crawl.</p>
 *
 * @param options the crawler options; {@code sessionId} is written back when it was blank
 * @return the value returned by {@code crawler.doCrawl(options)}
 */
private static int process(final Options options) {
    final Crawler crawler = ComponentUtil.getComponent(Crawler.class);
    if (StringUtil.isBlank(options.sessionId)) {
        // use a default session id
        final SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");
        options.sessionId = sdf.format(new Date());
    }
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final DynamicProperties systemProperties = ComponentUtil.getSystemProperties();
    if (StringUtil.isNotBlank(options.propertiesPath)) {
        systemProperties.reload(options.propertiesPath);
    } else {
        try {
            // No properties path given: reserve a unique temp file name, remove the empty
            // file again and point the system properties at that path.
            final File propFile = File.createTempFile("crawler_", ".properties");
            if (propFile.delete() && logger.isDebugEnabled()) {
                logger.debug("Deleted a temp file: " + propFile.getAbsolutePath());
            }
            systemProperties.reload(propFile.getAbsolutePath());
            propFile.deleteOnExit();
        } catch (final IOException e) {
            logger.warn("Failed to create system properties file.", e);
        }
    }
    try {
        crawlingInfoHelper.store(options.sessionId, true);
        int dayForCleanup = -1;
        if (StringUtil.isNotBlank(options.expires)) {
            try {
                dayForCleanup = Integer.parseInt(options.expires);
            } catch (final NumberFormatException e) {
                // Previously swallowed silently. The -1 fallback is kept, but the bad
                // value is now reported so a mistyped option can be noticed.
                logger.warn("Invalid expires value: " + options.expires, e);
            }
        } else {
            dayForCleanup = ComponentUtil.getFessConfig().getDayForCleanup();
        }
        crawlingInfoHelper.updateParams(options.sessionId, options.name, dayForCleanup);
    } catch (final Exception e) {
        logger.warn("Failed to store crawling information.", e);
    }
    try {
        return crawler.doCrawl(options);
    } finally {
        // Runs whether the crawl returned normally or threw.
        try {
            crawlingInfoHelper.store(options.sessionId, false);
        } catch (final Exception e) {
            logger.warn("Failed to store crawling information.", e);
        }
        // Log the info map as a single "key=value,key=value" line.
        final Map<String, String> infoMap = crawlingInfoHelper.getInfoMap(options.sessionId);
        final StringBuilder buf = new StringBuilder(500);
        for (final Map.Entry<String, String> entry : infoMap.entrySet()) {
            if (buf.length() != 0) {
                buf.append(',');
            }
            buf.append(entry.getKey()).append('=').append(entry.getValue());
        }
        logger.info("[CRAWL INFO] " + buf.toString());
        // notification
        try {
            crawler.sendMail(infoMap);
        } catch (final Exception e) {
            logger.warn("Failed to send a mail.", e);
        }
    }
}
Also used : DynamicProperties(org.codelibs.core.misc.DynamicProperties) IOException(java.io.IOException) Date(java.util.Date) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) IOException(java.io.IOException) CmdLineException(org.kohsuke.args4j.CmdLineException) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) SimpleDateFormat(java.text.SimpleDateFormat) File(java.io.File) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

CrawlingInfoHelper (org.codelibs.fess.helper.CrawlingInfoHelper): 7, ArrayList (java.util.ArrayList): 5, Date (java.util.Date): 5, HashMap (java.util.HashMap): 5, Map (java.util.Map): 5, FessConfig (org.codelibs.fess.mylasta.direction.FessConfig): 5, List (java.util.List): 4, StringUtil (org.codelibs.core.lang.StringUtil): 4, StreamUtil.stream (org.codelibs.core.stream.StreamUtil.stream): 4, SystemHelper (org.codelibs.fess.helper.SystemHelper): 4, ComponentUtil (org.codelibs.fess.util.ComponentUtil): 4, Logger (org.slf4j.Logger): 4, LoggerFactory (org.slf4j.LoggerFactory): 4, HashSet (java.util.HashSet): 3, Set (java.util.Set): 3, ResponseData (org.codelibs.fess.crawler.entity.ResponseData): 3, UrlQueue (org.codelibs.fess.crawler.entity.UrlQueue): 3, IOException (java.io.IOException): 2, ACE (jcifs.smb.ACE): 2, SID (jcifs.smb.SID): 2