Use of org.codelibs.fess.crawler.entity.UrlQueue in project fess-crawler by codelibs.
From the class HostIntervalControllerTest, method test_delayBeforeProcessing.
/**
 * Verifies that the crawling interval for a single host is enforced.
 *
 * <p>Runs {@code numTasks} concurrent calls to
 * {@code HostIntervalController#delayBeforeProcessing()} against the same
 * URL and asserts that the total wall-clock time is at least
 * {@code waittime * (numTasks - 1)} milliseconds.</p>
 *
 * <p>A failed or interrupted task now fails the test with its cause instead
 * of being silently ignored, and the thread pool is always shut down.</p>
 */
public void test_delayBeforeProcessing() {
    // number of concurrent tasks
    final int numTasks = 100;
    // interval between accesses, in milliseconds
    final Long waittime = 100L;
    CrawlingParameterUtil.setUrlQueue(new UrlQueueImpl());
    final UrlQueue q = CrawlingParameterUtil.getUrlQueue();
    // the same queue entry is assigned the same URL on every iteration
    for (int i = 0; i < numTasks; i++) {
        q.setUrl("http://example.com");
    }
    final HostIntervalController controller = new HostIntervalController();
    controller.delayMillisBeforeProcessing = waittime;
    controller.delayMillisAfterProcessing = 0L;
    controller.delayMillisForWaitingNewUrl = 0L;
    controller.delayMillisAtNoUrlInQueue = 0L;
    final Callable<Integer> testCallable = new Callable<Integer>() {
        public Integer call() throws Exception {
            // NOTE(review): presumably the URL queue is held per thread, hence
            // it is registered again inside each worker - confirm.
            CrawlingParameterUtil.setUrlQueue(q);
            controller.delayBeforeProcessing();
            return 0;
        }
    };
    // build the list of Callable tasks (the same instance, numTasks times)
    final List<Callable<Integer>> tasks = new ArrayList<Callable<Integer>>();
    for (int i = 0; i < numTasks; i++) {
        tasks.add(testCallable);
    }
    // start time
    final long time = System.nanoTime();
    // run all Callable tasks, one thread per task
    final ExecutorService executor = Executors.newFixedThreadPool(numTasks);
    try {
        final List<Future<Integer>> futures = executor.invokeAll(tasks);
        for (final Future<Integer> future : futures) {
            future.get();
        }
    } catch (final InterruptedException e) {
        // restore the interrupt flag and fail instead of measuring a partial run
        Thread.currentThread().interrupt();
        throw new IllegalStateException("Interrupted while waiting for the delay tasks.", e);
    } catch (final ExecutionException e) {
        // a task threw: surface the cause rather than hiding it
        throw new IllegalStateException("A delay task failed.", e);
    } finally {
        // release the worker threads even when the test fails
        executor.shutdownNow();
    }
    // nanoseconds -> milliseconds
    final long elapsed = (System.nanoTime() - time) / 1000000L;
    // the first task need not wait, every following one waits at least waittime
    final long wait = waittime * (numTasks - 1);
    assertTrue(elapsed + " >= " + wait, elapsed >= wait);
}
Use of org.codelibs.fess.crawler.entity.UrlQueue in project fess-crawler by codelibs.
From the class DefaultResponseProcessor, method process.
/**
 * Dispatches a crawled response.
 *
 * <ul>
 * <li>Not modified: an empty UTF-8 result with the "no transformer" marker is
 * passed to {@code processResult}.</li>
 * <li>Successful: the response is transformed and the result is passed to
 * {@code processResult}; a missing transformer or a {@code null} result is
 * only logged.</li>
 * <li>Anything else: the response is ignored (logged at debug level).</li>
 * </ul>
 *
 * @param responseData the response to process
 */
@Override
public void process(final ResponseData responseData) {
    if (isNotModified(responseData)) {
        final UrlQueue<?> currentQueue = CrawlingParameterUtil.getUrlQueue();
        final ResultData emptyResult = new ResultData();
        emptyResult.setData(new byte[0]);
        emptyResult.setEncoding(Constants.UTF_8);
        emptyResult.setTransformerName(Constants.NO_TRANSFORMER);
        processResult(currentQueue, responseData, emptyResult);
        return;
    }

    if (!isSuccessful(responseData)) {
        if (logger.isDebugEnabled()) {
            logger.debug("Ignore a response(" + responseData.getStatus() + "): " + responseData.getUrl());
        }
        return;
    }

    if (transformer == null) {
        if (logger.isDebugEnabled()) {
            logger.debug("No Transformer for (" + responseData.getUrl() + "). PLEASE CHECK YOUR CONFIGURATION.");
        }
        return;
    }

    final ResultData transformed = transformer.transform(responseData);
    if (transformed == null) {
        logger.warn("No data for (" + responseData.getUrl() + ", " + responseData.getMimeType() + ")");
        return;
    }

    final UrlQueue<?> currentQueue = CrawlingParameterUtil.getUrlQueue();
    processResult(currentQueue, responseData, transformed);
}
Use of org.codelibs.fess.crawler.entity.UrlQueue in project fess-crawler by codelibs.
From the class DefaultResponseProcessor, method processResult.
/**
 * Stores the access result for a crawled URL and queues its child URLs.
 *
 * <p>Nothing is stored when the URL queue service reports the URL as already
 * visited. Otherwise, if {@code checkAccessCount} allows it, the access
 * result is stored and the child URLs are queued one level deeper. If it does
 * not, and the URL is within the maximum depth (or no maximum is set, i.e. a
 * negative value), the URL is offered back to the queue.</p>
 *
 * <p>A {@code null} depth on the queue entry is treated as depth 0 in both
 * branches; previously the re-queue branch unboxed it unguarded and could
 * throw a {@link NullPointerException}.</p>
 *
 * @param urlQueue the queue entry being processed
 * @param responseData the response the result was built from
 * @param resultData the transformed result, including child URLs and encoding
 * @throws RuntimeException rethrown from the data service when storing fails
 *         and the URL has not been visited in the meantime
 */
protected void processResult(final UrlQueue<?> urlQueue, final ResponseData responseData, final ResultData resultData) {
    final AccessResult<?> accessResult = crawlerContainer.getComponent("accessResult");
    accessResult.init(responseData, resultData);
    final CrawlerContext crawlerContext = CrawlingParameterUtil.getCrawlerContext();
    final UrlQueueService<UrlQueue<?>> urlQueueService = CrawlingParameterUtil.getUrlQueueService();
    if (logger.isDebugEnabled()) {
        logger.debug("Processing accessResult: " + accessResult);
    }
    if (!urlQueueService.visited(urlQueue)) {
        // a missing depth means the entry sits at the root level (depth 0)
        final Integer queuedDepth = urlQueue.getDepth();
        final int currentDepth = queuedDepth == null ? 0 : queuedDepth.intValue();
        if (checkAccessCount(crawlerContext)) {
            if (logger.isDebugEnabled()) {
                logger.debug("Storing accessResult: " + accessResult);
            }
            try {
                // store
                CrawlingParameterUtil.getDataService().store(accessResult);
            } catch (final RuntimeException e) {
                // storing failed, so this access must not count
                crawlerContext.decrementAndGetAccessCount();
                if (urlQueueService.visited(urlQueue)) {
                    // document already exists
                    if (logger.isDebugEnabled()) {
                        logger.debug(urlQueue.getUrl() + " exists.", e);
                    }
                    return;
                }
                throw e;
            }
            if (logger.isDebugEnabled()) {
                logger.debug("Storing child urls: " + resultData.getChildUrlSet());
            }
            // add and filter urls
            storeChildUrls(crawlerContext, resultData.getChildUrlSet(), urlQueue.getUrl(), currentDepth + 1, resultData.getEncoding());
        } else if (crawlerContext.getMaxDepth() < 0 || currentDepth <= crawlerContext.getMaxDepth()) {
            // cancel crawling: undo the count and put the URL back into the queue
            crawlerContext.decrementAndGetAccessCount();
            final List<UrlQueue<?>> newUrlQueueList = new ArrayList<>();
            newUrlQueueList.add(urlQueue);
            urlQueueService.offerAll(crawlerContext.getSessionId(), newUrlQueueList);
        }
    }
}
Use of org.codelibs.fess.crawler.entity.UrlQueue in project fess by codelibs.
From the class FessXpathTransformer, method putAdditionalData.
/**
 * Fills {@code dataMap} with the index fields derived from an HTML response.
 *
 * <p>If the document declares a canonical URL that differs from the response
 * URL and passes {@code isValidUrl} / {@code isValidCanonicalUrl}, no field is
 * written: a {@link ChildUrlsException} carrying that canonical URL as its
 * only child request is thrown instead.</p>
 *
 * <p>Statement order matters: the document id and the parent id are generated
 * from the current content of {@code dataMap}, so fields must be in place
 * before {@code generateId} is called.</p>
 *
 * @param dataMap the map that receives the index fields (modified in place)
 * @param responseData the crawled response
 * @param document the parsed DOM of the response body
 * @throws ChildUrlsException when a valid, different canonical URL is found
 */
protected void putAdditionalData(final Map<String, Object> dataMap, final ResponseData responseData, final Document document) {
// canonical: hand the canonical URL back as a child URL instead of indexing this page
final String canonicalUrl = getCanonicalUrl(responseData, document);
if (canonicalUrl != null && !canonicalUrl.equals(responseData.getUrl()) && isValidUrl(canonicalUrl) && isValidCanonicalUrl(responseData.getUrl(), canonicalUrl)) {
final Set<RequestData> childUrlSet = new HashSet<>();
childUrlSet.add(RequestDataBuilder.newRequestData().get().url(canonicalUrl).build());
logger.info("CANONICAL: {} -> {}", responseData.getUrl(), canonicalUrl);
throw new ChildUrlsException(childUrlSet, this.getClass().getName() + "#putAdditionalData");
}
// helpers and per-session crawling configuration
final FessConfig fessConfig = ComponentUtil.getFessConfig();
final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
String url = responseData.getUrl();
// the indexing target is looked up with the original URL, before path mapping rewrites it
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
url = pathMappingHelper.replaceUrl(sessionId, url);
final String mimeType = responseData.getMimeType();
final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
final Map<String, String> xpathConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.XPATH);
// URL encoding: prefer the encoding of the current queue entry, else the response charset
String urlEncoding;
final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
if (urlQueue != null && urlQueue.getEncoding() != null) {
urlEncoding = urlQueue.getEncoding();
} else {
urlEncoding = responseData.getCharSet();
}
// cid: config id, only when the crawling config has one
final String configId = crawlingConfig.getConfigId();
if (configId != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
}
// expires
if (documentExpires != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
}
// lang: read from the document via the configured XPath, then normalized
final String lang = systemHelper.normalizeHtmlLang(getSingleNodeValue(document, getLangXpath(fessConfig, xpathConfigMap), true));
if (lang != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), lang);
}
// title: not written in this method
// content
final String body = getSingleNodeValue(document, getContentXpath(fessConfig, xpathConfigMap), prunedContent);
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), documentHelper.getContent(crawlingConfig, responseData, body, dataMap));
// cache: enabled per field config or globally, and only for supported MIME types
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
// decode the raw body with the response charset, falling back to UTF-8
String charSet = responseData.getCharSet();
if (charSet == null) {
charSet = Constants.UTF_8;
}
try (final BufferedInputStream is = new BufferedInputStream(responseData.getResponseBody())) {
// cache
putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), new String(InputStreamUtil.getBytes(is), charSet));
putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
} catch (final Exception e) {
// a cache failure is logged only; the remaining fields are still written
logger.warn("Failed to write a cache: {}:{}", sessionId, responseData, e);
}
} else {
// also reached when the content length is zero or negative
logger.debug("Content size is too large({} > {}): {}", responseData.getContentLength(), fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger(), responseData.getUrl());
}
}
// digest: taken from the document when the XPath yields text, otherwise derived from the body
final String digest = getSingleNodeValue(document, getDigestXpath(fessConfig, xpathConfigMap), false);
if (StringUtil.isNotBlank(digest)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), digest);
} else {
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentHtmlMaxDigestLengthAsInteger()));
}
// segment: the canonical session id
putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
// host
putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHost(url));
// site
putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSite(url, urlEncoding));
// filename: only when one can be derived from the URL
final String fileName = getFileName(url, urlEncoding);
if (StringUtil.isNotBlank(fileName)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
}
// url: the path-mapped URL
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
// created
final Date now = systemHelper.getCurrentTime();
putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
// anchor
putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), getAnchorList(document, responseData));
// mimetype
putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
if (fileTypeHelper != null) {
// filetype
putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
}
// content_length: stored as a string
putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
// last_modified
final Date lastModified = responseData.getLastModified();
if (lastModified != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
// timestamp: the last-modified date when the response has one
putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
} else {
// timestamp: falls back to the current time
putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
}
// indexingTarget
putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
// boost
putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
// label: labelType
putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeHelper.getMatchedLabelValueSet(url));
// role: roleType, copied from the permissions of the crawling config
final List<String> roleTypeList = new ArrayList<>();
stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
// virtualHosts: blank entries are dropped
putResultDataBody(dataMap, fessConfig.getIndexFieldVirtualHost(), stream(crawlingConfig.getVirtualHosts()).get(stream -> stream.filter(StringUtil::isNotBlank).collect(Collectors.toList())));
// id: generated from the fields written so far
putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
// parentId: temporarily put the parent URL into the url field so that
// generateId yields the parent's id, then restore the real URL
String parentUrl = responseData.getParentUrl();
if (StringUtil.isNotBlank(parentUrl)) {
parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
// set again
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
}
// thumbnail
final String thumbnailUrl = getThumbnailUrl(responseData, document);
if (StringUtil.isNotBlank(thumbnailUrl)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldThumbnail(), thumbnailUrl);
}
// from config: XPath-defined fields (keys starting with "default." are skipped),
// then fixed VALUE fields; both go through the script template for their key
final String scriptType = crawlingConfig.getScriptType();
final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
xpathConfigMap.entrySet().stream().filter(e -> !e.getKey().startsWith("default.")).forEach(e -> {
final String key = e.getKey();
final String value = getSingleNodeValue(document, e.getValue(), true);
putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
});
crawlingConfig.getConfigParameterMap(ConfigName.VALUE).entrySet().stream().forEach(e -> {
final String key = e.getKey();
final String value = e.getValue();
putResultDataWithTemplate(dataMap, key, value, scriptConfigMap.get(key), scriptType);
});
}
Use of org.codelibs.fess.crawler.entity.UrlQueue in project fess by codelibs.
From the class AbstractFessFileTransformer, method generateData.
/**
 * Builds the index field map for a file-type response.
 *
 * <p>The content and metadata are extracted from the response body first;
 * the remaining fields are then derived from the response, the crawling
 * configuration and the extracted values.</p>
 *
 * <p>Statement order matters: the document id and the parent id are generated
 * from the current content of the map, so fields must be in place before
 * {@code generateId} is called.</p>
 *
 * @param responseData the crawled response
 * @return the populated field map, or {@code null} when empty content is
 *         configured to be ignored and the extracted content is blank
 * @throws CrawlingAccessException (log level WARN) when extraction fails
 */
protected Map<String, Object> generateData(final ResponseData responseData) {
final Extractor extractor = getExtractor(responseData);
// hints handed to the extractor: resource name, content type and charset
final Map<String, String> params = new HashMap<>();
params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
final String mimeType = responseData.getMimeType();
params.put(HttpHeaders.CONTENT_TYPE, mimeType);
params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
// metadata values that are to be appended to the searchable content
final StringBuilder contentMetaBuf = new StringBuilder(1000);
final Map<String, Object> dataMap = new HashMap<>();
// every extracted metadata key with its raw values, used by the META config at the end
final Map<String, Object> metaDataMap = new HashMap<>();
String content;
try (final InputStream in = responseData.getResponseBody()) {
final ExtractData extractData = getExtractData(extractor, in, params);
content = extractData.getContent();
// fessConfig here is not the local declared further down (its scope starts
// at that declaration), so it resolves to a member declared outside this method
if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
return null;
}
if (getLogger().isDebugEnabled()) {
getLogger().debug("ExtractData: " + extractData);
}
// meta: process every metadata key that has values
//
extractData.getKeySet().stream().filter(//
k -> extractData.getValues(k) != null).forEach(key -> {
final String[] values = extractData.getValues(key);
metaDataMap.put(key, values);
// optionally append the space-joined values to the meta content buffer
if (fessConfig.isCrawlerMetadataContentIncluded(key)) {
final String joinedValue = StringUtils.join(values, ' ');
if (StringUtil.isNotBlank(joinedValue)) {
if (contentMetaBuf.length() > 0) {
contentMetaBuf.append(' ');
}
contentMetaBuf.append(joinedValue.trim());
}
}
// optional name mapping: first = target field name, second = mapping type
final Pair<String, String> mapping = fessConfig.getCrawlerMetadataNameMapping(key);
if (mapping != null) {
if (Constants.MAPPING_TYPE_ARRAY.equalsIgnoreCase(mapping.getSecond())) {
dataMap.put(mapping.getFirst(), values);
} else if (Constants.MAPPING_TYPE_STRING.equalsIgnoreCase(mapping.getSecond())) {
final String joinedValue = StringUtils.join(values, ' ');
dataMap.put(mapping.getFirst(), joinedValue.trim());
} else if (values.length == 1) {
// numeric mappings apply to single values only; other mapping types with
// several values are skipped without any log output
try {
if (Constants.MAPPING_TYPE_LONG.equalsIgnoreCase(mapping.getSecond())) {
dataMap.put(mapping.getFirst(), Long.parseLong(values[0]));
} else if (Constants.MAPPING_TYPE_DOUBLE.equalsIgnoreCase(mapping.getSecond())) {
dataMap.put(mapping.getFirst(), Double.parseDouble(values[0]));
} else {
logger.warn("Unknown mapping type: {}={}", key, mapping);
}
} catch (final NumberFormatException e) {
// an unparsable number is logged and the field is left unset
logger.warn("Failed to parse " + values[0], e);
}
}
}
});
} catch (final Exception e) {
// any failure while reading or extracting is reported as a WARN-level access exception
final CrawlingAccessException rcae = new CrawlingAccessException("Could not get a text from " + responseData.getUrl(), e);
rcae.setLogLevel(CrawlingAccessException.WARN);
throw rcae;
}
if (content == null) {
content = StringUtil.EMPTY;
}
final String contentMeta = contentMetaBuf.toString().trim();
// helpers and per-session crawling configuration; from here on fessConfig is this local
final FessConfig fessConfig = ComponentUtil.getFessConfig();
final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
String url = responseData.getUrl();
// the indexing target is looked up with the original URL, before path mapping rewrites it
final String indexingTarget = crawlingConfig.getIndexingTarget(url);
url = pathMappingHelper.replaceUrl(sessionId, url);
final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
// URL encoding: prefer the encoding of the current queue entry, else the response charset
String urlEncoding;
final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
if (urlQueue != null && urlQueue.getEncoding() != null) {
urlEncoding = urlQueue.getEncoding();
} else {
urlEncoding = responseData.getCharSet();
}
// cid: config id, only when the crawling config has one
final String configId = crawlingConfig.getConfigId();
if (configId != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
}
// expires
if (documentExpires != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
}
// segment: the canonical session id
putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
// content: extracted body and/or metadata text, depending on configuration
final StringBuilder buf = new StringBuilder(content.length() + 1000);
if (fessConfig.isCrawlerDocumentFileAppendBodyContent()) {
buf.append(content);
}
if (fessConfig.isCrawlerDocumentFileAppendMetaContent()) {
if (buf.length() > 0) {
buf.append(' ');
}
buf.append(contentMeta);
}
final String bodyBase = buf.toString().trim();
final String body = documentHelper.getContent(responseData, bodyBase, dataMap);
putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
// cache: enabled per field config or globally, and only for supported MIME types
if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
// collapse runs of spaces, tabs, vertical tabs and form feeds into one space;
// line breaks are not part of the pattern and are kept
final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " ");
// text cache
putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache);
putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
}
}
// digest: derived from the unprocessed body text
putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, bodyBase, dataMap, fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
// title: only when the metadata mapping above has not already supplied one
final String fileName = getFileName(url, urlEncoding);
if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
if (url.endsWith("/")) {
// URL ending with "/": use a digest of the body, or the no-title label when the content is blank
if (StringUtil.isNotBlank(content)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
} else {
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
}
} else {
// otherwise use the file name, or a name decoded from the URL when there is none
if (StringUtil.isBlank(fileName)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), decodeUrlAsName(url, url.startsWith("file:")));
} else {
putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fileName);
}
}
}
// host
putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHostOnFile(url));
// site
putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding));
// filename: only when one can be derived from the URL
if (StringUtil.isNotBlank(fileName)) {
putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
}
// url: the path-mapped URL
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
// created
final Date now = systemHelper.getCurrentTime();
putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
// TODO anchor (currently always stored as an empty string)
putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), StringUtil.EMPTY);
// mimetype
putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
if (fileTypeHelper != null) {
// filetype
putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
}
// content_length: stored as a string
putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
// last_modified
final Date lastModified = responseData.getLastModified();
if (lastModified != null) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
// timestamp: the last-modified date when the response has one
putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
} else {
// timestamp: falls back to the current time
putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
}
// indexingTarget
putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
// boost
putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
// label: labelType - labels of the crawling config plus those matching the URL
final Set<String> labelTypeSet = new HashSet<>();
for (final String labelType : crawlingConfig.getLabelTypeValues()) {
labelTypeSet.add(labelType);
}
final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
// role: roleType - roles from getRoleTypes(responseData) plus the config permissions
final List<String> roleTypeList = getRoleTypes(responseData);
stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
// lang: the configured default language for files, when one is set
if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentFileDefaultLang())) {
putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), fessConfig.getCrawlerDocumentFileDefaultLang());
}
// id: generated from the fields written so far
putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
// parentId: temporarily put the parent URL into the url field so that
// generateId yields the parent's id, then restore the real URL
String parentUrl = responseData.getParentUrl();
if (StringUtil.isNotBlank(parentUrl)) {
parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
// set again
putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
}
// from config: META entries map a field to a comma-separated list of metadata
// names (the names are used as split, without trimming); VALUE entries are fixed
// values. Both go through the script template configured for their key.
final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
final Map<String, String> metaConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.META);
for (final Map.Entry<String, String> entry : metaConfigMap.entrySet()) {
final String key = entry.getKey();
final String[] values = entry.getValue().split(",");
for (final String value : values) {
putResultDataWithTemplate(dataMap, key, metaDataMap.get(value), scriptConfigMap.get(key));
}
}
final Map<String, String> valueConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.VALUE);
for (final Map.Entry<String, String> entry : valueConfigMap.entrySet()) {
final String key = entry.getKey();
putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key));
}
return dataMap;
}
Aggregations