Use of org.codelibs.fess.crawler.transformer.Transformer in project fess by codelibs.
Class FileListIndexUpdateCallbackImpl, method processRequest:
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url, final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            return responseData.getRedirectLocation();
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        if (dataMap.containsKey(Constants.SESSION_ID)) {
            responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
        } else {
            responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
        }
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            logger.warn("No url rule. Data: {}", dataMap);
        } else {
            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (responseProcessor instanceof DefaultResponseProcessor) {
                final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                final ResultData resultData = transformer.transform(responseData);
                final byte[] data = resultData.getData();
                if (data != null) {
                    try {
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                        dataMap.putAll(responseDataMap);
                    } catch (final Exception e) {
                        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                    }
                }
                // remove fields that should not be sent to the index
                String[] ignoreFields;
                if (paramMap.containsKey("ignore.field.names")) {
                    ignoreFields = paramMap.get("ignore.field.names").split(",");
                } else {
                    ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
                }
                stream(ignoreFields).of(stream -> stream.map(String::trim).forEach(s -> dataMap.remove(s)));
                indexUpdateCallback.store(paramMap, dataMap);
            } else {
                logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}", responseProcessor, dataMap);
            }
        }
        return null;
    } catch (final ChildUrlsException e) {
        throw new DataStoreCrawlingException(url, "Redirected to " + e.getChildUrlList().stream().map(RequestData::getUrl).collect(Collectors.joining(", ")), e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
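For context, processRequest above expects the Transformer to hand back a ResultData whose byte payload is a serialized Map<String, Object>; SerializeUtil.fromBinaryToObject restores that map before it is merged into dataMap and stored. The sketch below shows what a minimal transformer honoring that contract could look like. It is illustrative only: MapSerializingTransformer is a hypothetical class name, the field names put into the map are made up, and it assumes the fess-crawler Transformer interface exposes transform, getData, and getName and that SerializeUtil.fromObjectToBinary is the serializing counterpart of fromBinaryToObject.

import java.util.HashMap;
import java.util.Map;

import org.codelibs.core.io.SerializeUtil;
import org.codelibs.fess.crawler.entity.AccessResultData;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.transformer.Transformer;

// Hypothetical transformer sketch: builds a document map and serializes it into
// ResultData, which is the shape processRequest() above restores with
// SerializeUtil.fromBinaryToObject().
public class MapSerializingTransformer implements Transformer {

    @Override
    public ResultData transform(final ResponseData responseData) {
        final Map<String, Object> doc = new HashMap<>();
        doc.put("url", responseData.getUrl());
        doc.put("mimetype", responseData.getMimeType());

        final ResultData resultData = new ResultData();
        resultData.setTransformerName(getName());
        // store the map as bytes so that downstream callers can deserialize it
        resultData.setData(SerializeUtil.fromObjectToBinary(doc));
        return resultData;
    }

    @Override
    public Object getData(final AccessResultData<?> accessResultData) {
        // restore the map that transform() serialized above
        return SerializeUtil.fromBinaryToObject(accessResultData.getData());
    }

    @Override
    public String getName() {
        return "mapSerializingTransformer";
    }
}

In Fess itself this role is played by the transformers registered in the DI container and looked up by name, which is what the IndexUpdater snippet below does with ComponentUtil.getComponent(accessResultData.getTransformerName()).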
Use of org.codelibs.fess.crawler.transformer.Transformer in project fess by codelibs.
Class IndexUpdater, method processAccessResults:
private void processAccessResults(final DocList docList, final List<EsAccessResult> accessResultList, final List<EsAccessResult> arList) {
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final long maxDocumentRequestSize = Long.parseLong(fessConfig.getIndexerWebfsMaxDocumentRequestSize());
    for (final EsAccessResult accessResult : arList) {
        if (logger.isDebugEnabled()) {
            logger.debug("Indexing {}", accessResult.getUrl());
        }
        accessResult.setStatus(Constants.DONE_STATUS);
        accessResultList.add(accessResult);
        if (accessResult.getHttpStatusCode() != 200) {
            // invalid page
            if (logger.isDebugEnabled()) {
                logger.debug("Skipped. The response code is {}.", accessResult.getHttpStatusCode());
            }
            continue;
        }
        final long startTime = System.currentTimeMillis();
        final AccessResultData<?> accessResultData = getAccessResultData(accessResult);
        if (accessResultData != null) {
            accessResult.setAccessResultData(null);
            try {
                final Transformer transformer = ComponentUtil.getComponent(accessResultData.getTransformerName());
                if (transformer == null) {
                    // no transformer
                    logger.warn("No transformer: {}", accessResultData.getTransformerName());
                    continue;
                }
                @SuppressWarnings("unchecked")
                final Map<String, Object> map = (Map<String, Object>) transformer.getData(accessResultData);
                if (map.isEmpty()) {
                    // no data
                    logger.warn("No data: {}", accessResult.getUrl());
                    continue;
                }
                if (Constants.FALSE.equals(map.get(Constants.INDEXING_TARGET))) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Skipped. This document is not an index target.");
                    }
                    continue;
                }
                map.remove(Constants.INDEXING_TARGET);
                updateDocument(map);
                docList.add(ingest(accessResult, map));
                final long contentSize = indexingHelper.calculateDocumentSize(map);
                docList.addContentSize(contentSize);
                final long processingTime = System.currentTimeMillis() - startTime;
                docList.addProcessingTime(processingTime);
                if (logger.isDebugEnabled()) {
                    logger.debug("Added the document({}, {}ms). The number of a document cache is {}.", MemoryUtil.byteCountToDisplaySize(contentSize), processingTime, docList.size());
                }
                if (docList.getContentSize() >= maxDocumentRequestSize) {
                    indexingHelper.sendDocuments(searchEngineClient, docList);
                }
                documentSize++;
                if (logger.isDebugEnabled()) {
                    logger.debug("The number of an added document is {}.", documentSize);
                }
            } catch (final Exception e) {
                logger.warn("Could not add a doc: {}", accessResult.getUrl(), e);
            }
        } else if (logger.isDebugEnabled()) {
            logger.debug("Skipped. No content.");
        }
    }
}
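One detail worth noting in processAccessResults is the flush policy: transformed documents accumulate in docList, and as soon as the accumulated content size reaches the configured maximum (fessConfig.getIndexerWebfsMaxDocumentRequestSize()), the batch is sent via indexingHelper.sendDocuments. The stand-alone sketch below isolates that pattern in plain Java; DocumentBatchSketch and its members are illustrative names, not Fess classes.

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

// Simplified illustration of the flush-on-size batching used by IndexUpdater:
// buffer documents and send them once the accumulated size reaches the limit.
public class DocumentBatchSketch {

    private final List<Map<String, Object>> buffer = new ArrayList<>();
    private final long maxRequestSize;
    private long contentSize;

    public DocumentBatchSketch(final long maxRequestSize) {
        this.maxRequestSize = maxRequestSize;
    }

    public void add(final Map<String, Object> doc, final long docSize) {
        buffer.add(doc);
        contentSize += docSize;
        if (contentSize >= maxRequestSize) {
            flush();
        }
    }

    public void flush() {
        // in IndexUpdater this step is indexingHelper.sendDocuments(searchEngineClient, docList)
        System.out.println("Sending " + buffer.size() + " documents (" + contentSize + " bytes)");
        buffer.clear();
        contentSize = 0;
    }
}

Batching by accumulated byte size rather than by document count keeps each bulk request to the search engine within a predictable payload size, regardless of how large individual documents are.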
Use of org.codelibs.fess.crawler.transformer.Transformer in project fess by codelibs.
Class DocumentHelper, method processRequest:
public Map<String, Object> processRequest(final CrawlingConfig crawlingConfig, final String crawlingInfoId, final String url) {
    if (StringUtil.isBlank(crawlingInfoId)) {
        throw new CrawlingAccessException("sessionId is null.");
    }
    final CrawlerClientFactory crawlerClientFactory = crawlingConfig.initializeClientFactory(ComponentUtil::getCrawlerClientFactory);
    final CrawlerClient client = crawlerClientFactory.getClient(url);
    if (client == null) {
        throw new CrawlingAccessException("CrawlerClient is null for " + url);
    }
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            final Set<RequestData> childUrlList = new HashSet<>();
            childUrlList.add(RequestDataBuilder.newRequestData().get().url(responseData.getRedirectLocation()).build());
            throw new ChildUrlsException(childUrlList, this.getClass().getName() + "#RedirectedFrom:" + url);
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        responseData.setSessionId(crawlingInfoId);
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            throw new CrawlingAccessException("No url rule for " + url);
        }
        responseData.setRuleId(rule.getRuleId());
        final ResponseProcessor responseProcessor = rule.getResponseProcessor();
        if (!(responseProcessor instanceof DefaultResponseProcessor)) {
            throw new CrawlingAccessException("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor + ", url: " + url);
        }
        final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
        final ResultData resultData = transformer.transform(responseData);
        final byte[] data = resultData.getData();
        if (data != null) {
            try {
                return (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
            } catch (final Exception e) {
                throw new CrawlerSystemException("Could not create an instance from bytes.", e);
            }
        }
        return null;
    } catch (final Exception e) {
        throw new CrawlingAccessException("Failed to parse " + url, e);
    }
}
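A caller that already holds a CrawlingConfig can use DocumentHelper.processRequest to fetch and transform a single URL on demand: the result is the document map, null when the transformer produced no data, and failures surface as CrawlingAccessException. The fragment below is a hedged usage sketch in the same style as the snippets above (imports and DI wiring omitted); fetchSingleDocument and the documentHelper/crawlingConfig parameters are hypothetical, and only processRequest itself comes from the snippet above.

// Hypothetical caller-side usage of DocumentHelper#processRequest:
// fetch one URL and fall back to an empty map when the transformer produced no data.
Map<String, Object> fetchSingleDocument(final DocumentHelper documentHelper, final CrawlingConfig crawlingConfig,
        final String crawlingInfoId, final String url) {
    final Map<String, Object> doc = documentHelper.processRequest(crawlingConfig, crawlingInfoId, url);
    // a null return means the transformer produced no ResultData bytes for this URL
    return doc != null ? doc : Map.of();
}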