Use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.
Class FileListIndexUpdateCallbackImpl, method processRequest.
/**
 * Issues a GET request for {@code url}, transforms the response with the matching rule's
 * transformer, merges the transformed fields into {@code dataMap} and stores the result through
 * {@code indexUpdateCallback}.
 *
 * <p>Nothing is stored when no rule matches the response or when the rule's response processor
 * is not a {@code DefaultResponseProcessor}; a warning is logged instead.
 *
 * @param paramMap crawl parameters; read for {@code Constants.CRAWLING_INFO_ID} and for the
 *        optional comma-separated {@code "ignore.field.names"} entry
 * @param dataMap document fields; modified in place (transformed fields added, ignored fields removed)
 * @param url the URL to request
 * @param client the client used to execute the request
 * @return the redirect location reported by the response, or {@code null} when the response
 *         carried no redirect location
 * @throws DataStoreCrawlingException if the request, the transformation or the store step fails
 */
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url, final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        // A redirect is reported back to the caller, which decides whether to follow it.
        if (responseData.getRedirectLocation() != null) {
            return responseData.getRedirectLocation();
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        // Prefer a session id carried by the document itself over the crawl-wide one.
        if (dataMap.containsKey(Constants.SESSION_ID)) {
            responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
        } else {
            responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
        }
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            // Parameterized logging: the map is only rendered when the message is actually emitted.
            logger.warn("No url rule. Data: {}", dataMap);
        } else {
            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (responseProcessor instanceof DefaultResponseProcessor) {
                final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                final ResultData resultData = transformer.transform(responseData);
                final byte[] data = resultData.getData();
                if (data != null) {
                    try {
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                        dataMap.putAll(responseDataMap);
                    } catch (final Exception e) {
                        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                    }
                }
                // Remove the fields that must not be stored: either the configured list or the defaults.
                final String[] ignoreFields;
                if (paramMap.containsKey("ignore.field.names")) {
                    ignoreFields = paramMap.get("ignore.field.names").split(",");
                } else {
                    ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
                }
                stream(ignoreFields).of(stream -> stream.map(String::trim).forEach(dataMap::remove));
                indexUpdateCallback.store(paramMap, dataMap);
            } else {
                logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}", responseProcessor, dataMap);
            }
        }
        return null;
    } catch (final ChildUrlsException e) {
        // Child URLs are surfaced to the caller as the cause of the wrapping exception.
        throw new DataStoreCrawlingException(url, "Redirected to " + e.getChildUrlList().stream().map(r -> r.getUrl()).collect(Collectors.joining(", ")), e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
Use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.
Class FileListIndexUpdateCallbackImpl, method addDocument.
/**
 * Requests the URL stored in {@code dataMap} under the configured URL index field and hands the
 * document to {@link #processRequest}, following reported redirects for at most
 * {@code maxRedirectCount} requests.
 *
 * <p>The whole operation runs while holding the monitor of {@code indexUpdateCallback}. When the
 * URL field is missing or {@code null}, or no client is available for the URL, a warning is
 * logged and nothing is requested.
 *
 * @param paramMap crawl parameters, passed through to {@code processRequest}
 * @param dataMap document fields; the URL field is overwritten with each redirect location followed
 */
protected void addDocument(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    synchronized (indexUpdateCallback) {
        // required check: a missing key and a null value are both rejected by the single lookup
        final String urlField = fessConfig.getIndexFieldUrl();
        final Object urlValue = dataMap.get(urlField);
        if (urlValue == null) {
            logger.warn("Could not add a doc. Invalid data: {}", dataMap);
            return;
        }
        final String url = urlValue.toString();
        final CrawlerClient client = crawlerClientFactory.getClient(url);
        if (client == null) {
            logger.warn("CrawlerClient is null. Data: {}", dataMap);
            return;
        }
        String processingUrl = url;
        // processRequest returns the next location while the server keeps redirecting and
        // null once the request was handled; if the budget runs out first, the loop simply ends.
        for (int i = 0; i < maxRedirectCount; i++) {
            processingUrl = processRequest(paramMap, dataMap, processingUrl, client);
            if (processingUrl == null) {
                break;
            }
            dataMap.put(urlField, processingUrl);
        }
    }
}
Use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.
Class BaseThumbnailGenerator, method process.
/**
 * Delegates to the two-argument {@code process} overload with a callback that fetches the
 * content at the supplied URL and passes the final (non-redirect) response to {@code consumer}.
 *
 * <p>The callback follows redirect locations for at most {@code maxRedirectCount} requests.
 *
 * @param id identifier forwarded unchanged to the overloaded {@code process}
 * @param consumer predicate applied to the response once no redirect location is reported
 * @return whatever the overloaded {@code process} returns
 * @throws ThumbnailGenerationException if no crawling config or client is found, the response
 *         URL is blank, the request fails, or the redirect budget is exhausted
 */
protected boolean process(final String id, final Predicate<ResponseData> consumer) {
    return process(id, (configId, url) -> {
        final CrawlingConfig crawlingConfig = ComponentUtil.getCrawlingConfigHelper().getCrawlingConfig(configId);
        if (crawlingConfig == null) {
            throw new ThumbnailGenerationException("No CrawlingConfig: " + configId);
        }
        if (logger.isInfoEnabled()) {
            logger.info("Generating Thumbnail: {}", url);
        }
        final CrawlerClientFactory clientFactory =
                crawlingConfig.initializeClientFactory(() -> ComponentUtil.getComponent(CrawlerClientFactory.class));
        final CrawlerClient crawlerClient = clientFactory.getClient(url);
        if (crawlerClient == null) {
            throw new ThumbnailGenerationException("No CrawlerClient: " + configId + ", url: " + url);
        }

        String target = url;
        for (int attempt = 0; attempt < maxRedirectCount; attempt++) {
            try (final ResponseData response = crawlerClient.execute(RequestDataBuilder.newRequestData().get().url(target).build())) {
                final String redirect = response.getRedirectLocation();
                if (StringUtil.isBlank(redirect)) {
                    // Final response reached: validate it and let the consumer decide the outcome.
                    if (StringUtil.isBlank(response.getUrl())) {
                        throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url + " (Response URL is empty)");
                    }
                    return consumer.test(response);
                }
                // Otherwise retry against the reported location on the next iteration.
                target = redirect;
            } catch (final CrawlingAccessException e) {
                // The cause is attached only when debug logging is on; otherwise just the message is kept.
                if (logger.isDebugEnabled()) {
                    throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url, e);
                }
                throw new ThumbnailGenerationException(e.getMessage());
            } catch (final Exception e) {
                throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url, e);
            }
        }
        throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url + " (Redirect Loop)");
    });
}
Use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.
Class FileListIndexUpdateCallbackImpl, method addDocument.
/**
 * Requests the URL stored in {@code dataMap} under the configured URL index field and indexes
 * it via {@link #processRequest}, also processing child URLs reported through
 * {@code ChildUrlsException}.
 *
 * <p>URLs are taken from a FIFO queue seeded with the document URL. Each queued URL is processed
 * against its own copy of {@code dataMap}, following redirects for at most
 * {@code maxRedirectCount} requests. The whole operation runs while holding the monitor of
 * {@code indexUpdateCallback}.
 *
 * @param paramMap crawl parameters, passed through to {@code processRequest} and
 *        {@code getMaxAccessCount}
 * @param dataMap document fields; copied per URL, the original map is not modified by this method
 * @throws DataStoreCrawlingException if a request fails for a reason other than reported child
 *         URLs and the max access count is not {@code 1}
 */
protected void addDocument(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    synchronized (indexUpdateCallback) {
        // required check
        if (!dataMap.containsKey(fessConfig.getIndexFieldUrl()) || dataMap.get(fessConfig.getIndexFieldUrl()) == null) {
            logger.warn("Could not add a doc. Invalid data: {}", dataMap);
            return;
        }
        final String url = dataMap.get(fessConfig.getIndexFieldUrl()).toString();
        final CrawlerClient client = crawlerClientFactory.getClient(url);
        if (client == null) {
            logger.warn("CrawlerClient is null. Data: {}", dataMap);
            return;
        }
        // A negative max access count disables the limit (see the loop condition below).
        final long maxAccessCount = getMaxAccessCount(paramMap, dataMap);
        // NOTE(review): counter only advances when a redirect is followed, so child URLs taken
        // from the queue do not count towards maxAccessCount -- confirm this is intended.
        long counter = 0;
        final Deque<String> urlQueue = new LinkedList<>();
        urlQueue.offer(url);
        while (!urlQueue.isEmpty() && (maxAccessCount < 0 || counter < maxAccessCount)) {
            // Copy with the HashMap constructor rather than Collectors.toMap: toMap throws a
            // NullPointerException when any value is null. Fully qualified so that no additional
            // import is required.
            final Map<String, Object> localDataMap = new java.util.HashMap<>(dataMap);
            String processingUrl = urlQueue.poll();
            if (deleteUrlList.contains(processingUrl)) {
                // delete before indexing
                deleteDocuments();
            }
            try {
                for (int i = 0; i < maxRedirectCount; i++) {
                    processingUrl = processRequest(paramMap, localDataMap, processingUrl, client);
                    if (processingUrl == null) {
                        break;
                    }
                    counter++;
                    localDataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
                }
            } catch (final ChildUrlsException e) {
                e.getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
            } catch (final DataStoreCrawlingException e) {
                final Throwable cause = e.getCause();
                if (cause instanceof ChildUrlsException) {
                    // processRequest wraps ChildUrlsException; unwrap it and enqueue the children.
                    ((ChildUrlsException) cause).getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
                } else if (maxAccessCount != 1L) {
                    throw e;
                } else {
                    // With a max access count of exactly 1 the failure is logged instead of propagated.
                    logger.warn("Failed to access {}.", processingUrl, e);
                }
            }
        }
    }
}
Use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.
Class FileListIndexUpdateCallbackImpl, method processRequest.
/**
 * Executes a GET request for {@code url} and, unless the response reports a redirect, runs the
 * matching rule's transformer over it, merges the outcome into {@code dataMap} and stores the
 * document through {@code indexUpdateCallback}.
 *
 * <p>If no rule matches, or the rule's response processor is not a
 * {@code DefaultResponseProcessor}, a warning is logged and nothing is stored.
 *
 * @param paramMap crawl parameters; supplies {@code Constants.CRAWLING_INFO_ID} and the optional
 *        comma-separated {@code "ignore.field.names"} list
 * @param dataMap document fields, updated in place
 * @param url the URL to fetch
 * @param client the client that executes the request
 * @return the redirect location when the response reports one, otherwise {@code null}
 * @throws DataStoreCrawlingException wrapping any failure raised while fetching or processing
 */
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url, final CrawlerClient client) {
    final long begunAt = System.currentTimeMillis();
    try (final ResponseData response = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        final String redirectLocation = response.getRedirectLocation();
        if (redirectLocation != null) {
            return redirectLocation;
        }

        response.setExecutionTime(System.currentTimeMillis() - begunAt);
        // The document's own session id wins over the crawl-wide id.
        final String sessionId = dataMap.containsKey(Constants.SESSION_ID)
                ? (String) dataMap.get(Constants.SESSION_ID)
                : paramMap.get(Constants.CRAWLING_INFO_ID);
        response.setSessionId(sessionId);

        final Rule matchedRule = SingletonLaContainer.getComponent(RuleManager.class).getRule(response);
        if (matchedRule == null) {
            logger.warn("No url rule. Data: {}", dataMap);
            return null;
        }

        response.setRuleId(matchedRule.getRuleId());
        final ResponseProcessor processor = matchedRule.getResponseProcessor();
        if (!(processor instanceof DefaultResponseProcessor)) {
            logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}", processor, dataMap);
            return null;
        }

        final Transformer transformer = ((DefaultResponseProcessor) processor).getTransformer();
        final ResultData transformed = transformer.transform(response);
        final byte[] serialized = transformed.getData();
        if (serialized != null) {
            try {
                @SuppressWarnings("unchecked")
                final Map<String, Object> decoded = (Map<String, Object>) SerializeUtil.fromBinaryToObject(serialized);
                dataMap.putAll(decoded);
            } catch (final Exception e) {
                throw new CrawlerSystemException("Could not create an instance from bytes.", e);
            }
        }

        // Drop the fields that must not reach the index: configured names, or the two defaults.
        final String[] ignoreFields = paramMap.containsKey("ignore.field.names")
                ? paramMap.get("ignore.field.names").split(",")
                : new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
        stream(ignoreFields).of(names -> names.map(String::trim).forEach(name -> dataMap.remove(name)));

        indexUpdateCallback.store(paramMap, dataMap);
        return null;
    } catch (final ChildUrlsException e) {
        final String childUrls = e.getChildUrlList().stream().map(RequestData::getUrl).collect(Collectors.joining(", "));
        throw new DataStoreCrawlingException(url, "Redirected to " + childUrls, e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
Aggregations