Use of org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor in project fess by codelibs.
Class FileListIndexUpdateCallbackImpl, method processRequest.
/**
 * Requests {@code url} with the given client, transforms the response through the matching
 * rule's transformer, merges the resulting fields into {@code dataMap} and passes the map to
 * the index update callback.
 *
 * @param paramMap crawl parameters; read for {@code Constants.CRAWLING_INFO_ID} and the optional
 *        comma-separated {@code "ignore.field.names"} entry
 * @param dataMap document fields; updated in place with the deserialized transformer output and
 *        stripped of the ignored fields before being stored
 * @param url the URL to request
 * @param client the crawler client that executes the GET request
 * @return the response's redirect location when it is non-null, otherwise {@code null}
 * @throws DataStoreCrawlingException if a {@code ChildUrlsException} or any other exception is
 *         raised while requesting, transforming or storing
 */
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url,
        final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            // Nothing is stored for a redirect; the caller receives the target instead.
            return responseData.getRedirectLocation();
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        // A session id already present in the document data wins over the crawling info id.
        if (dataMap.containsKey(Constants.SESSION_ID)) {
            responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
        } else {
            responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
        }
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            logger.warn("No url rule. Data: {}", dataMap);
        } else {
            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (responseProcessor instanceof DefaultResponseProcessor) {
                final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                final ResultData resultData = transformer.transform(responseData);
                final byte[] data = resultData.getData();
                if (data != null) {
                    try {
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                        dataMap.putAll(responseDataMap);
                    } catch (final Exception e) {
                        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                    }
                }
                // Remove the fields that must not be indexed. A missing or null
                // "ignore.field.names" value falls back to the default ignored fields
                // instead of failing on split().
                final String ignoreFieldNames = paramMap.get("ignore.field.names");
                final String[] ignoreFields;
                if (ignoreFieldNames != null) {
                    ignoreFields = ignoreFieldNames.split(",");
                } else {
                    ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
                }
                stream(ignoreFields).of(stream -> stream.map(String::trim).forEach(s -> dataMap.remove(s)));
                indexUpdateCallback.store(paramMap, dataMap);
            } else {
                logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}", responseProcessor,
                        dataMap);
            }
        }
        return null;
    } catch (final ChildUrlsException e) {
        throw new DataStoreCrawlingException(url,
                "Redirected to " + e.getChildUrlList().stream().map(r -> r.getUrl()).collect(Collectors.joining(", ")), e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
Use of org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor in project fess by codelibs.
Class FileListIndexUpdateCallbackImpl, method processRequest.
/**
 * Executes a GET for {@code url}, runs the response through the transformer of the matching
 * rule, folds the transformed fields into {@code dataMap} and stores the map via the index
 * update callback.
 *
 * @param paramMap crawl parameters (crawling info id, optional {@code "ignore.field.names"})
 * @param dataMap document fields, modified in place
 * @param url the URL to request
 * @param client the crawler client used for the request
 * @return the redirect location when the response carries one, otherwise {@code null}
 * @throws DataStoreCrawlingException wrapping any exception raised during processing
 */
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url,
        final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        final String redirectLocation = responseData.getRedirectLocation();
        if (redirectLocation != null) {
            return redirectLocation;
        }

        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        final String sessionId = dataMap.containsKey(Constants.SESSION_ID) //
                ? (String) dataMap.get(Constants.SESSION_ID) //
                : paramMap.get(Constants.CRAWLING_INFO_ID);
        responseData.setSessionId(sessionId);

        final Rule matchedRule = SingletonLaContainer.getComponent(RuleManager.class).getRule(responseData);
        if (matchedRule == null) {
            logger.warn("No url rule. Data: {}", dataMap);
            return null;
        }

        responseData.setRuleId(matchedRule.getRuleId());
        final ResponseProcessor responseProcessor = matchedRule.getResponseProcessor();
        if (!(responseProcessor instanceof DefaultResponseProcessor)) {
            logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}", responseProcessor,
                    dataMap);
            return null;
        }

        final DefaultResponseProcessor defaultProcessor = (DefaultResponseProcessor) responseProcessor;
        final byte[] serialized = defaultProcessor.getTransformer().transform(responseData).getData();
        if (serialized != null) {
            try {
                @SuppressWarnings("unchecked")
                final Map<String, Object> transformedFields = (Map<String, Object>) SerializeUtil.fromBinaryToObject(serialized);
                dataMap.putAll(transformedFields);
            } catch (final Exception cause) {
                throw new CrawlerSystemException("Could not create an instance from bytes.", cause);
            }
        }

        // Drop the fields that are not meant to be stored.
        final String[] fieldsToDrop = paramMap.containsKey("ignore.field.names") //
                ? paramMap.get("ignore.field.names").split(",") //
                : new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
        stream(fieldsToDrop).of(names -> names.map(String::trim).forEach(dataMap::remove));

        indexUpdateCallback.store(paramMap, dataMap);
        return null;
    } catch (final ChildUrlsException redirected) {
        final String targets = redirected.getChildUrlList().stream().map(RequestData::getUrl).collect(Collectors.joining(", "));
        throw new DataStoreCrawlingException(url, "Redirected to " + targets, redirected);
    } catch (final Exception failure) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, failure);
    }
}
Use of org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor in project fess-crawler by codelibs.
Class CrawlerTest, method setUp.
/**
 * Builds the {@code StandardCrawlerContainer} used by this test, registers the crawler
 * components on it in one fluent chain, and caches the components the test fields refer to
 * ({@code crawler}, {@code dataService}, {@code urlQueueService}, {@code fileTransformer}).
 *
 * @throws Exception propagated from {@code super.setUp()} or from component setup
 */
@Override
protected void setUp() throws Exception {
super.setUp();
// Feature map handed to the file transformer: the SAX "namespaces" feature is set to "false".
final Map<String, String> featureMap = newHashMap();
featureMap.put("http://xml.org/sax/features/namespaces", "false");
// Property map handed to the file transformer; left empty here.
final Map<String, String> propertyMap = newHashMap();
// Element path -> attribute name pairs handed to the file transformer
// (presumably the attributes child URLs are read from — confirm against FileTransformer).
final Map<String, String> childUrlRuleMap = newHashMap();
childUrlRuleMap.put("//A", "href");
childUrlRuleMap.put("//AREA", "href");
childUrlRuleMap.put("//FRAME", "src");
childUrlRuleMap.put("//IFRAME", "src");
childUrlRuleMap.put("//IMG", "src");
childUrlRuleMap.put("//LINK", "href");
childUrlRuleMap.put("//SCRIPT", "src");
container = new StandardCrawlerContainer();
// Component registration. The initializer lambdas look up other components by name from the
// same container, so the names used below must match the names registered in this chain.
// NOTE(review): "httpClient" is registered twice — first as a prototype FaultTolerantClient,
// later as a singleton HcHttpClient; which one getComponent("httpClient") yields depends on
// the container's semantics — confirm.
container.<HcHttpClient>prototype("internalHttpClient", HcHttpClient.class, client -> {
client.setCookieSpec(CookieSpecs.BEST_MATCH);
client.setClientConnectionManager(container.getComponent("clientConnectionManager"));
}).prototype("httpClient", FaultTolerantClient.class, client -> {
client.setCrawlerClient(container.getComponent("internalHttpClient"));
client.setMaxRetryCount(5);
client.setRetryInterval(500);
}).prototype("fsClient", FileSystemClient.class).prototype("ruleManager", RuleManagerImpl.class, manager -> {
manager.addRule(container.getComponent("sitemapsRule"));
manager.addRule(container.getComponent("fileRule"));
}).prototype("accessResult", AccessResultImpl.class).prototype("urlQueue", UrlQueueImpl.class).prototype("crawlerThread", CrawlerThread.class).prototype("crawler", Crawler.class).prototype("urlFilterService", UrlFilterServiceImpl.class).prototype("urlQueueService", UrlQueueServiceImpl.class).prototype("dataService", DataServiceImpl.class).prototype("urlFilter", UrlFilterImpl.class).singleton("urlConvertHelper", UrlConvertHelper.class).singleton("intervalController", DefaultIntervalController.class).singleton("sitemapsHelper", SitemapsHelper.class).singleton("logHelper", LogHelperImpl.class).singleton("encodingHelper", EncodingHelper.class).singleton("contentLengthHelper", ContentLengthHelper.class).singleton("mimeTypeHelper", MimeTypeHelperImpl.class).<FileTransformer>singleton("fileTransformer", FileTransformer.class, transformer -> {
transformer.setName("fileTransformer");
transformer.setFeatureMap(featureMap);
transformer.setPropertyMap(propertyMap);
transformer.setChildUrlRuleMap(childUrlRuleMap);
}).singleton("dataHelper", MemoryDataHelper.class).singleton("robotsTxtHelper", RobotsTxtHelper.class).<CrawlerClientFactory>singleton("clientFactory", CrawlerClientFactory.class, factory -> {
factory.addClient("http:.*", container.getComponent("httpClient"));
factory.addClient("file:.*", container.getComponent("fsClient"));
}).singleton("tikaExtractor", TikaExtractor.class).<ExtractorFactory>singleton("extractorFactory", ExtractorFactory.class, factory -> {
TikaExtractor tikaExtractor = container.getComponent("tikaExtractor");
factory.addExtractor("text/plain", tikaExtractor);
factory.addExtractor("text/html", tikaExtractor);
}).singleton("httpClient", //
HcHttpClient.class).singleton("sitemapsResponseProcessor", //
SitemapsResponseProcessor.class).<SitemapsRule>singleton("sitemapsRule", SitemapsRule.class, rule -> {
rule.setResponseProcessor(container.getComponent("sitemapsResponseProcessor"));
rule.setRuleId("sitemapsRule");
rule.addRule("url", ".*sitemap.*");
}).<//
DefaultResponseProcessor>singleton("defaultResponseProcessor", DefaultResponseProcessor.class, processor -> {
processor.setTransformer(container.getComponent("fileTransformer"));
processor.setSuccessfulHttpCodes(new int[] { 200 });
processor.setNotModifiedHttpCodes(new int[] { 304 });
}).<//
RegexRule>singleton("fileRule", RegexRule.class, rule -> {
rule.setRuleId("fileRule");
rule.setDefaultRule(true);
rule.setResponseProcessor(container.getComponent("defaultResponseProcessor"));
}).<//
PoolingHttpClientConnectionManager>singleton("clientConnectionManager", new PoolingHttpClientConnectionManager(5, TimeUnit.MINUTES), manager -> {
manager.setMaxTotal(200);
manager.setDefaultMaxPerRoute(20);
});
// Cache the components the test fields refer to.
crawler = container.getComponent("crawler");
dataService = container.getComponent("dataService");
urlQueueService = container.getComponent("urlQueueService");
fileTransformer = container.getComponent("fileTransformer");
}
Use of org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor in project fess by codelibs.
Class DocumentHelper, method processRequest.
/**
 * Requests {@code url} with the client resolved from the crawling configuration, transforms
 * the response with the matching rule's transformer and returns the deserialized result.
 *
 * @param crawlingConfig configuration used to initialize the crawler client factory
 * @param crawlingInfoId id set as the session id on the response; must not be blank
 * @param url the URL to request
 * @return the deserialized document map, or {@code null} when the transformer produced no data
 * @throws CrawlingAccessException if the id is blank, no client exists for the URL, or any
 *         exception (including a redirect) is raised while requesting or transforming
 */
public Map<String, Object> processRequest(final CrawlingConfig crawlingConfig, final String crawlingInfoId, final String url) {
    if (StringUtil.isBlank(crawlingInfoId)) {
        throw new CrawlingAccessException("sessionId is null.");
    }

    final CrawlerClientFactory clientFactory = crawlingConfig.initializeClientFactory(ComponentUtil::getCrawlerClientFactory);
    final CrawlerClient crawlerClient = clientFactory.getClient(url);
    if (crawlerClient == null) {
        throw new CrawlingAccessException("CrawlerClient is null for " + url);
    }

    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = crawlerClient.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        final String redirectLocation = responseData.getRedirectLocation();
        if (redirectLocation != null) {
            // Report the redirect target as a child URL; the catch below wraps this exception.
            final Set<RequestData> redirectTargets = new HashSet<>();
            redirectTargets.add(RequestDataBuilder.newRequestData().get().url(redirectLocation).build());
            throw new ChildUrlsException(redirectTargets, getClass().getName() + "#RedirectedFrom:" + url);
        }

        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        responseData.setSessionId(crawlingInfoId);

        final Rule matchedRule = SingletonLaContainer.getComponent(RuleManager.class).getRule(responseData);
        if (matchedRule == null) {
            throw new CrawlingAccessException("No url rule for " + url);
        }
        responseData.setRuleId(matchedRule.getRuleId());

        final ResponseProcessor responseProcessor = matchedRule.getResponseProcessor();
        if (!(responseProcessor instanceof DefaultResponseProcessor)) {
            throw new CrawlingAccessException(
                    "The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor + ", url: " + url);
        }

        final DefaultResponseProcessor defaultProcessor = (DefaultResponseProcessor) responseProcessor;
        final byte[] serialized = defaultProcessor.getTransformer().transform(responseData).getData();
        if (serialized == null) {
            return null;
        }
        try {
            return (Map<String, Object>) SerializeUtil.fromBinaryToObject(serialized);
        } catch (final Exception cause) {
            throw new CrawlerSystemException("Could not create an instance from bytes.", cause);
        }
    } catch (final Exception failure) {
        throw new CrawlingAccessException("Failed to parse " + url, failure);
    }
}
Aggregations