Use of org.codelibs.fess.crawler.rule.Rule in project fess-crawler by codelibs.
From the class RuleManagerImplTest, method test_getRule_sitemaps4.
/**
 * Checks that a response whose URL ends in {@code .txt.gz} is resolved to the rule
 * with id {@code "sitemapsRule"}.
 */
public void test_getRule_sitemaps4() {
    final ResponseData responseData = new ResponseData();
    try {
        responseData.setUrl("http://www.example.com/sitemap1.txt.gz");
        // NOTE(review): the URL ends in .txt.gz while the fixture is sitemap1.xml.gz --
        // confirm whether this mismatch is intentional.
        final File file = ResourceUtil.getResourceAsFile("sitemaps/sitemap1.xml.gz");
        responseData.setResponseBody(file, false);
        final Rule rule = ruleManager.getRule(responseData);
        assertNotNull(rule);
        assertEquals("sitemapsRule", rule.getRuleId());
    } finally {
        // Release the response even when a lookup or assertion above throws.
        CloseableUtil.closeQuietly(responseData);
    }
}
Use of org.codelibs.fess.crawler.rule.Rule in project fess-crawler by codelibs.
From the class CrawlerThread, method processResponse.
/**
 * Looks up the rule for the given response and passes the response to that rule's processor.
 * Logs {@code NO_RULE} when no rule is found and {@code NO_RESPONSE_PROCESSOR} when the
 * rule has no processor attached.
 *
 * @param urlQueue the queue entry the response belongs to (used for logging only here)
 * @param responseData the response to dispatch; its rule id is set when a rule is found
 */
protected void processResponse(final UrlQueue<?> urlQueue, final ResponseData responseData) {
    final Rule matchedRule = crawlerContext.ruleManager.getRule(responseData);
    if (matchedRule == null) {
        log(logHelper, LogType.NO_RULE, crawlerContext, urlQueue, responseData);
        return;
    }

    // Record which rule handled the response before looking for its processor.
    responseData.setRuleId(matchedRule.getRuleId());
    final ResponseProcessor processor = matchedRule.getResponseProcessor();
    if (processor != null) {
        processor.process(responseData);
        return;
    }
    log(logHelper, LogType.NO_RESPONSE_PROCESSOR, crawlerContext, urlQueue, responseData, matchedRule);
}
Use of org.codelibs.fess.crawler.rule.Rule in project fess by codelibs.
From the class FileListIndexUpdateCallbackImpl, method processRequest.
/**
 * Fetches {@code url} with the given client, runs the response through the transformer of the
 * matching rule, merges the transformed data into {@code dataMap}, strips the ignored fields and
 * stores the result through the index update callback.
 *
 * @param paramMap crawl parameters; may carry {@code ignore.field.names} as a comma-separated list
 * @param dataMap document data; updated in place with the transformed response data
 * @param url the URL to request
 * @param client the client used to execute the request
 * @return the redirect location reported by the response, or {@code null} otherwise
 * @throws DataStoreCrawlingException when the request or any processing step fails
 */
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url, final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        final String redirectLocation = responseData.getRedirectLocation();
        if (redirectLocation != null) {
            // Let the caller follow the redirect instead of indexing this response.
            return redirectLocation;
        }

        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        // Prefer the session id carried by the document data; fall back to the crawling info id.
        final String sessionId = dataMap.containsKey(Constants.SESSION_ID) ? (String) dataMap.get(Constants.SESSION_ID)
                : paramMap.get(Constants.CRAWLING_INFO_ID);
        responseData.setSessionId(sessionId);

        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule matchedRule = ruleManager.getRule(responseData);
        if (matchedRule == null) {
            logger.warn("No url rule. Data: {}", dataMap);
            return null;
        }

        responseData.setRuleId(matchedRule.getRuleId());
        final ResponseProcessor responseProcessor = matchedRule.getResponseProcessor();
        if (!(responseProcessor instanceof DefaultResponseProcessor)) {
            logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}", responseProcessor, dataMap);
            return null;
        }

        final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
        final ResultData resultData = transformer.transform(responseData);
        final byte[] serialized = resultData.getData();
        if (serialized != null) {
            try {
                @SuppressWarnings("unchecked")
                final Map<String, Object> transformedMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(serialized);
                dataMap.putAll(transformedMap);
            } catch (final Exception e) {
                throw new CrawlerSystemException("Could not create an instance from bytes.", e);
            }
        }

        // Drop the fields that must not be stored in the index.
        final String[] ignoreFields = paramMap.containsKey("ignore.field.names") ? paramMap.get("ignore.field.names").split(",")
                : new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
        stream(ignoreFields).of(fields -> fields.map(String::trim).forEach(fieldName -> dataMap.remove(fieldName)));

        indexUpdateCallback.store(paramMap, dataMap);
        return null;
    } catch (final ChildUrlsException e) {
        throw new DataStoreCrawlingException(url, "Redirected to " + e.getChildUrlList().stream().map(RequestData::getUrl).collect(Collectors.joining(", ")), e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
Use of org.codelibs.fess.crawler.rule.Rule in project fess-crawler by codelibs.
From the class RuleManagerImplTest, method test_getRule_sitemaps3.
/**
 * Checks that a response whose URL ends in {@code .txt} and whose body is the
 * {@code sitemaps/sitemap1.txt} fixture is resolved to the rule with id {@code "sitemapsRule"}.
 */
public void test_getRule_sitemaps3() {
    final ResponseData responseData = new ResponseData();
    try {
        responseData.setUrl("http://www.example.com/sitemap1.txt");
        final File file = ResourceUtil.getResourceAsFile("sitemaps/sitemap1.txt");
        responseData.setResponseBody(file, false);
        final Rule rule = ruleManager.getRule(responseData);
        assertNotNull(rule);
        assertEquals("sitemapsRule", rule.getRuleId());
    } finally {
        // Release the response even when a lookup or assertion above throws.
        CloseableUtil.closeQuietly(responseData);
    }
}
Use of org.codelibs.fess.crawler.rule.Rule in project fess-crawler by codelibs.
From the class RuleManagerImplTest, method test_checkRule.
/**
 * Exercises {@code hasRule}, {@code addRule} and {@code removeRule} on the rule manager with two
 * separately constructed {@code RegexRule} instances, and checks the rule returned for an empty
 * response.
 */
public void test_checkRule() {
    // An empty response is expected to resolve to the rule with id "fileRule".
    final Rule defaultRule = ruleManager.getRule(new ResponseData());
    assertNotNull(defaultRule);
    assertEquals("fileRule", defaultRule.getRuleId());

    final RegexRule allRequiredRule = new RegexRule();
    allRequiredRule.setAllRequired(true);
    allRequiredRule.addRule("url", "http:.*");

    final RegexRule urlRule = new RegexRule();
    urlRule.addRule("url", "http:.*");

    // Neither rule is registered yet.
    assertFalse(ruleManager.hasRule(allRequiredRule));
    assertFalse(ruleManager.hasRule(urlRule));

    // Register the first rule only.
    ruleManager.addRule(allRequiredRule);
    assertTrue(ruleManager.hasRule(allRequiredRule));
    assertFalse(ruleManager.hasRule(urlRule));

    // Register the second rule as well.
    ruleManager.addRule(urlRule);
    assertTrue(ruleManager.hasRule(allRequiredRule));
    assertTrue(ruleManager.hasRule(urlRule));

    // Remove the first rule; the second must remain.
    ruleManager.removeRule(allRequiredRule);
    assertFalse(ruleManager.hasRule(allRequiredRule));
    assertTrue(ruleManager.hasRule(urlRule));

    // Remove the second rule; nothing added by this test remains.
    ruleManager.removeRule(urlRule);
    assertFalse(ruleManager.hasRule(allRequiredRule));
    assertFalse(ruleManager.hasRule(urlRule));
}
Aggregations