Usage of org.codelibs.fess.crawler.helper.SitemapsHelper in project fess-crawler by codelibs.
Class SitemapsRuleTest, method setUp.
/**
 * Prepares the rule under test.
 *
 * <p>A new {@code StandardCrawlerContainer} is populated with the
 * {@code "sitemapsHelper"} and {@code "sitemapsRule"} singletons, and the
 * {@code sitemapsRule} field is then resolved from that container.
 *
 * @throws Exception if the superclass set-up fails
 */
@Override
protected void setUp() throws Exception {
    super.setUp();

    final StandardCrawlerContainer crawlerContainer = new StandardCrawlerContainer()
            .singleton("sitemapsHelper", SitemapsHelper.class)
            .singleton("sitemapsRule", SitemapsRule.class);

    sitemapsRule = crawlerContainer.getComponent("sitemapsRule");
}
Usage of org.codelibs.fess.crawler.helper.SitemapsHelper in project fess-crawler by codelibs.
Class CrawlerTest, method setUp.
/**
 * Wires a complete crawler container for the tests and resolves the
 * components the test methods work with.
 *
 * <p>The method first prepares the three maps handed to the file transformer,
 * then registers every prototype and singleton component on a new
 * {@code StandardCrawlerContainer}, and finally looks up {@code "crawler"},
 * {@code "dataService"}, {@code "urlQueueService"} and
 * {@code "fileTransformer"} into the corresponding fields.
 *
 * @throws Exception if the superclass set-up fails
 */
@Override
protected void setUp() throws Exception {
    super.setUp();

    // Configuration passed to the "fileTransformer" component further down.
    final Map<String, String> features = newHashMap();
    features.put("http://xml.org/sax/features/namespaces", "false");
    final Map<String, String> properties = newHashMap();
    final Map<String, String> childUrlRules = newHashMap();
    childUrlRules.put("//A", "href");
    childUrlRules.put("//AREA", "href");
    childUrlRules.put("//FRAME", "src");
    childUrlRules.put("//IFRAME", "src");
    childUrlRules.put("//IMG", "src");
    childUrlRules.put("//LINK", "href");
    childUrlRules.put("//SCRIPT", "src");

    // The initializer lambdas below refer back to the container field, so it
    // has to be assigned before the registration chain starts.
    container = new StandardCrawlerContainer();
    container
            // ---- prototype registrations ----
            .<HcHttpClient>prototype("internalHttpClient", HcHttpClient.class, internalClient -> {
                internalClient.setCookieSpec(CookieSpecs.BEST_MATCH);
                internalClient.setClientConnectionManager(container.getComponent("clientConnectionManager"));
            })
            .prototype("httpClient", FaultTolerantClient.class, retryingClient -> {
                retryingClient.setCrawlerClient(container.getComponent("internalHttpClient"));
                retryingClient.setMaxRetryCount(5);
                retryingClient.setRetryInterval(500);
            })
            .prototype("fsClient", FileSystemClient.class)
            .prototype("ruleManager", RuleManagerImpl.class, rules -> {
                // The sitemaps rule is added ahead of the default file rule.
                rules.addRule(container.getComponent("sitemapsRule"));
                rules.addRule(container.getComponent("fileRule"));
            })
            .prototype("accessResult", AccessResultImpl.class)
            .prototype("urlQueue", UrlQueueImpl.class)
            .prototype("crawlerThread", CrawlerThread.class)
            .prototype("crawler", Crawler.class)
            .prototype("urlFilterService", UrlFilterServiceImpl.class)
            .prototype("urlQueueService", UrlQueueServiceImpl.class)
            .prototype("dataService", DataServiceImpl.class)
            .prototype("urlFilter", UrlFilterImpl.class)
            // ---- singleton registrations ----
            .singleton("urlConvertHelper", UrlConvertHelper.class)
            .singleton("intervalController", DefaultIntervalController.class)
            .singleton("sitemapsHelper", SitemapsHelper.class)
            .singleton("logHelper", LogHelperImpl.class)
            .singleton("encodingHelper", EncodingHelper.class)
            .singleton("contentLengthHelper", ContentLengthHelper.class)
            .singleton("mimeTypeHelper", MimeTypeHelperImpl.class)
            .<FileTransformer>singleton("fileTransformer", FileTransformer.class, fileTx -> {
                fileTx.setName("fileTransformer");
                fileTx.setFeatureMap(features);
                fileTx.setPropertyMap(properties);
                fileTx.setChildUrlRuleMap(childUrlRules);
            })
            .singleton("dataHelper", MemoryDataHelper.class)
            .singleton("robotsTxtHelper", RobotsTxtHelper.class)
            .<CrawlerClientFactory>singleton("clientFactory", CrawlerClientFactory.class, clients -> {
                clients.addClient("http:.*", container.getComponent("httpClient"));
                clients.addClient("file:.*", container.getComponent("fsClient"));
            })
            .singleton("tikaExtractor", TikaExtractor.class)
            .<ExtractorFactory>singleton("extractorFactory", ExtractorFactory.class, extractors -> {
                TikaExtractor tika = container.getComponent("tikaExtractor");
                extractors.addExtractor("text/plain", tika);
                extractors.addExtractor("text/html", tika);
            })
            // NOTE(review): "httpClient" is also registered as a prototype
            // (FaultTolerantClient) earlier in this chain. Which registration
            // getComponent("httpClient") resolves depends on
            // StandardCrawlerContainer - confirm the duplicate is intended.
            .singleton("httpClient", HcHttpClient.class)
            .singleton("sitemapsResponseProcessor", SitemapsResponseProcessor.class)
            .<SitemapsRule>singleton("sitemapsRule", SitemapsRule.class, sitemaps -> {
                sitemaps.setResponseProcessor(container.getComponent("sitemapsResponseProcessor"));
                sitemaps.setRuleId("sitemapsRule");
                sitemaps.addRule("url", ".*sitemap.*");
            })
            .<DefaultResponseProcessor>singleton("defaultResponseProcessor", DefaultResponseProcessor.class, responses -> {
                responses.setTransformer(container.getComponent("fileTransformer"));
                responses.setSuccessfulHttpCodes(new int[] { 200 });
                responses.setNotModifiedHttpCodes(new int[] { 304 });
            })
            .<RegexRule>singleton("fileRule", RegexRule.class, fallbackRule -> {
                fallbackRule.setRuleId("fileRule");
                fallbackRule.setDefaultRule(true);
                fallbackRule.setResponseProcessor(container.getComponent("defaultResponseProcessor"));
            })
            .<PoolingHttpClientConnectionManager>singleton("clientConnectionManager",
                    new PoolingHttpClientConnectionManager(5, TimeUnit.MINUTES), connections -> {
                        connections.setMaxTotal(200);
                        connections.setDefaultMaxPerRoute(20);
                    });

    // Components the individual tests interact with directly.
    crawler = container.getComponent("crawler");
    dataService = container.getComponent("dataService");
    urlQueueService = container.getComponent("urlQueueService");
    fileTransformer = container.getComponent("fileTransformer");
}
Usage of org.codelibs.fess.crawler.helper.SitemapsHelper in project fess-crawler by codelibs.
Class SitemapsResponseProcessor, method process.
/**
 * Parses the response body with the {@code "sitemapsHelper"} component and
 * reports every sitemap location it contains as a child URL.
 *
 * <p>This method does not return normally. After the body has been parsed it
 * throws a {@link ChildUrlsException} holding one GET request per non-null
 * sitemap entry, in the order the entries were returned by the parser.
 *
 * @param responseData the response whose body is read as sitemap content
 * @throws ChildUrlsException always, once parsing has completed
 * @throws IORuntimeException if an {@link IOException} is raised while the
 *         body is being read, parsed or closed
 */
@Override
public void process(final ResponseData responseData) {
    final SitemapsHelper helper = crawlerContainer.getComponent("sitemapsHelper");
    try (final InputStream body = responseData.getResponseBody()) {
        final SitemapSet parsed = helper.parse(body);

        // LinkedHashSet keeps insertion order while dropping duplicate requests.
        final Set<RequestData> childRequests = new LinkedHashSet<>();
        for (final Sitemap entry : parsed.getSitemaps()) {
            if (entry == null) {
                continue;
            }
            final RequestData request = RequestDataBuilder.newRequestData()
                    .get()
                    .url(entry.getLoc())
                    .build();
            childRequests.add(request);
        }

        // The collected URLs are handed back to the caller via the exception.
        final String origin = getClass().getName() + "#process";
        throw new ChildUrlsException(childRequests, origin);
    } catch (final IOException e) {
        throw new IORuntimeException(e);
    }
}
Usage of org.codelibs.fess.crawler.helper.SitemapsHelper in project fess-crawler by codelibs.
Class RuleManagerImplTest, method setUp.
/**
 * Prepares the rule manager under test with two rules.
 *
 * <p>A new {@code StandardCrawlerContainer} is populated with the helper, both
 * rules and the rule manager as singletons. The sitemaps rule (matching URLs
 * against {@code .*sitemap.*}) is configured and added first, followed by the
 * file rule, which is flagged as the default rule.
 *
 * @throws Exception if the superclass set-up fails
 */
@Override
protected void setUp() throws Exception {
    super.setUp();

    final StandardCrawlerContainer crawlerContainer = new StandardCrawlerContainer()
            .singleton("sitemapsHelper", SitemapsHelper.class)
            .singleton("sitemapsRule", SitemapsRule.class)
            .singleton("fileRule", RegexRule.class)
            .singleton("ruleManager", RuleManagerImpl.class);

    ruleManager = crawlerContainer.getComponent("ruleManager");

    // Rule for sitemap URLs; added to the manager before the default rule.
    final SitemapsRule sitemaps = crawlerContainer.getComponent("sitemapsRule");
    sitemaps.setRuleId("sitemapsRule");
    sitemaps.addRule("url", ".*sitemap.*");
    ruleManager.addRule(sitemaps);

    // Default rule, registered second.
    final RegexRule fallback = crawlerContainer.getComponent("fileRule");
    fallback.setRuleId("fileRule");
    fallback.setDefaultRule(true);
    ruleManager.addRule(fallback);
}
Aggregations