Use of org.codelibs.fess.crawler.entity.UrlQueue in project fess-crawler by codelibs.
In the class CrawlerTest, method test_execute_2instanceTx:
public void test_execute_2instanceTx() throws Exception {
final CrawlerWebServer server1 = new CrawlerWebServer(7070);
server1.start();
final CrawlerWebServer server2 = new CrawlerWebServer(7071);
server2.start();
final String url1 = "http://localhost:7070/";
final String url2 = "http://localhost:7071/";
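// Two embedded web servers allow two Crawler instances to run concurrently in the same JVM;
// the include filter configured below keeps each crawl on its own seed host.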
try {
final int maxCount = 10;
final int numOfThread = 10;
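// Create a temporary directory for the FileTransformer to write crawled content into.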
final File file = File.createTempFile("crawler-", "");
file.delete();
file.mkdirs();
file.deleteOnExit();
fileTransformer.setPath(file.getAbsolutePath());
final Crawler crawler1 = getComponent(Crawler.class);
crawler1.setBackground(true);
((UrlFilterImpl) crawler1.urlFilter).setIncludeFilteringPattern("$1$2$3.*");
crawler1.addUrl(url1);
crawler1.getCrawlerContext().setMaxAccessCount(maxCount);
crawler1.getCrawlerContext().setNumOfThread(numOfThread);
Thread.sleep(100);
final Crawler crawler2 = getComponent(Crawler.class);
crawler2.setBackground(true);
((UrlFilterImpl) crawler2.urlFilter).setIncludeFilteringPattern("$1$2$3.*");
crawler2.addUrl(url2);
crawler2.getCrawlerContext().setMaxAccessCount(maxCount);
crawler2.getCrawlerContext().setNumOfThread(numOfThread);
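// execute() starts each crawl on background threads and returns the session id under which its results are stored.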
final String sessionId1 = crawler1.execute();
final String sessionId2 = crawler2.execute();
assertNotSame(sessionId1, sessionId2);
assertNotSame(crawler1.crawlerContext, crawler2.crawlerContext);
for (int i = 0; i < 10; i++) {
if (crawler1.crawlerContext.getStatus() == CrawlerStatus.RUNNING) {
break;
}
Thread.sleep(500);
}
assertEquals(CrawlerStatus.RUNNING, crawler1.crawlerContext.getStatus());
for (int i = 0; i < 10; i++) {
if (crawler2.crawlerContext.getStatus() == CrawlerStatus.RUNNING) {
break;
}
Thread.sleep(500);
}
assertEquals(CrawlerStatus.RUNNING, crawler2.crawlerContext.getStatus());
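// Wait for both background crawls to finish; each session should have stored exactly maxCount results.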
crawler1.awaitTermination();
crawler2.awaitTermination();
assertEquals(maxCount, dataService.getCount(sessionId1));
assertEquals(maxCount, dataService.getCount(sessionId2));
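// Any URLs still left in a session's queue must belong to that session's seed host.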
UrlQueue urlQueue;
while ((urlQueue = urlQueueService.poll(sessionId1)) != null) {
assertTrue(urlQueue.getUrl() + "=>" + url1, urlQueue.getUrl().startsWith(url1));
}
while ((urlQueue = urlQueueService.poll(sessionId2)) != null) {
assertTrue(urlQueue.getUrl() + "=>" + url2, urlQueue.getUrl().startsWith(url2));
}
dataService.iterate(sessionId1, accessResult -> assertTrue(accessResult.getUrl().startsWith(url1)));
dataService.iterate(sessionId2, accessResult -> assertTrue(accessResult.getUrl().startsWith(url2)));
dataService.delete(sessionId1);
dataService.delete(sessionId2);
} finally {
try {
server1.stop();
} finally {
server2.stop();
}
}
}
Use of org.codelibs.fess.crawler.entity.UrlQueue in project fess-crawler by codelibs.
In the class CrawlerTest, method setUp:
@Override
protected void setUp() throws Exception {
super.setUp();
final Map<String, String> featureMap = newHashMap();
featureMap.put("http://xml.org/sax/features/namespaces", "false");
final Map<String, String> propertyMap = newHashMap();
final Map<String, String> childUrlRuleMap = newHashMap();
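// Element-to-attribute rules telling the transformer where to find child URLs in crawled HTML.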
childUrlRuleMap.put("//A", "href");
childUrlRuleMap.put("//AREA", "href");
childUrlRuleMap.put("//FRAME", "src");
childUrlRuleMap.put("//IFRAME", "src");
childUrlRuleMap.put("//IMG", "src");
childUrlRuleMap.put("//LINK", "href");
childUrlRuleMap.put("//SCRIPT", "src");
container = new StandardCrawlerContainer();
container.<HcHttpClient>prototype("internalHttpClient", HcHttpClient.class, client -> {
client.setCookieSpec(CookieSpecs.BEST_MATCH);
client.setClientConnectionManager(container.getComponent("clientConnectionManager"));
}).prototype("httpClient", FaultTolerantClient.class, client -> {
client.setCrawlerClient(container.getComponent("internalHttpClient"));
client.setMaxRetryCount(5);
client.setRetryInterval(500);
}).prototype("fsClient", FileSystemClient.class).prototype("ruleManager", RuleManagerImpl.class, manager -> {
manager.addRule(container.getComponent("sitemapsRule"));
manager.addRule(container.getComponent("fileRule"));
}).prototype("accessResult", AccessResultImpl.class).prototype("urlQueue", UrlQueueImpl.class).prototype("crawlerThread", CrawlerThread.class).prototype("crawler", Crawler.class).prototype("urlFilterService", UrlFilterServiceImpl.class).prototype("urlQueueService", UrlQueueServiceImpl.class).prototype("dataService", DataServiceImpl.class).prototype("urlFilter", UrlFilterImpl.class).singleton("urlConvertHelper", UrlConvertHelper.class).singleton("intervalController", DefaultIntervalController.class).singleton("sitemapsHelper", SitemapsHelper.class).singleton("logHelper", LogHelperImpl.class).singleton("encodingHelper", EncodingHelper.class).singleton("contentLengthHelper", ContentLengthHelper.class).singleton("mimeTypeHelper", MimeTypeHelperImpl.class).<FileTransformer>singleton("fileTransformer", FileTransformer.class, transformer -> {
transformer.setName("fileTransformer");
transformer.setFeatureMap(featureMap);
transformer.setPropertyMap(propertyMap);
transformer.setChildUrlRuleMap(childUrlRuleMap);
}).singleton("dataHelper", MemoryDataHelper.class).singleton("robotsTxtHelper", RobotsTxtHelper.class).<CrawlerClientFactory>singleton("clientFactory", CrawlerClientFactory.class, factory -> {
factory.addClient("http:.*", container.getComponent("httpClient"));
factory.addClient("file:.*", container.getComponent("fsClient"));
}).singleton("tikaExtractor", TikaExtractor.class).<ExtractorFactory>singleton("extractorFactory", ExtractorFactory.class, factory -> {
TikaExtractor tikaExtractor = container.getComponent("tikaExtractor");
factory.addExtractor("text/plain", tikaExtractor);
factory.addExtractor("text/html", tikaExtractor);
}).singleton("httpClient", //
HcHttpClient.class).singleton("sitemapsResponseProcessor", //
SitemapsResponseProcessor.class).<SitemapsRule>singleton("sitemapsRule", SitemapsRule.class, rule -> {
rule.setResponseProcessor(container.getComponent("sitemapsResponseProcessor"));
rule.setRuleId("sitemapsRule");
rule.addRule("url", ".*sitemap.*");
}).<//
DefaultResponseProcessor>singleton("defaultResponseProcessor", DefaultResponseProcessor.class, processor -> {
processor.setTransformer(container.getComponent("fileTransformer"));
processor.setSuccessfulHttpCodes(new int[] { 200 });
processor.setNotModifiedHttpCodes(new int[] { 304 });
}).<//
RegexRule>singleton("fileRule", RegexRule.class, rule -> {
rule.setRuleId("fileRule");
rule.setDefaultRule(true);
rule.setResponseProcessor(container.getComponent("defaultResponseProcessor"));
}).<//
PoolingHttpClientConnectionManager>singleton("clientConnectionManager", new PoolingHttpClientConnectionManager(5, TimeUnit.MINUTES), manager -> {
manager.setMaxTotal(200);
manager.setDefaultMaxPerRoute(20);
});
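// Resolve the components that the tests use directly.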
crawler = container.getComponent("crawler");
dataService = container.getComponent("dataService");
urlQueueService = container.getComponent("urlQueueService");
fileTransformer = container.getComponent("fileTransformer");
}
Use of org.codelibs.fess.crawler.entity.UrlQueue in project fess-crawler by codelibs.
In the class CrawlerTest, method test_execute_2instance:
public void test_execute_2instance() throws Exception {
final CrawlerWebServer server1 = new CrawlerWebServer(7070);
server1.start();
final CrawlerWebServer server2 = new CrawlerWebServer(7071);
server2.start();
final String url1 = "http://localhost:7070/";
final String url2 = "http://localhost:7071/";
try {
final int maxCount = 10;
final int numOfThread = 10;
final File file = File.createTempFile("crawler-", "");
file.delete();
file.mkdirs();
file.deleteOnExit();
fileTransformer.setPath(file.getAbsolutePath());
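// Give each crawler a distinct session id so their queues and results do not collide.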
final Crawler crawler1 = container.getComponent("crawler");
crawler1.setSessionId(crawler1.getSessionId() + "1");
crawler1.setBackground(true);
((UrlFilterImpl) crawler1.urlFilter).setIncludeFilteringPattern("$1$2$3.*");
crawler1.addUrl(url1);
crawler1.getCrawlerContext().setMaxAccessCount(maxCount);
crawler1.getCrawlerContext().setNumOfThread(numOfThread);
final Crawler crawler2 = container.getComponent("crawler");
crawler2.setSessionId(crawler2.getSessionId() + "2");
crawler2.setBackground(true);
((UrlFilterImpl) crawler2.urlFilter).setIncludeFilteringPattern("$1$2$3.*");
crawler2.addUrl(url2);
crawler2.getCrawlerContext().setMaxAccessCount(maxCount);
crawler2.getCrawlerContext().setNumOfThread(numOfThread);
final String sessionId1 = crawler1.execute();
final String sessionId2 = crawler2.execute();
assertNotSame(sessionId1, sessionId2);
assertNotSame(crawler1.crawlerContext, crawler2.crawlerContext);
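// Give both background crawls a moment to start before asserting that they are running.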
Thread.sleep(1000);
assertEquals(CrawlerStatus.RUNNING, crawler1.crawlerContext.getStatus());
assertEquals(CrawlerStatus.RUNNING, crawler2.crawlerContext.getStatus());
crawler1.awaitTermination();
crawler2.awaitTermination();
assertEquals(maxCount, dataService.getCount(sessionId1));
assertEquals(maxCount, dataService.getCount(sessionId2));
UrlQueue urlQueue;
while ((urlQueue = urlQueueService.poll(sessionId1)) != null) {
assertTrue(urlQueue.getUrl().startsWith(url1));
}
while ((urlQueue = urlQueueService.poll(sessionId2)) != null) {
assertTrue(urlQueue.getUrl().startsWith(url2));
}
dataService.iterate(sessionId1, accessResult -> {
assertTrue(accessResult.getUrl().startsWith(url1));
assertEquals(Constants.GET_METHOD, accessResult.getMethod());
});
dataService.iterate(sessionId2, accessResult -> {
assertTrue(accessResult.getUrl().startsWith(url2));
assertEquals(Constants.GET_METHOD, accessResult.getMethod());
});
dataService.delete(sessionId1);
dataService.delete(sessionId2);
} finally {
try {
server1.stop();
} finally {
server2.stop();
}
}
}
Use of org.codelibs.fess.crawler.entity.UrlQueue in project fess by codelibs.
In the class FessCrawlerThread, method isContentUpdated:
@Override
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
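// With incremental crawling enabled, compare the queued URL with the already-indexed document
// to decide whether the content has to be fetched and processed again.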
if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
final long startTime = System.currentTimeMillis();
final FessConfig fessConfig = ComponentUtil.getFessConfig();
final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
final String url = urlQueue.getUrl();
ResponseData responseData = null;
try {
final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
final Map<String, Object> dataMap = new HashMap<>();
dataMap.put(fessConfig.getIndexFieldUrl(), url);
final List<String> roleTypeList = new ArrayList<>();
stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
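// For file-like protocols, directories are always crawled again, and role (permission)
// information may be read from the file itself via a HEAD request.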
if (url.startsWith("smb:") || url.startsWith("smb1:") || url.startsWith("file:") || url.startsWith("ftp:")) {
if (url.endsWith("/")) {
// directory
return true;
}
final PermissionHelper permissionHelper = ComponentUtil.getPermissionHelper();
if (fessConfig.isSmbRoleFromFile() || fessConfig.isFileRoleFromFile() || fessConfig.isFtpRoleFromFile()) {
// head method
responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
if (responseData == null) {
return true;
}
roleTypeList.addAll(permissionHelper.getSmbRoleTypeList(responseData));
roleTypeList.addAll(permissionHelper.getFileRoleTypeList(responseData));
roleTypeList.addAll(permissionHelper.getFtpRoleTypeList(responseData));
}
}
dataMap.put(fessConfig.getIndexFieldRole(), roleTypeList);
final String id = crawlingInfoHelper.generateId(dataMap);
if (logger.isDebugEnabled()) {
logger.debug("Searching indexed document: {}", id);
}
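// Look up the previously indexed document by the id derived from the URL and roles.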
final Map<String, Object> document = indexingHelper.getDocument(searchEngineClient, id,
        new String[] { fessConfig.getIndexFieldId(), fessConfig.getIndexFieldLastModified(),
                fessConfig.getIndexFieldAnchor(), fessConfig.getIndexFieldSegment(),
                fessConfig.getIndexFieldExpires(), fessConfig.getIndexFieldClickCount(),
                fessConfig.getIndexFieldFavoriteCount() });
if (document == null) {
storeChildUrlsToQueue(urlQueue, getChildUrlSet(searchEngineClient, id));
return true;
}
final Date expires = DocumentUtil.getValue(document, fessConfig.getIndexFieldExpires(), Date.class);
if (expires != null && expires.getTime() < System.currentTimeMillis()) {
final Object idValue = document.get(fessConfig.getIndexFieldId());
if (idValue != null && !indexingHelper.deleteDocument(searchEngineClient, idValue.toString())) {
logger.debug("Failed to delete expired document: {}", url);
}
return true;
}
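// Without a stored last-modified date there is nothing to compare, so crawl the URL again.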
final Date lastModified = DocumentUtil.getValue(document, fessConfig.getIndexFieldLastModified(), Date.class);
if (lastModified == null) {
return true;
}
urlQueue.setLastModified(lastModified.getTime());
log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);
if (responseData == null) {
// head method
responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
if (responseData == null) {
return true;
}
}
final int httpStatusCode = responseData.getHttpStatusCode();
if (logger.isDebugEnabled()) {
logger.debug("Accessing document: {}, status: {}", url, httpStatusCode);
}
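// The document no longer exists on the server (404): re-queue its recorded anchors and remove it from the index.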
if (httpStatusCode == 404) {
storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
if (!indexingHelper.deleteDocument(searchEngineClient, id)) {
logger.debug("Failed to delete 404 document: {}", url);
}
return false;
}
if (responseData.getLastModified() == null) {
return true;
}
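// Content not modified since the last crawl: record a NOT_MODIFIED result, re-queue the known
// anchors, and refresh the expiration date instead of processing the content again.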
if (responseData.getLastModified().getTime() <= lastModified.getTime() && httpStatusCode == 200) {
log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
responseData.setExecutionTime(System.currentTimeMillis() - startTime);
responseData.setParentUrl(urlQueue.getParentUrl());
responseData.setSessionId(crawlerContext.getSessionId());
responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
processResponse(urlQueue, responseData);
storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
if (documentExpires != null && !indexingHelper.updateDocument(searchEngineClient, id, fessConfig.getIndexFieldExpires(), documentExpires)) {
logger.debug("Failed to update {} at {}", fessConfig.getIndexFieldExpires(), url);
}
return false;
}
} finally {
if (responseData != null) {
CloseableUtil.closeQuietly(responseData);
}
}
}
return true;
}