Search in sources :

Example 6 with UrlQueue

Use of org.codelibs.fess.crawler.entity.UrlQueue in the project fess-crawler by codelibs:

In the class CrawlerTest, the method test_execute_2instanceTx is defined as follows:

public void test_execute_2instanceTx() throws Exception {
    // Spin up two independent HTTP servers so each crawler instance has its own site.
    final CrawlerWebServer webServer1 = new CrawlerWebServer(7070);
    webServer1.start();
    final CrawlerWebServer webServer2 = new CrawlerWebServer(7071);
    webServer2.start();
    final String baseUrl1 = "http://localhost:7070/";
    final String baseUrl2 = "http://localhost:7071/";
    try {
        final int accessLimit = 10;
        final int threadCount = 10;
        // Create a fresh temporary directory for the file transformer output.
        final File outputDir = File.createTempFile("crawler-", "");
        outputDir.delete();
        outputDir.mkdirs();
        outputDir.deleteOnExit();
        fileTransformer.setPath(outputDir.getAbsolutePath());
        // First crawler crawls server 1 only, in the background.
        final Crawler crawler1 = getComponent(Crawler.class);
        crawler1.setBackground(true);
        ((UrlFilterImpl) crawler1.urlFilter).setIncludeFilteringPattern("$1$2$3.*");
        crawler1.addUrl(baseUrl1);
        crawler1.getCrawlerContext().setMaxAccessCount(accessLimit);
        crawler1.getCrawlerContext().setNumOfThread(threadCount);
        Thread.sleep(100);
        // Second crawler crawls server 2 only, also in the background.
        final Crawler crawler2 = getComponent(Crawler.class);
        crawler2.setBackground(true);
        ((UrlFilterImpl) crawler2.urlFilter).setIncludeFilteringPattern("$1$2$3.*");
        crawler2.addUrl(baseUrl2);
        crawler2.getCrawlerContext().setMaxAccessCount(accessLimit);
        crawler2.getCrawlerContext().setNumOfThread(threadCount);
        final String sessionId1 = crawler1.execute();
        final String sessionId2 = crawler2.execute();
        // The two runs must have distinct sessions and distinct contexts.
        assertNotSame(sessionId1, sessionId2);
        assertNotSame(crawler1.crawlerContext, crawler2.crawlerContext);
        // Wait (up to ~5s each) until both crawlers report RUNNING.
        int waits1 = 0;
        while (waits1 < 10 && crawler1.crawlerContext.getStatus() != CrawlerStatus.RUNNING) {
            Thread.sleep(500);
            waits1++;
        }
        assertEquals(CrawlerStatus.RUNNING, crawler1.crawlerContext.getStatus());
        int waits2 = 0;
        while (waits2 < 10 && crawler2.crawlerContext.getStatus() != CrawlerStatus.RUNNING) {
            Thread.sleep(500);
            waits2++;
        }
        assertEquals(CrawlerStatus.RUNNING, crawler2.crawlerContext.getStatus());
        crawler1.awaitTermination();
        crawler2.awaitTermination();
        // Each session should have stored exactly the configured number of results.
        assertEquals(accessLimit, dataService.getCount(sessionId1));
        assertEquals(accessLimit, dataService.getCount(sessionId2));
        // Every queued URL must belong to the session's own site — no cross-talk.
        for (UrlQueue queued = urlQueueService.poll(sessionId1); queued != null; queued = urlQueueService.poll(sessionId1)) {
            assertTrue(queued.getUrl() + "=>" + baseUrl1, queued.getUrl().startsWith(baseUrl1));
        }
        for (UrlQueue queued = urlQueueService.poll(sessionId2); queued != null; queued = urlQueueService.poll(sessionId2)) {
            assertTrue(queued.getUrl() + "=>" + baseUrl2, queued.getUrl().startsWith(baseUrl2));
        }
        // Stored access results must also stay within each session's own site.
        dataService.iterate(sessionId1, accessResult -> assertTrue(accessResult.getUrl().startsWith(baseUrl1)));
        dataService.iterate(sessionId2, accessResult -> assertTrue(accessResult.getUrl().startsWith(baseUrl2)));
        dataService.delete(sessionId1);
        dataService.delete(sessionId2);
    } finally {
        // Stop both servers even if the first stop throws.
        try {
            webServer1.stop();
        } finally {
            webServer2.stop();
        }
    }
}
Also used : UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) UrlFilterImpl(org.codelibs.fess.crawler.filter.impl.UrlFilterImpl) CrawlerWebServer(org.codelibs.fess.crawler.util.CrawlerWebServer) Crawler(org.codelibs.fess.crawler.Crawler) File(java.io.File)

Example 7 with UrlQueue

Use of org.codelibs.fess.crawler.entity.UrlQueue in the project fess-crawler by codelibs:

In the class CrawlerTest, the setUp method is defined as follows:

@Override
protected void setUp() throws Exception {
    super.setUp();
    // SAX parser feature/property maps passed to the FileTransformer below;
    // namespaces are disabled so XPath rules match un-prefixed element names.
    final Map<String, String> featureMap = newHashMap();
    featureMap.put("http://xml.org/sax/features/namespaces", "false");
    final Map<String, String> propertyMap = newHashMap();
    // XPath -> attribute pairs telling the transformer where to extract child URLs.
    final Map<String, String> childUrlRuleMap = newHashMap();
    childUrlRuleMap.put("//A", "href");
    childUrlRuleMap.put("//AREA", "href");
    childUrlRuleMap.put("//FRAME", "src");
    childUrlRuleMap.put("//IFRAME", "src");
    childUrlRuleMap.put("//IMG", "src");
    childUrlRuleMap.put("//LINK", "href");
    childUrlRuleMap.put("//SCRIPT", "src");
    // Build the DI container for the tests. Registration is one fluent chain;
    // "prototype" components are created per lookup, "singleton" ones are shared.
    container = new StandardCrawlerContainer();
    // HTTP client: a fault-tolerant wrapper (5 retries, 500ms interval) around HcHttpClient.
    container.<HcHttpClient>prototype("internalHttpClient", HcHttpClient.class, client -> {
        client.setCookieSpec(CookieSpecs.BEST_MATCH);
        client.setClientConnectionManager(container.getComponent("clientConnectionManager"));
    }).prototype("httpClient", FaultTolerantClient.class, client -> {
        client.setCrawlerClient(container.getComponent("internalHttpClient"));
        client.setMaxRetryCount(5);
        client.setRetryInterval(500);
    }).prototype("fsClient", FileSystemClient.class).prototype("ruleManager", RuleManagerImpl.class, manager -> {
        // Rule order matters: sitemaps rule is consulted before the default file rule.
        manager.addRule(container.getComponent("sitemapsRule"));
        manager.addRule(container.getComponent("fileRule"));
    }).prototype("accessResult", AccessResultImpl.class).prototype("urlQueue", UrlQueueImpl.class).prototype("crawlerThread", CrawlerThread.class).prototype("crawler", Crawler.class).prototype("urlFilterService", UrlFilterServiceImpl.class).prototype("urlQueueService", UrlQueueServiceImpl.class).prototype("dataService", DataServiceImpl.class).prototype("urlFilter", UrlFilterImpl.class).singleton("urlConvertHelper", UrlConvertHelper.class).singleton("intervalController", DefaultIntervalController.class).singleton("sitemapsHelper", SitemapsHelper.class).singleton("logHelper", LogHelperImpl.class).singleton("encodingHelper", EncodingHelper.class).singleton("contentLengthHelper", ContentLengthHelper.class).singleton("mimeTypeHelper", MimeTypeHelperImpl.class).<FileTransformer>singleton("fileTransformer", FileTransformer.class, transformer -> {
        // The transformer writes fetched content to files and extracts child URLs
        // using the feature/property/rule maps built above.
        transformer.setName("fileTransformer");
        transformer.setFeatureMap(featureMap);
        transformer.setPropertyMap(propertyMap);
        transformer.setChildUrlRuleMap(childUrlRuleMap);
    }).singleton("dataHelper", MemoryDataHelper.class).singleton("robotsTxtHelper", RobotsTxtHelper.class).<CrawlerClientFactory>singleton("clientFactory", CrawlerClientFactory.class, factory -> {
        // Scheme routing: http(s) goes to the fault-tolerant client, file: to the FS client.
        factory.addClient("http:.*", container.getComponent("httpClient"));
        factory.addClient("file:.*", container.getComponent("fsClient"));
    }).singleton("tikaExtractor", TikaExtractor.class).<ExtractorFactory>singleton("extractorFactory", ExtractorFactory.class, factory -> {
        TikaExtractor tikaExtractor = container.getComponent("tikaExtractor");
        factory.addExtractor("text/plain", tikaExtractor);
        factory.addExtractor("text/html", tikaExtractor);
    }).singleton("httpClient", // 
    HcHttpClient.class).singleton("sitemapsResponseProcessor", // 
    SitemapsResponseProcessor.class).<SitemapsRule>singleton("sitemapsRule", SitemapsRule.class, rule -> {
        // URLs matching ".*sitemap.*" are handled by the sitemaps processor.
        rule.setResponseProcessor(container.getComponent("sitemapsResponseProcessor"));
        rule.setRuleId("sitemapsRule");
        rule.addRule("url", ".*sitemap.*");
    }).<// 
    DefaultResponseProcessor>singleton("defaultResponseProcessor", DefaultResponseProcessor.class, processor -> {
        processor.setTransformer(container.getComponent("fileTransformer"));
        processor.setSuccessfulHttpCodes(new int[] { 200 });
        processor.setNotModifiedHttpCodes(new int[] { 304 });
    }).<// 
    RegexRule>singleton("fileRule", RegexRule.class, rule -> {
        // Catch-all default rule: anything not matched above goes to the default processor.
        rule.setRuleId("fileRule");
        rule.setDefaultRule(true);
        rule.setResponseProcessor(container.getComponent("defaultResponseProcessor"));
    }).<// 
    PoolingHttpClientConnectionManager>singleton("clientConnectionManager", new PoolingHttpClientConnectionManager(5, TimeUnit.MINUTES), manager -> {
        manager.setMaxTotal(200);
        manager.setDefaultMaxPerRoute(20);
    });
    // Resolve the components the test methods use directly.
    crawler = container.getComponent("crawler");
    dataService = container.getComponent("dataService");
    urlQueueService = container.getComponent("urlQueueService");
    fileTransformer = container.getComponent("fileTransformer");
}
Also used : StandardCrawlerContainer(org.codelibs.fess.crawler.container.StandardCrawlerContainer) MimeTypeHelperImpl(org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl) HcHttpClient(org.codelibs.fess.crawler.client.http.HcHttpClient) UrlQueueImpl(org.codelibs.fess.crawler.entity.UrlQueueImpl) PlainTestCase(org.dbflute.utflute.core.PlainTestCase) ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) FileSystemClient(org.codelibs.fess.crawler.client.fs.FileSystemClient) DataService(org.codelibs.fess.crawler.service.DataService) CookieSpecs(org.apache.http.client.config.CookieSpecs) SitemapsHelper(org.codelibs.fess.crawler.helper.SitemapsHelper) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) RuleManagerImpl(org.codelibs.fess.crawler.rule.impl.RuleManagerImpl) DataServiceImpl(org.codelibs.fess.crawler.service.impl.DataServiceImpl) RegexRule(org.codelibs.fess.crawler.rule.impl.RegexRule) FaultTolerantClient(org.codelibs.fess.crawler.client.FaultTolerantClient) LogHelperImpl(org.codelibs.fess.crawler.helper.impl.LogHelperImpl) SitemapsRule(org.codelibs.fess.crawler.rule.impl.SitemapsRule) Map(java.util.Map) UrlFilterServiceImpl(org.codelibs.fess.crawler.service.impl.UrlFilterServiceImpl) PoolingHttpClientConnectionManager(org.apache.http.impl.conn.PoolingHttpClientConnectionManager) CrawlerWebServer(org.codelibs.fess.crawler.util.CrawlerWebServer) ContentLengthHelper(org.codelibs.fess.crawler.helper.ContentLengthHelper) EncodingHelper(org.codelibs.fess.crawler.helper.EncodingHelper) RobotsTxtHelper(org.codelibs.fess.crawler.helper.RobotsTxtHelper) AccessResultImpl(org.codelibs.fess.crawler.entity.AccessResultImpl) SitemapsResponseProcessor(org.codelibs.fess.crawler.processor.impl.SitemapsResponseProcessor) UrlFilterImpl(org.codelibs.fess.crawler.filter.impl.UrlFilterImpl) MemoryDataHelper(org.codelibs.fess.crawler.helper.MemoryDataHelper) 
DefaultIntervalController(org.codelibs.fess.crawler.interval.impl.DefaultIntervalController) ResourceUtil(org.codelibs.core.io.ResourceUtil) TikaExtractor(org.codelibs.fess.crawler.extractor.impl.TikaExtractor) File(java.io.File) FileTransformer(org.codelibs.fess.crawler.transformer.impl.FileTransformer) TimeUnit(java.util.concurrent.TimeUnit) UrlQueueService(org.codelibs.fess.crawler.service.UrlQueueService) UrlConvertHelper(org.codelibs.fess.crawler.helper.UrlConvertHelper) UrlQueueServiceImpl(org.codelibs.fess.crawler.service.impl.UrlQueueServiceImpl) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) LogHelperImpl(org.codelibs.fess.crawler.helper.impl.LogHelperImpl) ContentLengthHelper(org.codelibs.fess.crawler.helper.ContentLengthHelper) HcHttpClient(org.codelibs.fess.crawler.client.http.HcHttpClient) DefaultIntervalController(org.codelibs.fess.crawler.interval.impl.DefaultIntervalController) RobotsTxtHelper(org.codelibs.fess.crawler.helper.RobotsTxtHelper) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) StandardCrawlerContainer(org.codelibs.fess.crawler.container.StandardCrawlerContainer) UrlQueueImpl(org.codelibs.fess.crawler.entity.UrlQueueImpl) RuleManagerImpl(org.codelibs.fess.crawler.rule.impl.RuleManagerImpl) PoolingHttpClientConnectionManager(org.apache.http.impl.conn.PoolingHttpClientConnectionManager) FaultTolerantClient(org.codelibs.fess.crawler.client.FaultTolerantClient) FileTransformer(org.codelibs.fess.crawler.transformer.impl.FileTransformer) UrlFilterImpl(org.codelibs.fess.crawler.filter.impl.UrlFilterImpl) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) SitemapsRule(org.codelibs.fess.crawler.rule.impl.SitemapsRule) TikaExtractor(org.codelibs.fess.crawler.extractor.impl.TikaExtractor) 
RegexRule(org.codelibs.fess.crawler.rule.impl.RegexRule) UrlQueueServiceImpl(org.codelibs.fess.crawler.service.impl.UrlQueueServiceImpl)

Example 8 with UrlQueue

Use of org.codelibs.fess.crawler.entity.UrlQueue in the project fess-crawler by codelibs:

In the class CrawlerTest, the method test_execute_2instance is defined as follows:

public void test_execute_2instance() throws Exception {
    // Two independent HTTP servers, one per crawler instance.
    final CrawlerWebServer webServer1 = new CrawlerWebServer(7070);
    webServer1.start();
    final CrawlerWebServer webServer2 = new CrawlerWebServer(7071);
    webServer2.start();
    final String baseUrl1 = "http://localhost:7070/";
    final String baseUrl2 = "http://localhost:7071/";
    try {
        final int accessLimit = 10;
        final int threadCount = 10;
        // Fresh temporary directory for the file transformer output.
        final File outputDir = File.createTempFile("crawler-", "");
        outputDir.delete();
        outputDir.mkdirs();
        outputDir.deleteOnExit();
        fileTransformer.setPath(outputDir.getAbsolutePath());
        // First crawler: distinct session id suffix "1", crawls server 1 in the background.
        final Crawler crawler1 = container.getComponent("crawler");
        crawler1.setSessionId(crawler1.getSessionId() + "1");
        crawler1.setBackground(true);
        ((UrlFilterImpl) crawler1.urlFilter).setIncludeFilteringPattern("$1$2$3.*");
        crawler1.addUrl(baseUrl1);
        crawler1.getCrawlerContext().setMaxAccessCount(accessLimit);
        crawler1.getCrawlerContext().setNumOfThread(threadCount);
        // Second crawler: suffix "2", crawls server 2 in the background.
        final Crawler crawler2 = container.getComponent("crawler");
        crawler2.setSessionId(crawler2.getSessionId() + "2");
        crawler2.setBackground(true);
        ((UrlFilterImpl) crawler2.urlFilter).setIncludeFilteringPattern("$1$2$3.*");
        crawler2.addUrl(baseUrl2);
        crawler2.getCrawlerContext().setMaxAccessCount(accessLimit);
        crawler2.getCrawlerContext().setNumOfThread(threadCount);
        final String sessionId1 = crawler1.execute();
        final String sessionId2 = crawler2.execute();
        // Sessions and contexts must be distinct between the two instances.
        assertNotSame(sessionId1, sessionId2);
        assertNotSame(crawler1.crawlerContext, crawler2.crawlerContext);
        // Give both crawlers a moment to start, then verify they are running.
        Thread.sleep(1000);
        assertEquals(CrawlerStatus.RUNNING, crawler1.crawlerContext.getStatus());
        assertEquals(CrawlerStatus.RUNNING, crawler2.crawlerContext.getStatus());
        crawler1.awaitTermination();
        crawler2.awaitTermination();
        // Each session stores exactly the configured number of results.
        assertEquals(accessLimit, dataService.getCount(sessionId1));
        assertEquals(accessLimit, dataService.getCount(sessionId2));
        // Queued URLs must stay within each session's own site.
        for (UrlQueue queued = urlQueueService.poll(sessionId1); queued != null; queued = urlQueueService.poll(sessionId1)) {
            assertTrue(queued.getUrl().startsWith(baseUrl1));
        }
        for (UrlQueue queued = urlQueueService.poll(sessionId2); queued != null; queued = urlQueueService.poll(sessionId2)) {
            assertTrue(queued.getUrl().startsWith(baseUrl2));
        }
        // Stored results belong to the right site and were fetched via GET.
        dataService.iterate(sessionId1, accessResult -> {
            assertTrue(accessResult.getUrl().startsWith(baseUrl1));
            assertEquals(Constants.GET_METHOD, accessResult.getMethod());
        });
        dataService.iterate(sessionId2, accessResult -> {
            assertTrue(accessResult.getUrl().startsWith(baseUrl2));
            assertEquals(Constants.GET_METHOD, accessResult.getMethod());
        });
        dataService.delete(sessionId1);
        dataService.delete(sessionId2);
    } finally {
        // Stop both servers even if the first stop throws.
        try {
            webServer1.stop();
        } finally {
            webServer2.stop();
        }
    }
}
Also used : UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) UrlFilterImpl(org.codelibs.fess.crawler.filter.impl.UrlFilterImpl) CrawlerWebServer(org.codelibs.fess.crawler.util.CrawlerWebServer) File(java.io.File)

Example 9 with UrlQueue

Use of org.codelibs.fess.crawler.entity.UrlQueue in the project fess by codelibs:

In the class FessCrawlerThread, the method isContentUpdated is defined as follows:

@Override
// Decides whether the queued URL's content has changed since it was last indexed.
// Returns true when the URL should be (re-)crawled, false when the indexed copy is
// current (NOT_MODIFIED) or the document was removed as a 404.
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
    if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
        final long startTime = System.currentTimeMillis();
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
        final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
        final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
        final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
        final String url = urlQueue.getUrl();
        ResponseData responseData = null;
        try {
            final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
            // Build the lookup key (url + roles) used to locate the indexed document.
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.put(fessConfig.getIndexFieldUrl(), url);
            final List<String> roleTypeList = new ArrayList<>();
            stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
            if (url.startsWith("smb:") || url.startsWith("smb1:") || url.startsWith("file:") || url.startsWith("ftp:")) {
                if (url.endsWith("/")) {
                    // directory
                    return true;
                }
                final PermissionHelper permissionHelper = ComponentUtil.getPermissionHelper();
                if (fessConfig.isSmbRoleFromFile() || fessConfig.isFileRoleFromFile() || fessConfig.isFtpRoleFromFile()) {
                    // head method
                    // Roles come from the file itself, so a HEAD request is issued here;
                    // the same responseData is reused below to avoid a second HEAD.
                    responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                    if (responseData == null) {
                        return true;
                    }
                    roleTypeList.addAll(permissionHelper.getSmbRoleTypeList(responseData));
                    roleTypeList.addAll(permissionHelper.getFileRoleTypeList(responseData));
                    roleTypeList.addAll(permissionHelper.getFtpRoleTypeList(responseData));
                }
            }
            dataMap.put(fessConfig.getIndexFieldRole(), roleTypeList);
            final String id = crawlingInfoHelper.generateId(dataMap);
            if (logger.isDebugEnabled()) {
                logger.debug("Searching indexed document: {}", id);
            }
            // Fetch only the fields needed for the freshness decision.
            final Map<String, Object> document = indexingHelper.getDocument(searchEngineClient, id, new String[] { fessConfig.getIndexFieldId(), fessConfig.getIndexFieldLastModified(), fessConfig.getIndexFieldAnchor(), fessConfig.getIndexFieldSegment(), fessConfig.getIndexFieldExpires(), fessConfig.getIndexFieldClickCount(), fessConfig.getIndexFieldFavoriteCount() });
            if (document == null) {
                // Not indexed yet: queue its previously-known child URLs and crawl it.
                storeChildUrlsToQueue(urlQueue, getChildUrlSet(searchEngineClient, id));
                return true;
            }
            // An expired document is deleted and re-crawled.
            final Date expires = DocumentUtil.getValue(document, fessConfig.getIndexFieldExpires(), Date.class);
            if (expires != null && expires.getTime() < System.currentTimeMillis()) {
                final Object idValue = document.get(fessConfig.getIndexFieldId());
                if (idValue != null && !indexingHelper.deleteDocument(searchEngineClient, idValue.toString())) {
                    logger.debug("Failed to delete expired document: {}", url);
                }
                return true;
            }
            final Date lastModified = DocumentUtil.getValue(document, fessConfig.getIndexFieldLastModified(), Date.class);
            if (lastModified == null) {
                // No timestamp to compare against — crawl unconditionally.
                return true;
            }
            urlQueue.setLastModified(lastModified.getTime());
            log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);
            if (responseData == null) {
                // head method
                responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                if (responseData == null) {
                    return true;
                }
            }
            final int httpStatusCode = responseData.getHttpStatusCode();
            if (logger.isDebugEnabled()) {
                logger.debug("Accessing document: {}, status: {}", url, httpStatusCode);
            }
            if (httpStatusCode == 404) {
                // Gone: queue its known anchors for re-check, drop the indexed copy,
                // and skip crawling this URL.
                storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
                if (!indexingHelper.deleteDocument(searchEngineClient, id)) {
                    logger.debug("Failed to delete 404 document: {}", url);
                }
                return false;
            }
            if (responseData.getLastModified() == null) {
                return true;
            }
            if (responseData.getLastModified().getTime() <= lastModified.getTime() && httpStatusCode == 200) {
                // Unchanged: record a NOT_MODIFIED response, refresh child URLs from the
                // indexed anchors, and push the expiry forward if one is configured.
                log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
                responseData.setExecutionTime(System.currentTimeMillis() - startTime);
                responseData.setParentUrl(urlQueue.getParentUrl());
                responseData.setSessionId(crawlerContext.getSessionId());
                responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
                processResponse(urlQueue, responseData);
                storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
                final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
                if (documentExpires != null && !indexingHelper.updateDocument(searchEngineClient, id, fessConfig.getIndexFieldExpires(), documentExpires)) {
                    logger.debug("Failed to update {} at {}", fessConfig.getIndexFieldExpires(), url);
                }
                return false;
            }
        } finally {
            // Always release the HEAD response resources.
            if (responseData != null) {
                CloseableUtil.closeQuietly(responseData);
            }
        }
    }
    // Incremental crawling disabled, or the checks above require a re-crawl.
    return true;
}
Also used : DocumentUtil(org.codelibs.fess.util.DocumentUtil) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) Date(java.util.Date) HashMap(java.util.HashMap) SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient) PermissionHelper(org.codelibs.fess.helper.PermissionHelper) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) HashSet(java.util.HashSet) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Map(java.util.Map) LinkedHashSet(java.util.LinkedHashSet) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) LogType(org.codelibs.fess.crawler.log.LogType) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) StringUtil(org.codelibs.core.lang.StringUtil) Set(java.util.Set) ContentNotFoundException(org.codelibs.fess.exception.ContentNotFoundException) DuplicateHostHelper(org.codelibs.fess.helper.DuplicateHostHelper) Collectors(java.util.stream.Collectors) CloseableUtil(org.codelibs.core.io.CloseableUtil) List(java.util.List) Logger(org.apache.logging.log4j.Logger) RequestData(org.codelibs.fess.crawler.entity.RequestData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) LogManager(org.apache.logging.log4j.LogManager) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) HashMap(java.util.HashMap) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient) ArrayList(java.util.ArrayList) 
FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Date(java.util.Date) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) PermissionHelper(org.codelibs.fess.helper.PermissionHelper)

Aggregations

UrlQueue (org.codelibs.fess.crawler.entity.UrlQueue)9 ArrayList (java.util.ArrayList)5 List (java.util.List)4 Map (java.util.Map)4 File (java.io.File)3 Date (java.util.Date)3 HashMap (java.util.HashMap)3 HashSet (java.util.HashSet)3 Set (java.util.Set)3 StringUtil (org.codelibs.core.lang.StringUtil)3 StreamUtil.stream (org.codelibs.core.stream.StreamUtil.stream)3 ResponseData (org.codelibs.fess.crawler.entity.ResponseData)3 UrlFilterImpl (org.codelibs.fess.crawler.filter.impl.UrlFilterImpl)3 CrawlerWebServer (org.codelibs.fess.crawler.util.CrawlerWebServer)3 Collectors (java.util.stream.Collectors)2 LogManager (org.apache.logging.log4j.LogManager)2 Logger (org.apache.logging.log4j.Logger)2 SerializeUtil (org.codelibs.core.io.SerializeUtil)2 Constants (org.codelibs.fess.Constants)2 ResultData (org.codelibs.fess.crawler.entity.ResultData)2