Search in sources:

Example 1 with DefaultResponseProcessor

Use of org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor in project fess by codelibs.

In class FileListIndexUpdateCallbackImpl, method processRequest.

/**
 * Fetches {@code url} with the given client, runs the response through the transformer of the
 * matching rule's {@link DefaultResponseProcessor}, merges the transformed fields into
 * {@code dataMap} and hands the result to {@code indexUpdateCallback.store}.
 *
 * <p>If no rule matches the response, or the rule's response processor is not a
 * {@link DefaultResponseProcessor}, a warning is logged and nothing is stored.
 *
 * @param paramMap crawl parameters; read for {@code Constants.CRAWLING_INFO_ID} (fallback session
 *        id) and the optional {@code ignore.field.names} entry, a comma-separated list of field
 *        names removed from {@code dataMap} before storing
 * @param dataMap document fields; modified in place (transformed data is added, ignored fields
 *        are removed)
 * @param url the URL to request
 * @param client the crawler client used to execute the GET request
 * @return the redirect location when the response carries one, otherwise {@code null}
 * @throws DataStoreCrawlingException if the request, the transformation or the store step fails
 */
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url, final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        // A redirect is reported to the caller instead of being processed here.
        if (responseData.getRedirectLocation() != null) {
            return responseData.getRedirectLocation();
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        // Prefer the session id carried by the document; fall back to the crawling info id.
        if (dataMap.containsKey(Constants.SESSION_ID)) {
            responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
        } else {
            responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
        }
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            // Parameterized logging: the message is only formatted when WARN is enabled.
            logger.warn("No url rule. Data: {}", dataMap);
        } else {
            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (responseProcessor instanceof DefaultResponseProcessor) {
                final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                final ResultData resultData = transformer.transform(responseData);
                final byte[] data = resultData.getData();
                if (data != null) {
                    try {
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                        dataMap.putAll(responseDataMap);
                    } catch (final Exception e) {
                        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                    }
                }
                // Remove fields that must not be indexed. A single null-checked lookup avoids the
                // NullPointerException the containsKey/get pair raised when the key mapped to null.
                final String ignoreFieldNames = paramMap.get("ignore.field.names");
                final String[] ignoreFields;
                if (ignoreFieldNames != null) {
                    ignoreFields = ignoreFieldNames.split(",");
                } else {
                    ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
                }
                stream(ignoreFields).of(stream -> stream.map(String::trim).forEach(dataMap::remove));
                indexUpdateCallback.store(paramMap, dataMap);
            } else {
                logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}", responseProcessor,
                        dataMap);
            }
        }
        return null;
    } catch (final ChildUrlsException e) {
        throw new DataStoreCrawlingException(url,
                "Redirected to " + e.getChildUrlList().stream().map(r -> r.getUrl()).collect(Collectors.joining(", ")), e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
Also used : Constants(org.codelibs.fess.Constants) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) LoggerFactory(org.slf4j.LoggerFactory) SerializeUtil(org.codelibs.core.io.SerializeUtil) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) IndexUpdateCallback(org.codelibs.fess.ds.IndexUpdateCallback) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) ArrayList(java.util.ArrayList) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Map(java.util.Map) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) ExecutorService(java.util.concurrent.ExecutorService) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) Logger(org.slf4j.Logger) ResultData(org.codelibs.fess.crawler.entity.ResultData) FessEsClient(org.codelibs.fess.es.client.FessEsClient) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) Rule(org.codelibs.fess.crawler.rule.Rule) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Collectors(java.util.stream.Collectors) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) ComponentUtil(org.codelibs.fess.util.ComponentUtil) SingletonLaContainer(org.lastaflute.di.core.SingletonLaContainer) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) 
DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) ResultData(org.codelibs.fess.crawler.entity.ResultData) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) Rule(org.codelibs.fess.crawler.rule.Rule) Map(java.util.Map)

Example 2 with DefaultResponseProcessor

Use of org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor in project fess by codelibs.

In class FileListIndexUpdateCallbackImpl, method processRequest.

/**
 * Fetches {@code url} with the given client, runs the response through the transformer of the
 * matching rule's {@link DefaultResponseProcessor}, merges the transformed fields into
 * {@code dataMap} and hands the result to {@code indexUpdateCallback.store}.
 *
 * <p>If no rule matches the response, or the rule's response processor is not a
 * {@link DefaultResponseProcessor}, a warning is logged and nothing is stored.
 *
 * @param paramMap crawl parameters; read for {@code Constants.CRAWLING_INFO_ID} (fallback session
 *        id) and the optional {@code ignore.field.names} entry, a comma-separated list of field
 *        names removed from {@code dataMap} before storing
 * @param dataMap document fields; modified in place (transformed data is added, ignored fields
 *        are removed)
 * @param url the URL to request
 * @param client the crawler client used to execute the GET request
 * @return the redirect location when the response carries one, otherwise {@code null}
 * @throws DataStoreCrawlingException if the request, the transformation or the store step fails
 */
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url, final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        // A redirect is reported to the caller instead of being processed here.
        if (responseData.getRedirectLocation() != null) {
            return responseData.getRedirectLocation();
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        // Prefer the session id carried by the document; fall back to the crawling info id.
        if (dataMap.containsKey(Constants.SESSION_ID)) {
            responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
        } else {
            responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
        }
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            logger.warn("No url rule. Data: {}", dataMap);
        } else {
            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (responseProcessor instanceof DefaultResponseProcessor) {
                final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                final ResultData resultData = transformer.transform(responseData);
                final byte[] data = resultData.getData();
                if (data != null) {
                    try {
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                        dataMap.putAll(responseDataMap);
                    } catch (final Exception e) {
                        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                    }
                }
                // Remove fields that must not be indexed. A single null-checked lookup avoids the
                // NullPointerException the containsKey/get pair raised when the key mapped to null.
                final String ignoreFieldNames = paramMap.get("ignore.field.names");
                final String[] ignoreFields;
                if (ignoreFieldNames != null) {
                    ignoreFields = ignoreFieldNames.split(",");
                } else {
                    ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
                }
                stream(ignoreFields).of(stream -> stream.map(String::trim).forEach(dataMap::remove));
                indexUpdateCallback.store(paramMap, dataMap);
            } else {
                logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}", responseProcessor,
                        dataMap);
            }
        }
        return null;
    } catch (final ChildUrlsException e) {
        throw new DataStoreCrawlingException(url,
                "Redirected to " + e.getChildUrlList().stream().map(RequestData::getUrl).collect(Collectors.joining(", ")), e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
Also used : Constants(org.codelibs.fess.Constants) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient) SerializeUtil(org.codelibs.core.io.SerializeUtil) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) Deque(java.util.Deque) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) ArrayList(java.util.ArrayList) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Map(java.util.Map) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) LinkedList(java.util.LinkedList) ExecutorService(java.util.concurrent.ExecutorService) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) QueryBuilders(org.opensearch.index.query.QueryBuilders) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) ResultData(org.codelibs.fess.crawler.entity.ResultData) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) Rule(org.codelibs.fess.crawler.rule.Rule) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Collectors(java.util.stream.Collectors) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Logger(org.apache.logging.log4j.Logger) RequestData(org.codelibs.fess.crawler.entity.RequestData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) SingletonLaContainer(org.lastaflute.di.core.SingletonLaContainer) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) LogManager(org.apache.logging.log4j.LogManager) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) 
ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) ResultData(org.codelibs.fess.crawler.entity.ResultData) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) RequestData(org.codelibs.fess.crawler.entity.RequestData) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) Rule(org.codelibs.fess.crawler.rule.Rule) Map(java.util.Map)

Example 3 with DefaultResponseProcessor

Use of org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor in project fess-crawler by codelibs.

In class CrawlerTest, method setUp.

/**
 * Builds a {@link StandardCrawlerContainer} for the test and registers the crawler components
 * (clients, rule manager, services, helpers, transformer, response processors and rules) under
 * their component names, then looks up the components the tests use directly.
 *
 * @throws Exception if the superclass set-up fails
 */
@Override
protected void setUp() throws Exception {
    super.setUp();
    // Parser feature handed to the file transformer below: the SAX "namespaces" feature is set to "false".
    final Map<String, String> featureMap = newHashMap();
    featureMap.put("http://xml.org/sax/features/namespaces", "false");
    // Left empty; passed to the transformer as its property map.
    final Map<String, String> propertyMap = newHashMap();
    // Element path -> attribute name pairs given to the transformer
    // (presumably the attribute holding the child URL for each element — TODO confirm).
    final Map<String, String> childUrlRuleMap = newHashMap();
    childUrlRuleMap.put("//A", "href");
    childUrlRuleMap.put("//AREA", "href");
    childUrlRuleMap.put("//FRAME", "src");
    childUrlRuleMap.put("//IFRAME", "src");
    childUrlRuleMap.put("//IMG", "src");
    childUrlRuleMap.put("//LINK", "href");
    childUrlRuleMap.put("//SCRIPT", "src");
    // The initializer lambdas below resolve other components by name through the same container.
    container = new StandardCrawlerContainer();
    container.<HcHttpClient>prototype("internalHttpClient", HcHttpClient.class, client -> {
        client.setCookieSpec(CookieSpecs.BEST_MATCH);
        client.setClientConnectionManager(container.getComponent("clientConnectionManager"));
    }).prototype("httpClient", FaultTolerantClient.class, client -> {
        // Wraps the internal HTTP client with retry settings (5 retries, interval 500).
        client.setCrawlerClient(container.getComponent("internalHttpClient"));
        client.setMaxRetryCount(5);
        client.setRetryInterval(500);
    }).prototype("fsClient", FileSystemClient.class).prototype("ruleManager", RuleManagerImpl.class, manager -> {
        manager.addRule(container.getComponent("sitemapsRule"));
        manager.addRule(container.getComponent("fileRule"));
    }).prototype("accessResult", AccessResultImpl.class).prototype("urlQueue", UrlQueueImpl.class).prototype("crawlerThread", CrawlerThread.class).prototype("crawler", Crawler.class).prototype("urlFilterService", UrlFilterServiceImpl.class).prototype("urlQueueService", UrlQueueServiceImpl.class).prototype("dataService", DataServiceImpl.class).prototype("urlFilter", UrlFilterImpl.class).singleton("urlConvertHelper", UrlConvertHelper.class).singleton("intervalController", DefaultIntervalController.class).singleton("sitemapsHelper", SitemapsHelper.class).singleton("logHelper", LogHelperImpl.class).singleton("encodingHelper", EncodingHelper.class).singleton("contentLengthHelper", ContentLengthHelper.class).singleton("mimeTypeHelper", MimeTypeHelperImpl.class).<FileTransformer>singleton("fileTransformer", FileTransformer.class, transformer -> {
        transformer.setName("fileTransformer");
        transformer.setFeatureMap(featureMap);
        transformer.setPropertyMap(propertyMap);
        transformer.setChildUrlRuleMap(childUrlRuleMap);
    }).singleton("dataHelper", MemoryDataHelper.class).singleton("robotsTxtHelper", RobotsTxtHelper.class).<CrawlerClientFactory>singleton("clientFactory", CrawlerClientFactory.class, factory -> {
        // URL patterns are mapped to the client registered for that scheme.
        factory.addClient("http:.*", container.getComponent("httpClient"));
        factory.addClient("file:.*", container.getComponent("fsClient"));
    }).singleton("tikaExtractor", TikaExtractor.class).<ExtractorFactory>singleton("extractorFactory", ExtractorFactory.class, factory -> {
        // The same Tika extractor instance is used for both plain text and HTML.
        TikaExtractor tikaExtractor = container.getComponent("tikaExtractor");
        factory.addExtractor("text/plain", tikaExtractor);
        factory.addExtractor("text/html", tikaExtractor);
    }).singleton("httpClient", // NOTE(review): "httpClient" is already registered above as a FaultTolerantClient prototype; which registration takes effect depends on StandardCrawlerContainer — confirm this is intended
    HcHttpClient.class).singleton("sitemapsResponseProcessor", // 
    SitemapsResponseProcessor.class).<SitemapsRule>singleton("sitemapsRule", SitemapsRule.class, rule -> {
        // Rule for URLs matching ".*sitemap.*", handled by the sitemaps response processor.
        rule.setResponseProcessor(container.getComponent("sitemapsResponseProcessor"));
        rule.setRuleId("sitemapsRule");
        rule.addRule("url", ".*sitemap.*");
    }).<// 
    DefaultResponseProcessor>singleton("defaultResponseProcessor", DefaultResponseProcessor.class, processor -> {
        // Uses the file transformer; HTTP 200 is registered as successful and 304 as not modified.
        processor.setTransformer(container.getComponent("fileTransformer"));
        processor.setSuccessfulHttpCodes(new int[] { 200 });
        processor.setNotModifiedHttpCodes(new int[] { 304 });
    }).<// 
    RegexRule>singleton("fileRule", RegexRule.class, rule -> {
        // Marked as the default rule and bound to the default response processor.
        rule.setRuleId("fileRule");
        rule.setDefaultRule(true);
        rule.setResponseProcessor(container.getComponent("defaultResponseProcessor"));
    }).<// 
    PoolingHttpClientConnectionManager>singleton("clientConnectionManager", new PoolingHttpClientConnectionManager(5, TimeUnit.MINUTES), manager -> {
        manager.setMaxTotal(200);
        manager.setDefaultMaxPerRoute(20);
    });
    // Components the test methods access through fields.
    crawler = container.getComponent("crawler");
    dataService = container.getComponent("dataService");
    urlQueueService = container.getComponent("urlQueueService");
    fileTransformer = container.getComponent("fileTransformer");
}
Also used : StandardCrawlerContainer(org.codelibs.fess.crawler.container.StandardCrawlerContainer) MimeTypeHelperImpl(org.codelibs.fess.crawler.helper.impl.MimeTypeHelperImpl) HcHttpClient(org.codelibs.fess.crawler.client.http.HcHttpClient) UrlQueueImpl(org.codelibs.fess.crawler.entity.UrlQueueImpl) PlainTestCase(org.dbflute.utflute.core.PlainTestCase) ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) FileSystemClient(org.codelibs.fess.crawler.client.fs.FileSystemClient) DataService(org.codelibs.fess.crawler.service.DataService) CookieSpecs(org.apache.http.client.config.CookieSpecs) SitemapsHelper(org.codelibs.fess.crawler.helper.SitemapsHelper) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) RuleManagerImpl(org.codelibs.fess.crawler.rule.impl.RuleManagerImpl) DataServiceImpl(org.codelibs.fess.crawler.service.impl.DataServiceImpl) RegexRule(org.codelibs.fess.crawler.rule.impl.RegexRule) FaultTolerantClient(org.codelibs.fess.crawler.client.FaultTolerantClient) LogHelperImpl(org.codelibs.fess.crawler.helper.impl.LogHelperImpl) SitemapsRule(org.codelibs.fess.crawler.rule.impl.SitemapsRule) Map(java.util.Map) UrlFilterServiceImpl(org.codelibs.fess.crawler.service.impl.UrlFilterServiceImpl) PoolingHttpClientConnectionManager(org.apache.http.impl.conn.PoolingHttpClientConnectionManager) CrawlerWebServer(org.codelibs.fess.crawler.util.CrawlerWebServer) ContentLengthHelper(org.codelibs.fess.crawler.helper.ContentLengthHelper) EncodingHelper(org.codelibs.fess.crawler.helper.EncodingHelper) RobotsTxtHelper(org.codelibs.fess.crawler.helper.RobotsTxtHelper) AccessResultImpl(org.codelibs.fess.crawler.entity.AccessResultImpl) SitemapsResponseProcessor(org.codelibs.fess.crawler.processor.impl.SitemapsResponseProcessor) UrlFilterImpl(org.codelibs.fess.crawler.filter.impl.UrlFilterImpl) MemoryDataHelper(org.codelibs.fess.crawler.helper.MemoryDataHelper) 
DefaultIntervalController(org.codelibs.fess.crawler.interval.impl.DefaultIntervalController) ResourceUtil(org.codelibs.core.io.ResourceUtil) TikaExtractor(org.codelibs.fess.crawler.extractor.impl.TikaExtractor) File(java.io.File) FileTransformer(org.codelibs.fess.crawler.transformer.impl.FileTransformer) TimeUnit(java.util.concurrent.TimeUnit) UrlQueueService(org.codelibs.fess.crawler.service.UrlQueueService) UrlConvertHelper(org.codelibs.fess.crawler.helper.UrlConvertHelper) UrlQueueServiceImpl(org.codelibs.fess.crawler.service.impl.UrlQueueServiceImpl) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) LogHelperImpl(org.codelibs.fess.crawler.helper.impl.LogHelperImpl) ContentLengthHelper(org.codelibs.fess.crawler.helper.ContentLengthHelper) HcHttpClient(org.codelibs.fess.crawler.client.http.HcHttpClient) DefaultIntervalController(org.codelibs.fess.crawler.interval.impl.DefaultIntervalController) RobotsTxtHelper(org.codelibs.fess.crawler.helper.RobotsTxtHelper) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) StandardCrawlerContainer(org.codelibs.fess.crawler.container.StandardCrawlerContainer) UrlQueueImpl(org.codelibs.fess.crawler.entity.UrlQueueImpl) RuleManagerImpl(org.codelibs.fess.crawler.rule.impl.RuleManagerImpl) PoolingHttpClientConnectionManager(org.apache.http.impl.conn.PoolingHttpClientConnectionManager) FaultTolerantClient(org.codelibs.fess.crawler.client.FaultTolerantClient) FileTransformer(org.codelibs.fess.crawler.transformer.impl.FileTransformer) UrlFilterImpl(org.codelibs.fess.crawler.filter.impl.UrlFilterImpl) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) SitemapsRule(org.codelibs.fess.crawler.rule.impl.SitemapsRule) TikaExtractor(org.codelibs.fess.crawler.extractor.impl.TikaExtractor) 
RegexRule(org.codelibs.fess.crawler.rule.impl.RegexRule) UrlQueueServiceImpl(org.codelibs.fess.crawler.service.impl.UrlQueueServiceImpl)

Example 4 with DefaultResponseProcessor

Use of org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor in project fess by codelibs.

In class DocumentHelper, method processRequest.

/**
 * Requests {@code url} with the crawler client resolved from {@code crawlingConfig}, applies the
 * transformer of the matching rule's {@link DefaultResponseProcessor} and returns the
 * deserialized result data.
 *
 * @param crawlingConfig configuration used to initialize the client factory
 * @param crawlingInfoId id set as the session id on the response; must not be blank
 * @param url the URL to request
 * @return the map deserialized from the transformed data, or {@code null} when the transformer
 *         produced no data
 * @throws CrawlingAccessException if {@code crawlingInfoId} is blank, no client exists for
 *         {@code url}, or any step of fetching and transforming fails (a redirect, a missing
 *         rule and an unexpected response processor are all reported this way)
 */
public Map<String, Object> processRequest(final CrawlingConfig crawlingConfig, final String crawlingInfoId, final String url) {
    if (StringUtil.isBlank(crawlingInfoId)) {
        throw new CrawlingAccessException("sessionId is null.");
    }

    final CrawlerClientFactory clientFactory = crawlingConfig.initializeClientFactory(ComponentUtil::getCrawlerClientFactory);
    final CrawlerClient crawlerClient = clientFactory.getClient(url);
    if (crawlerClient == null) {
        throw new CrawlingAccessException("CrawlerClient is null for " + url);
    }

    final long requestStartedAt = System.currentTimeMillis();
    try (final ResponseData response = crawlerClient.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        // A redirect is signalled through ChildUrlsException carrying the redirect target.
        if (response.getRedirectLocation() != null) {
            final Set<RequestData> redirectTargets = new HashSet<>();
            redirectTargets.add(RequestDataBuilder.newRequestData().get().url(response.getRedirectLocation()).build());
            throw new ChildUrlsException(redirectTargets, this.getClass().getName() + "#RedirectedFrom:" + url);
        }

        response.setExecutionTime(System.currentTimeMillis() - requestStartedAt);
        response.setSessionId(crawlingInfoId);

        final RuleManager rules = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule matchedRule = rules.getRule(response);
        if (matchedRule == null) {
            throw new CrawlingAccessException("No url rule for " + url);
        }
        response.setRuleId(matchedRule.getRuleId());

        final ResponseProcessor processor = matchedRule.getResponseProcessor();
        if (processor instanceof DefaultResponseProcessor) {
            final Transformer responseTransformer = ((DefaultResponseProcessor) processor).getTransformer();
            final byte[] serialized = responseTransformer.transform(response).getData();
            if (serialized == null) {
                return null;
            }
            try {
                return (Map<String, Object>) SerializeUtil.fromBinaryToObject(serialized);
            } catch (final Exception e) {
                throw new CrawlerSystemException("Could not create an instance from bytes.", e);
            }
        }
        throw new CrawlingAccessException(
                "The response processor is not DefaultResponseProcessor. responseProcessor: " + processor + ", url: " + url);
    } catch (final Exception e) {
        // Every failure inside the try block, including the exceptions thrown above, is wrapped here.
        throw new CrawlingAccessException("Failed to parse " + url, e);
    }
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ComponentNotFoundException(org.lastaflute.di.core.exception.ComponentNotFoundException) IOException(java.io.IOException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) ResultData(org.codelibs.fess.crawler.entity.ResultData) RequestData(org.codelibs.fess.crawler.entity.RequestData) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) Rule(org.codelibs.fess.crawler.rule.Rule) Map(java.util.Map) HashSet(java.util.HashSet)

Aggregations

Map (java.util.Map)4 CrawlerClientFactory (org.codelibs.fess.crawler.client.CrawlerClientFactory)4 DefaultResponseProcessor (org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor)4 TimeUnit (java.util.concurrent.TimeUnit)3 CrawlerClient (org.codelibs.fess.crawler.client.CrawlerClient)3 ResponseData (org.codelibs.fess.crawler.entity.ResponseData)3 ResultData (org.codelibs.fess.crawler.entity.ResultData)3 ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException)3 CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException)3 ResponseProcessor (org.codelibs.fess.crawler.processor.ResponseProcessor)3 Rule (org.codelibs.fess.crawler.rule.Rule)3 RuleManager (org.codelibs.fess.crawler.rule.RuleManager)3 ArrayList (java.util.ArrayList)2 List (java.util.List)2 ExecutorService (java.util.concurrent.ExecutorService)2 LinkedBlockingQueue (java.util.concurrent.LinkedBlockingQueue)2 ThreadPoolExecutor (java.util.concurrent.ThreadPoolExecutor)2 Collectors (java.util.stream.Collectors)2 SerializeUtil (org.codelibs.core.io.SerializeUtil)2 StreamUtil.stream (org.codelibs.core.stream.StreamUtil.stream)2