Search in sources:

Example 1 with CrawlerClient

use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.

the class FileListIndexUpdateCallbackImpl method processRequest.

/**
 * Fetches {@code url} with the given client and, unless the response is a redirect,
 * transforms it, merges the transformed fields into {@code dataMap} and hands the
 * result to {@code indexUpdateCallback.store}.
 *
 * @param paramMap crawl parameters; read for {@code Constants.CRAWLING_INFO_ID} and the
 *                 optional comma-separated {@code "ignore.field.names"} entry
 * @param dataMap  document fields; mutated in place (transformed fields are added and
 *                 the ignored fields are removed before storing)
 * @param url      URL to request
 * @param client   crawler client used to execute the GET request
 * @return the redirect location when the response carries one, otherwise {@code null}
 * @throws DataStoreCrawlingException wrapping any exception raised while fetching,
 *                                    transforming or storing the document
 */
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url, final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            // Let the caller decide whether to follow the redirect.
            return responseData.getRedirectLocation();
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        // A session id supplied with the document wins over the crawl-wide one.
        if (dataMap.containsKey(Constants.SESSION_ID)) {
            responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
        } else {
            responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
        }
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            // Placeholder logging: the map is only rendered when WARN is enabled.
            logger.warn("No url rule. Data: {}", dataMap);
        } else {
            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (responseProcessor instanceof DefaultResponseProcessor) {
                final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                final ResultData resultData = transformer.transform(responseData);
                final byte[] data = resultData.getData();
                if (data != null) {
                    try {
                        @SuppressWarnings("unchecked") final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                        dataMap.putAll(responseDataMap);
                    } catch (final Exception e) {
                        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                    }
                }
                // Drop the fields that must not be stored with the document.
                String[] ignoreFields;
                if (paramMap.containsKey("ignore.field.names")) {
                    ignoreFields = paramMap.get("ignore.field.names").split(",");
                } else {
                    ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
                }
                stream(ignoreFields).of(stream -> stream.map(String::trim).forEach(s -> dataMap.remove(s)));
                indexUpdateCallback.store(paramMap, dataMap);
            } else {
                logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}", responseProcessor, dataMap);
            }
        }
        return null;
    } catch (final ChildUrlsException e) {
        throw new DataStoreCrawlingException(url, "Redirected to " + e.getChildUrlList().stream().map(r -> r.getUrl()).collect(Collectors.joining(", ")), e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
Also used : Constants(org.codelibs.fess.Constants) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) LoggerFactory(org.slf4j.LoggerFactory) SerializeUtil(org.codelibs.core.io.SerializeUtil) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) IndexUpdateCallback(org.codelibs.fess.ds.IndexUpdateCallback) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) ArrayList(java.util.ArrayList) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Map(java.util.Map) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) ExecutorService(java.util.concurrent.ExecutorService) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) Logger(org.slf4j.Logger) ResultData(org.codelibs.fess.crawler.entity.ResultData) FessEsClient(org.codelibs.fess.es.client.FessEsClient) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) Rule(org.codelibs.fess.crawler.rule.Rule) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Collectors(java.util.stream.Collectors) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) ComponentUtil(org.codelibs.fess.util.ComponentUtil) SingletonLaContainer(org.lastaflute.di.core.SingletonLaContainer) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) 
DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) ResultData(org.codelibs.fess.crawler.entity.ResultData) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) Rule(org.codelibs.fess.crawler.rule.Rule) Map(java.util.Map)

Example 2 with CrawlerClient

use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.

the class FileListIndexUpdateCallbackImpl method addDocument.

/**
 * Requests the document URL found in {@code dataMap} and follows redirects reported by
 * {@code processRequest}, for at most {@code maxRedirectCount} requests.
 * The whole operation runs while holding the monitor of {@code indexUpdateCallback}.
 *
 * @param paramMap crawl parameters, passed through to {@code processRequest}
 * @param dataMap  document fields; must contain a non-null value under the configured URL
 *                 field, which is overwritten with each redirect target
 */
protected void addDocument(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    synchronized (indexUpdateCallback) {
        // required check: a missing key and a null value are both rejected by one lookup
        final Object urlValue = dataMap.get(fessConfig.getIndexFieldUrl());
        if (urlValue == null) {
            logger.warn("Could not add a doc. Invalid data: {}", dataMap);
            return;
        }
        final String url = urlValue.toString();
        final CrawlerClient client = crawlerClientFactory.getClient(url);
        if (client == null) {
            logger.warn("CrawlerClient is null. Data: {}", dataMap);
            return;
        }
        String processingUrl = url;
        for (int i = 0; i < maxRedirectCount; i++) {
            processingUrl = processRequest(paramMap, dataMap, processingUrl, client);
            if (processingUrl == null) {
                // No redirect was reported, so there is nothing left to follow.
                break;
            }
            dataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
        }
    }
}
Also used : CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig)

Example 3 with CrawlerClient

use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.

the class BaseThumbnailGenerator method process.

/**
 * Delegates to the two-argument {@code process} overload with a callback that fetches the
 * given URL, follows redirect locations for at most {@code maxRedirectCount} requests and
 * passes the first non-redirect response to {@code consumer}.
 *
 * @param id       identifier forwarded to the overloaded {@code process}
 * @param consumer predicate applied to the fetched response; its result is the callback's result
 * @return whatever the overloaded {@code process} returns
 * @throws ThumbnailGenerationException raised by the callback when the crawling config or
 *         client is missing, the request fails, the response URL is blank, or the redirect
 *         limit is exhausted
 */
protected boolean process(final String id, final Predicate<ResponseData> consumer) {
    return process(id, (configId, url) -> {
        final CrawlingConfig crawlingConfig = ComponentUtil.getCrawlingConfigHelper().getCrawlingConfig(configId);
        if (crawlingConfig == null) {
            throw new ThumbnailGenerationException("No CrawlingConfig: " + configId);
        }
        if (logger.isInfoEnabled()) {
            logger.info("Generating Thumbnail: {}", url);
        }
        final CrawlerClientFactory clientFactory =
                crawlingConfig.initializeClientFactory(() -> ComponentUtil.getComponent(CrawlerClientFactory.class));
        final CrawlerClient crawlerClient = clientFactory.getClient(url);
        if (crawlerClient == null) {
            throw new ThumbnailGenerationException("No CrawlerClient: " + configId + ", url: " + url);
        }

        String target = url;
        int requestCount = 0;
        while (requestCount < maxRedirectCount) {
            requestCount++;
            try (final ResponseData response = crawlerClient.execute(RequestDataBuilder.newRequestData().get().url(target).build())) {
                final String redirectLocation = response.getRedirectLocation();
                if (StringUtil.isNotBlank(redirectLocation)) {
                    // Redirected: try again with the new location.
                    target = redirectLocation;
                } else {
                    if (StringUtil.isBlank(response.getUrl())) {
                        throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url + " (Response URL is empty)");
                    }
                    return consumer.test(response);
                }
            } catch (final CrawlingAccessException e) {
                // The cause is attached only when debug logging is on; otherwise just its message is kept.
                throw logger.isDebugEnabled()
                        ? new ThumbnailGenerationException("Failed to process a thumbnail content: " + url, e)
                        : new ThumbnailGenerationException(e.getMessage());
            } catch (final Exception e) {
                throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url, e);
            }
        }
        throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url + " (Redirect Loop)");
    });
}
Also used : CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ThumbnailGenerationException(org.codelibs.fess.exception.ThumbnailGenerationException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) ThumbnailGenerationException(org.codelibs.fess.exception.ThumbnailGenerationException)

Example 4 with CrawlerClient

use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.

the class FileListIndexUpdateCallbackImpl method addDocument.

/**
 * Requests the document URL found in {@code dataMap}, then keeps processing URLs queued from
 * {@link ChildUrlsException}s until the queue is empty or the access limit is reached.
 * Each queued URL is processed against its own copy of {@code dataMap}.
 * The whole operation runs while holding the monitor of {@code indexUpdateCallback}.
 *
 * @param paramMap crawl parameters, passed through to {@code getMaxAccessCount} and
 *                 {@code processRequest}
 * @param dataMap  document fields; must contain a non-null value under the configured URL field
 * @throws DataStoreCrawlingException when a request fails for a reason other than child URLs
 *                                    and the access limit is not exactly 1
 */
protected void addDocument(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    synchronized (indexUpdateCallback) {
        // required check
        if (!dataMap.containsKey(fessConfig.getIndexFieldUrl()) || dataMap.get(fessConfig.getIndexFieldUrl()) == null) {
            logger.warn("Could not add a doc. Invalid data: {}", dataMap);
            return;
        }
        final String url = dataMap.get(fessConfig.getIndexFieldUrl()).toString();
        final CrawlerClient client = crawlerClientFactory.getClient(url);
        if (client == null) {
            logger.warn("CrawlerClient is null. Data: {}", dataMap);
            return;
        }
        // A negative limit means "no limit" (see the loop condition below).
        final long maxAccessCount = getMaxAccessCount(paramMap, dataMap);
        long counter = 0;
        final Deque<String> urlQueue = new LinkedList<>();
        urlQueue.offer(url);
        while (!urlQueue.isEmpty() && (maxAccessCount < 0 || counter < maxAccessCount)) {
            // Copy constructor instead of Collectors.toMap: toMap throws NullPointerException
            // as soon as one entry has a null value, which the copy constructor tolerates.
            final Map<String, Object> localDataMap = new java.util.HashMap<>(dataMap);
            String processingUrl = urlQueue.poll();
            if (deleteUrlList.contains(processingUrl)) {
                // delete before indexing
                deleteDocuments();
            }
            try {
                for (int i = 0; i < maxRedirectCount; i++) {
                    processingUrl = processRequest(paramMap, localDataMap, processingUrl, client);
                    if (processingUrl == null) {
                        break;
                    }
                    // Only redirects reach this point, so only they advance the counter.
                    counter++;
                    localDataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
                }
            } catch (final ChildUrlsException e) {
                e.getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
            } catch (final DataStoreCrawlingException e) {
                // Child URLs may arrive wrapped as the cause of a DataStoreCrawlingException.
                final Throwable cause = e.getCause();
                if (cause instanceof ChildUrlsException) {
                    ((ChildUrlsException) cause).getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
                } else if (maxAccessCount != 1L) {
                    throw e;
                } else {
                    logger.warn("Failed to access {}.", processingUrl, e);
                }
            }
        }
    }
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) LinkedList(java.util.LinkedList) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) RequestData(org.codelibs.fess.crawler.entity.RequestData) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) Map(java.util.Map)

Example 5 with CrawlerClient

use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.

the class FileListIndexUpdateCallbackImpl method processRequest.

/**
 * Issues a GET request for {@code url}. A redirect location on the response is returned to
 * the caller; otherwise the response is transformed, its fields are merged into
 * {@code dataMap}, ignored fields are stripped, and the map is stored through
 * {@code indexUpdateCallback}.
 *
 * @param paramMap crawl parameters; read for {@code Constants.CRAWLING_INFO_ID} and the
 *                 optional comma-separated {@code "ignore.field.names"} entry
 * @param dataMap  document fields; mutated in place
 * @param url      URL to request
 * @param client   crawler client used to execute the request
 * @return the redirect location when present, otherwise {@code null}
 * @throws DataStoreCrawlingException wrapping any exception raised along the way
 */
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url, final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData response = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        final String redirectLocation = response.getRedirectLocation();
        if (redirectLocation != null) {
            return redirectLocation;
        }

        response.setExecutionTime(System.currentTimeMillis() - startTime);
        // Prefer the session id carried by the document over the crawl-wide one.
        final String sessionId = dataMap.containsKey(Constants.SESSION_ID) //
                ? (String) dataMap.get(Constants.SESSION_ID) //
                : paramMap.get(Constants.CRAWLING_INFO_ID);
        response.setSessionId(sessionId);

        final Rule matchedRule = SingletonLaContainer.getComponent(RuleManager.class).getRule(response);
        if (matchedRule == null) {
            logger.warn("No url rule. Data: {}", dataMap);
            return null;
        }

        response.setRuleId(matchedRule.getRuleId());
        final ResponseProcessor processor = matchedRule.getResponseProcessor();
        if (!(processor instanceof DefaultResponseProcessor)) {
            logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}", processor, dataMap);
            return null;
        }

        final Transformer transformer = ((DefaultResponseProcessor) processor).getTransformer();
        final byte[] serialized = transformer.transform(response).getData();
        if (serialized != null) {
            try {
                @SuppressWarnings("unchecked")
                final Map<String, Object> transformedFields = (Map<String, Object>) SerializeUtil.fromBinaryToObject(serialized);
                dataMap.putAll(transformedFields);
            } catch (final Exception e) {
                throw new CrawlerSystemException("Could not create an instance from bytes.", e);
            }
        }

        // Strip the fields that are not meant to be stored.
        final String[] ignoredNames = paramMap.containsKey("ignore.field.names") //
                ? paramMap.get("ignore.field.names").split(",") //
                : new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
        stream(ignoredNames).of(names -> names.map(String::trim).forEach(name -> dataMap.remove(name)));

        indexUpdateCallback.store(paramMap, dataMap);
        return null;
    } catch (final ChildUrlsException e) {
        final String childUrls = e.getChildUrlList().stream().map(RequestData::getUrl).collect(Collectors.joining(", "));
        throw new DataStoreCrawlingException(url, "Redirected to " + childUrls, e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
Also used : Constants(org.codelibs.fess.Constants) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient) SerializeUtil(org.codelibs.core.io.SerializeUtil) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) Deque(java.util.Deque) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) ArrayList(java.util.ArrayList) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Map(java.util.Map) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) LinkedList(java.util.LinkedList) ExecutorService(java.util.concurrent.ExecutorService) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) QueryBuilders(org.opensearch.index.query.QueryBuilders) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) ResultData(org.codelibs.fess.crawler.entity.ResultData) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) Rule(org.codelibs.fess.crawler.rule.Rule) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Collectors(java.util.stream.Collectors) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Logger(org.apache.logging.log4j.Logger) RequestData(org.codelibs.fess.crawler.entity.RequestData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) SingletonLaContainer(org.lastaflute.di.core.SingletonLaContainer) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) LogManager(org.apache.logging.log4j.LogManager) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) 
ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) ResultData(org.codelibs.fess.crawler.entity.ResultData) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) RequestData(org.codelibs.fess.crawler.entity.RequestData) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) Rule(org.codelibs.fess.crawler.rule.Rule) Map(java.util.Map)

Aggregations

CrawlerClient (org.codelibs.fess.crawler.client.CrawlerClient): 9, ResponseData (org.codelibs.fess.crawler.entity.ResponseData): 6, FessConfig (org.codelibs.fess.mylasta.direction.FessConfig): 6, Map (java.util.Map): 5, CrawlerClientFactory (org.codelibs.fess.crawler.client.CrawlerClientFactory): 5, ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException): 5, RequestData (org.codelibs.fess.crawler.entity.RequestData): 4, ComponentUtil (org.codelibs.fess.util.ComponentUtil): 4, ArrayList (java.util.ArrayList): 3, HashSet (java.util.HashSet): 3, List (java.util.List): 3, Collectors (java.util.stream.Collectors): 3, StreamUtil.stream (org.codelibs.core.stream.StreamUtil.stream): 3, RequestDataBuilder (org.codelibs.fess.crawler.builder.RequestDataBuilder): 3, ResultData (org.codelibs.fess.crawler.entity.ResultData): 3, CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException): 3, CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException): 3, ResponseProcessor (org.codelibs.fess.crawler.processor.ResponseProcessor): 3, DefaultResponseProcessor (org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor): 3, Rule (org.codelibs.fess.crawler.rule.Rule): 3