Example 6 with CrawlerClient

use of org.codelibs.fess.crawler.client.CrawlerClient in project fess-crawler by codelibs.

the class CrawlerThread method run.

/*
 * (non-Javadoc)
 *
 * @see java.lang.Runnable#run()
 */
@Override
public void run() {
    log(logHelper, LogType.START_THREAD, crawlerContext);
    int threadCheckCount = 0;
    // bind the crawler context and services to this thread
    CrawlingParameterUtil.setCrawlerContext(crawlerContext);
    CrawlingParameterUtil.setUrlQueueService(urlQueueService);
    CrawlingParameterUtil.setDataService(dataService);
    try {
        while (crawlerContext.getStatus() != CrawlerStatus.DONE && isContinue(threadCheckCount)) {
            final UrlQueue<?> urlQueue = urlQueueService.poll(crawlerContext.sessionId);
            if (isValid(urlQueue)) {
                ResponseData responseData = null;
                log(logHelper, LogType.START_CRAWLING, crawlerContext, urlQueue);
                try {
                    final CrawlerClient client = getClient(urlQueue.getUrl());
                    if (client == null) {
                        log(logHelper, LogType.UNSUPPORTED_URL_AT_CRAWLING_STARTED, crawlerContext, urlQueue);
                        continue;
                    }
                    startCrawling();
                    // set urlQueue to thread
                    CrawlingParameterUtil.setUrlQueue(urlQueue);
                    if (crawlerContext.intervalController != null) {
                        crawlerContext.intervalController.delay(IntervalController.PRE_PROCESSING);
                    }
                    final boolean contentUpdated = isContentUpdated(client, urlQueue);
                    if (contentUpdated) {
                        log(logHelper, LogType.GET_CONTENT, crawlerContext, urlQueue);
                        // access a URL
                        final long startTime = SystemUtil.currentTimeMillis();
                        responseData = client.execute(RequestDataBuilder.newRequestData().method(urlQueue.getMethod()).url(urlQueue.getUrl()).build());
                        responseData.setExecutionTime(SystemUtil.currentTimeMillis() - startTime);
                        responseData.setParentUrl(urlQueue.getParentUrl());
                        responseData.setSessionId(crawlerContext.sessionId);
                        if (responseData.getRedirectLocation() == null) {
                            log(logHelper, LogType.PROCESS_RESPONSE, crawlerContext, urlQueue, responseData);
                            processResponse(urlQueue, responseData);
                        } else {
                            log(logHelper, LogType.REDIRECT_LOCATION, crawlerContext, urlQueue, responseData);
                            // redirect
                            storeChildUrl(responseData.getRedirectLocation(), urlQueue.getUrl(), null, urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
                        }
                    }
                    log(logHelper, LogType.FINISHED_CRAWLING, crawlerContext, urlQueue);
                } catch (final ChildUrlsException e) {
                    try {
                        final Set<RequestData> childUrlSet = e.getChildUrlList();
                        log(logHelper, LogType.PROCESS_CHILD_URLS_BY_EXCEPTION, crawlerContext, urlQueue, childUrlSet);
                        // add child URLs
                        storeChildUrls(childUrlSet, urlQueue.getUrl(), urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
                    } catch (final Exception e1) {
                        log(logHelper, LogType.CRAWLING_EXCETPION, crawlerContext, urlQueue, e1);
                    }
                    if (noWaitOnFolder) {
                        continue;
                    }
                } catch (final CrawlingAccessException e) {
                    log(logHelper, LogType.CRAWLING_ACCESS_EXCEPTION, crawlerContext, urlQueue, e);
                } catch (final Throwable e) {
                    log(logHelper, LogType.CRAWLING_EXCETPION, crawlerContext, urlQueue, e);
                } finally {
                    addSitemapsFromRobotsTxt(urlQueue);
                    if (responseData != null) {
                        CloseableUtil.closeQuietly(responseData);
                    }
                    if (crawlerContext.intervalController != null) {
                        crawlerContext.intervalController.delay(IntervalController.POST_PROCESSING);
                    }
                    // reset the empty-queue counter after a successful poll
                    threadCheckCount = 0;
                    // remove urlQueue from thread
                    CrawlingParameterUtil.setUrlQueue(null);
                    finishCrawling();
                }
            } else {
                log(logHelper, LogType.NO_URL_IN_QUEUE, crawlerContext, urlQueue, Integer.valueOf(threadCheckCount));
                if (crawlerContext.intervalController != null) {
                    crawlerContext.intervalController.delay(IntervalController.NO_URL_IN_QUEUE);
                }
                threadCheckCount++;
            }
            // interval
            if (crawlerContext.intervalController != null) {
                crawlerContext.intervalController.delay(IntervalController.WAIT_NEW_URL);
            }
        }
    } catch (final Throwable t) {
        log(logHelper, LogType.SYSTEM_ERROR, t);
    } finally {
        // remove crawlerContext from thread
        CrawlingParameterUtil.setCrawlerContext(null);
        CrawlingParameterUtil.setUrlQueueService(null);
        CrawlingParameterUtil.setDataService(null);
    }
    log(logHelper, LogType.FINISHED_THREAD, crawlerContext);
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) HashSet(java.util.HashSet) Set(java.util.Set) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient)
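
The core of the loop is the exchange with CrawlerClient: build a RequestData from the queue entry, execute it, and stamp timing and session bookkeeping onto the ResponseData before processing. Here is that exchange in isolation, as a minimal sketch; the helper name fetchTimed is illustrative and not part of fess-crawler:

import org.codelibs.core.lang.SystemUtil;
import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.client.CrawlerClient;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.UrlQueue;

// Illustrative helper mirroring the fetch step of CrawlerThread#run above.
static ResponseData fetchTimed(final CrawlerClient client, final UrlQueue<?> urlQueue, final String sessionId) {
    final long startTime = SystemUtil.currentTimeMillis();
    // execute the request described by the queue entry (HTTP method + URL)
    final ResponseData responseData =
            client.execute(RequestDataBuilder.newRequestData().method(urlQueue.getMethod()).url(urlQueue.getUrl()).build());
    // record how long the fetch took and where the request came from
    responseData.setExecutionTime(SystemUtil.currentTimeMillis() - startTime);
    responseData.setParentUrl(urlQueue.getParentUrl());
    responseData.setSessionId(sessionId);
    return responseData;
}

As in the finally block of run(), the caller owns the response and must close it, e.g. with CloseableUtil.closeQuietly(responseData).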

Example 7 with CrawlerClient

use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.

the class ViewHelper method asContentResponse.

public StreamResponse asContentResponse(final Map<String, Object> doc) {
    if (logger.isDebugEnabled()) {
        logger.debug("writing the content of: {}", doc);
    }
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    final String configId = DocumentUtil.getValue(doc, fessConfig.getIndexFieldConfigId(), String.class);
    if (configId == null) {
        throw new FessSystemException("configId is null.");
    }
    if (configId.length() < 2) {
        throw new FessSystemException("Invalid configId: " + configId);
    }
    final CrawlingConfig config = crawlingConfigHelper.getCrawlingConfig(configId);
    if (config == null) {
        throw new FessSystemException("No crawlingConfig: " + configId);
    }
    final String url = DocumentUtil.getValue(doc, fessConfig.getIndexFieldUrl(), String.class);
    final CrawlerClientFactory crawlerClientFactory = config.initializeClientFactory(() -> ComponentUtil.getComponent(CrawlerClientFactory.class));
    final CrawlerClient client = crawlerClientFactory.getClient(url);
    if (client == null) {
        throw new FessSystemException("No CrawlerClient: " + configId + ", url: " + url);
    }
    return writeContent(configId, url, client);
}
Also used : CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) FessSystemException(org.codelibs.fess.exception.FessSystemException)
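
Only two document fields matter to asContentResponse: the config id (at least two characters, since the leading character selects the crawling-config type) and the URL. A hedged usage sketch follows; the literal field names "config_id" and "url", the id value, and the viewHelper variable are assumptions matching common Fess defaults, not values confirmed by the code above:

import java.util.HashMap;
import java.util.Map;
import org.lastaflute.web.response.StreamResponse;

// Hypothetical caller; keys assume FessConfig's default index field names.
final Map<String, Object> doc = new HashMap<>();
doc.put("config_id", "Wxxxxx");  // assumed key; the "W" prefix encodes the config type
doc.put("url", "https://www.example.com/docs/guide.pdf");  // assumed key
final StreamResponse response = viewHelper.asContentResponse(doc);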

Example 8 with CrawlerClient

use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.

the class DocumentHelper method processRequest.

public Map<String, Object> processRequest(final CrawlingConfig crawlingConfig, final String crawlingInfoId, final String url) {
    if (StringUtil.isBlank(crawlingInfoId)) {
        throw new CrawlingAccessException("sessionId is null.");
    }
    final CrawlerClientFactory crawlerClientFactory = crawlingConfig.initializeClientFactory(ComponentUtil::getCrawlerClientFactory);
    final CrawlerClient client = crawlerClientFactory.getClient(url);
    if (client == null) {
        throw new CrawlingAccessException("CrawlerClient is null for " + url);
    }
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            final Set<RequestData> childUrlList = new HashSet<>();
            childUrlList.add(RequestDataBuilder.newRequestData().get().url(responseData.getRedirectLocation()).build());
            throw new ChildUrlsException(childUrlList, this.getClass().getName() + "#RedirectedFrom:" + url);
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        responseData.setSessionId(crawlingInfoId);
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            throw new CrawlingAccessException("No url rule for " + url);
        }
        responseData.setRuleId(rule.getRuleId());
        final ResponseProcessor responseProcessor = rule.getResponseProcessor();
        if (!(responseProcessor instanceof DefaultResponseProcessor)) {
            throw new CrawlingAccessException("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor + ", url: " + url);
        }
        final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
        final ResultData resultData = transformer.transform(responseData);
        final byte[] data = resultData.getData();
        if (data != null) {
            try {
                return (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
            } catch (final Exception e) {
                throw new CrawlerSystemException("Could not create an instance from bytes.", e);
            }
        }
        return null;
    } catch (final Exception e) {
        throw new CrawlingAccessException("Failed to parse " + url, e);
    }
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ComponentNotFoundException(org.lastaflute.di.core.exception.ComponentNotFoundException) IOException(java.io.IOException) ResultData(org.codelibs.fess.crawler.entity.ResultData) RequestData(org.codelibs.fess.crawler.entity.RequestData) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) Rule(org.codelibs.fess.crawler.rule.Rule) Map(java.util.Map) HashSet(java.util.HashSet)
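
Redirect handling is the notable design choice here: rather than following the Location header internally, processRequest raises a ChildUrlsException carrying the redirect target as a RequestData, leaving the re-fetch decision to the caller. A hedged sketch of a caller that follows a single hop; documentHelper and the one-hop policy are assumptions:

import java.util.Map;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;

Map<String, Object> dataMap;
try {
    dataMap = documentHelper.processRequest(crawlingConfig, crawlingInfoId, url);
} catch (final ChildUrlsException e) {
    // the redirect target arrives as child request data; follow it once
    final RequestData redirect = e.getChildUrlList().iterator().next();
    dataMap = documentHelper.processRequest(crawlingConfig, crawlingInfoId, redirect.getUrl());
}

A production caller would bound the number of hops, since the second call can raise the same exception again.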

Example 9 with CrawlerClient

use of org.codelibs.fess.crawler.client.CrawlerClient in project fess by codelibs.

the class FessCrawlerThread method isContentUpdated.

@Override
protected boolean isContentUpdated(final CrawlerClient client, final UrlQueue<?> urlQueue) {
    if (ComponentUtil.getFessConfig().isIncrementalCrawling()) {
        final long startTime = System.currentTimeMillis();
        final FessConfig fessConfig = ComponentUtil.getFessConfig();
        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
        final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
        final IndexingHelper indexingHelper = ComponentUtil.getIndexingHelper();
        final SearchEngineClient searchEngineClient = ComponentUtil.getSearchEngineClient();
        final String url = urlQueue.getUrl();
        ResponseData responseData = null;
        try {
            final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(crawlerContext.getSessionId());
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.put(fessConfig.getIndexFieldUrl(), url);
            final List<String> roleTypeList = new ArrayList<>();
            stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
            if (url.startsWith("smb:") || url.startsWith("smb1:") || url.startsWith("file:") || url.startsWith("ftp:")) {
                if (url.endsWith("/")) {
                    // directory
                    return true;
                }
                final PermissionHelper permissionHelper = ComponentUtil.getPermissionHelper();
                if (fessConfig.isSmbRoleFromFile() || fessConfig.isFileRoleFromFile() || fessConfig.isFtpRoleFromFile()) {
                    // head method
                    responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                    if (responseData == null) {
                        return true;
                    }
                    roleTypeList.addAll(permissionHelper.getSmbRoleTypeList(responseData));
                    roleTypeList.addAll(permissionHelper.getFileRoleTypeList(responseData));
                    roleTypeList.addAll(permissionHelper.getFtpRoleTypeList(responseData));
                }
            }
            dataMap.put(fessConfig.getIndexFieldRole(), roleTypeList);
            final String id = crawlingInfoHelper.generateId(dataMap);
            if (logger.isDebugEnabled()) {
                logger.debug("Searching indexed document: {}", id);
            }
            final Map<String, Object> document = indexingHelper.getDocument(searchEngineClient, id, new String[] { fessConfig.getIndexFieldId(), fessConfig.getIndexFieldLastModified(), fessConfig.getIndexFieldAnchor(), fessConfig.getIndexFieldSegment(), fessConfig.getIndexFieldExpires(), fessConfig.getIndexFieldClickCount(), fessConfig.getIndexFieldFavoriteCount() });
            if (document == null) {
                storeChildUrlsToQueue(urlQueue, getChildUrlSet(searchEngineClient, id));
                return true;
            }
            final Date expires = DocumentUtil.getValue(document, fessConfig.getIndexFieldExpires(), Date.class);
            if (expires != null && expires.getTime() < System.currentTimeMillis()) {
                final Object idValue = document.get(fessConfig.getIndexFieldId());
                if (idValue != null && !indexingHelper.deleteDocument(searchEngineClient, idValue.toString())) {
                    logger.debug("Failed to delete expired document: {}", url);
                }
                return true;
            }
            final Date lastModified = DocumentUtil.getValue(document, fessConfig.getIndexFieldLastModified(), Date.class);
            if (lastModified == null) {
                return true;
            }
            urlQueue.setLastModified(lastModified.getTime());
            log(logHelper, LogType.CHECK_LAST_MODIFIED, crawlerContext, urlQueue);
            if (responseData == null) {
                // head method
                responseData = client.execute(RequestDataBuilder.newRequestData().head().url(url).build());
                if (responseData == null) {
                    return true;
                }
            }
            final int httpStatusCode = responseData.getHttpStatusCode();
            if (logger.isDebugEnabled()) {
                logger.debug("Accessing document: {}, status: {}", url, httpStatusCode);
            }
            if (httpStatusCode == 404) {
                storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
                if (!indexingHelper.deleteDocument(searchEngineClient, id)) {
                    logger.debug("Failed to delete 404 document: {}", url);
                }
                return false;
            }
            if (responseData.getLastModified() == null) {
                return true;
            }
            if (responseData.getLastModified().getTime() <= lastModified.getTime() && httpStatusCode == 200) {
                log(logHelper, LogType.NOT_MODIFIED, crawlerContext, urlQueue);
                responseData.setExecutionTime(System.currentTimeMillis() - startTime);
                responseData.setParentUrl(urlQueue.getParentUrl());
                responseData.setSessionId(crawlerContext.getSessionId());
                responseData.setHttpStatusCode(org.codelibs.fess.crawler.Constants.NOT_MODIFIED_STATUS);
                processResponse(urlQueue, responseData);
                storeChildUrlsToQueue(urlQueue, getAnchorSet(document.get(fessConfig.getIndexFieldAnchor())));
                final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
                if (documentExpires != null && !indexingHelper.updateDocument(searchEngineClient, id, fessConfig.getIndexFieldExpires(), documentExpires)) {
                    logger.debug("Failed to update {} at {}", fessConfig.getIndexFieldExpires(), url);
                }
                return false;
            }
        } finally {
            if (responseData != null) {
                CloseableUtil.closeQuietly(responseData);
            }
        }
    }
    return true;
}
Also used : DocumentUtil(org.codelibs.fess.util.DocumentUtil) CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) Date(java.util.Date) HashMap(java.util.HashMap) SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient) PermissionHelper(org.codelibs.fess.helper.PermissionHelper) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ArrayList(java.util.ArrayList) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) HashSet(java.util.HashSet) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Map(java.util.Map) LinkedHashSet(java.util.LinkedHashSet) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) LogType(org.codelibs.fess.crawler.log.LogType) ContainerNotAvailableException(org.codelibs.fess.exception.ContainerNotAvailableException) StringUtil(org.codelibs.core.lang.StringUtil) Set(java.util.Set) ContentNotFoundException(org.codelibs.fess.exception.ContentNotFoundException) DuplicateHostHelper(org.codelibs.fess.helper.DuplicateHostHelper) Collectors(java.util.stream.Collectors) CloseableUtil(org.codelibs.core.io.CloseableUtil) List(java.util.List) Logger(org.apache.logging.log4j.Logger) RequestData(org.codelibs.fess.crawler.entity.RequestData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper) LogManager(org.apache.logging.log4j.LogManager) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) ResponseData(org.codelibs.fess.crawler.entity.ResponseData)
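
All the branches above funnel into one freshness test: a document is treated as unmodified only when the HEAD request returns 200 with a Last-Modified no newer than the value stored in the index; other paths either return true to force a re-crawl or, on a 404, delete the indexed document. That test as a standalone predicate, a sketch for clarity rather than a Fess API:

import java.util.Date;
import org.codelibs.fess.crawler.entity.ResponseData;

// True only when the HEAD response proves the indexed copy is still current.
static boolean isFresh(final ResponseData responseData, final Date indexedLastModified) {
    final Date remoteLastModified = responseData.getLastModified();
    return remoteLastModified != null
            && responseData.getHttpStatusCode() == 200
            && remoteLastModified.getTime() <= indexedLastModified.getTime();
}

When the test holds, isContentUpdated rewrites the status to NOT_MODIFIED_STATUS, hands the response to processResponse, and returns false so the full GET is skipped.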

Aggregations

CrawlerClient (org.codelibs.fess.crawler.client.CrawlerClient) 9
ResponseData (org.codelibs.fess.crawler.entity.ResponseData) 6
FessConfig (org.codelibs.fess.mylasta.direction.FessConfig) 6
Map (java.util.Map) 5
CrawlerClientFactory (org.codelibs.fess.crawler.client.CrawlerClientFactory) 5
ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException) 5
RequestData (org.codelibs.fess.crawler.entity.RequestData) 4
ComponentUtil (org.codelibs.fess.util.ComponentUtil) 4
ArrayList (java.util.ArrayList) 3
HashSet (java.util.HashSet) 3
List (java.util.List) 3
Collectors (java.util.stream.Collectors) 3
StreamUtil.stream (org.codelibs.core.stream.StreamUtil.stream) 3
RequestDataBuilder (org.codelibs.fess.crawler.builder.RequestDataBuilder) 3
ResultData (org.codelibs.fess.crawler.entity.ResultData) 3
CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException) 3
CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException) 3
ResponseProcessor (org.codelibs.fess.crawler.processor.ResponseProcessor) 3
DefaultResponseProcessor (org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) 3
Rule (org.codelibs.fess.crawler.rule.Rule) 3