Example 1 with CrawlerClientFactory

Use of org.codelibs.fess.crawler.client.CrawlerClientFactory in project fess by codelibs.

From class CsvListDataStoreImpl, method storeData:

@Override
protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
    int nThreads = 1;
    if (paramMap.containsKey(Constants.NUM_OF_THREADS)) {
        try {
            nThreads = Integer.parseInt(paramMap.get(Constants.NUM_OF_THREADS));
        } catch (final NumberFormatException e) {
            logger.warn(Constants.NUM_OF_THREADS + " is not an int value.", e);
        }
    }
    // obtain the shared client factory and let the data config contribute its client settings
    final CrawlerClientFactory crawlerClientFactory = ComponentUtil.getCrawlerClientFactory();
    dataConfig.initializeClientFactory(crawlerClientFactory);
    try {
        final FileListIndexUpdateCallbackImpl fileListIndexUpdateCallback = new FileListIndexUpdateCallbackImpl(callback, crawlerClientFactory, nThreads);
        super.storeData(dataConfig, fileListIndexUpdateCallback, paramMap, scriptMap, defaultDataMap);
        fileListIndexUpdateCallback.commit();
    } catch (final Exception e) {
        throw new DataStoreException(e);
    }
}
Also used : DataStoreException(org.codelibs.fess.exception.DataStoreException) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory)
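Examples 1 and 2 share the same guard for the num_of_threads parameter: default to a single thread, and warn rather than abort when the value does not parse. As a minimal sketch, that guard can be factored into a helper; parseIntParam is a hypothetical name, not part of fess.

// Hypothetical helper (not in fess): read an optional integer parameter,
// falling back to a default when the value is missing or unparsable.
// This mirrors the warn-and-continue guard in storeData above.
private static int parseIntParam(final Map<String, String> paramMap, final String key, final int defaultValue) {
    final String value = paramMap.get(key);
    if (value == null) {
        return defaultValue;
    }
    try {
        return Integer.parseInt(value);
    } catch (final NumberFormatException e) {
        return defaultValue;
    }
}

With such a helper the guard collapses to one line, e.g. final int nThreads = parseIntParam(paramMap, Constants.NUM_OF_THREADS, 1); (the warning log is omitted in this sketch). The same guard reappears verbatim in Example 2 below.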

Example 2 with CrawlerClientFactory

Use of org.codelibs.fess.crawler.client.CrawlerClientFactory in project fess by codelibs.

From class EsListDataStoreImpl, method storeData:

@Override
protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
    int nThreads = 1;
    if (paramMap.containsKey(Constants.NUM_OF_THREADS)) {
        try {
            nThreads = Integer.parseInt(paramMap.get(Constants.NUM_OF_THREADS));
        } catch (final NumberFormatException e) {
            logger.warn(Constants.NUM_OF_THREADS + " is not an int value.", e);
        }
    }
    final CrawlerClientFactory crawlerClientFactory = ComponentUtil.getCrawlerClientFactory();
    dataConfig.initializeClientFactory(crawlerClientFactory);
    try {
        final FileListIndexUpdateCallbackImpl fileListIndexUpdateCallback = new FileListIndexUpdateCallbackImpl(callback, crawlerClientFactory, nThreads);
        super.storeData(dataConfig, fileListIndexUpdateCallback, paramMap, scriptMap, defaultDataMap);
        fileListIndexUpdateCallback.commit();
    } catch (final Exception e) {
        throw new DataStoreException(e);
    }
}
Also used : DataStoreException(org.codelibs.fess.exception.DataStoreException) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory)

Example 3 with CrawlerClientFactory

Use of org.codelibs.fess.crawler.client.CrawlerClientFactory in project fess by codelibs.

From class GitBucketDataStoreImpl, method storeData:

@Override
protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
    final String rootURL = getRootURL(paramMap);
    final String authToken = getAuthToken(paramMap);
    final long readInterval = getReadInterval(paramMap);
    // Reject an empty URL or token
    if (rootURL.isEmpty() || authToken.isEmpty()) {
        logger.warn("parameters \"" + TOKEN_PARAM + "\" and \"" + GITBUCKET_URL_PARAM + "\" are required");
        return;
    }
    // Get List of Repositories
    final List<Map<String, Object>> repositoryList = getRepositoryList(rootURL, authToken);
    if (repositoryList.isEmpty()) {
        logger.warn("Token is invalid or no Repository");
        return;
    }
    // Get Labels
    final Map<String, String> pluginInfo = getFessPluginInfo(rootURL, authToken);
    final String sourceLabel = pluginInfo.get("source_label");
    final String issueLabel = pluginInfo.get("issue_label");
    final String wikiLabel = pluginInfo.get("wiki_label");
    final CrawlingConfig crawlingConfig = new CrawlingConfigWrapper(dataConfig) {

        @Override
        public Map<String, Object> initializeClientFactory(final CrawlerClientFactory crawlerClientFactory) {
            final Map<String, Object> paramMap = super.initializeClientFactory(crawlerClientFactory);
            final List<RequestHeader> headerList = new ArrayList<>();
            // note: REQUERT_HEADERS_PROPERTY (including its spelling) is the constant's actual name in fess-crawler
            final RequestHeader[] headers = (RequestHeader[]) paramMap.get(HcHttpClient.REQUERT_HEADERS_PROPERTY);
            if (headers != null) {
                for (final RequestHeader header : headers) {
                    headerList.add(header);
                }
            }
            headerList.add(new RequestHeader("Authorization", "token " + authToken));
            headerList.add(new RequestHeader("Accept", "application/vnd.github.v3.raw"));
            paramMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY, headerList.toArray(new RequestHeader[headerList.size()]));
            return paramMap;
        }
    };
    // Crawl each repository
    for (final Map<String, Object> repository : repositoryList) {
        try {
            final String owner = (String) repository.get("owner");
            final String name = (String) repository.get("name");
            final String refStr = getGitRef(rootURL, authToken, owner, name, "master");
            final int issueCount = (int) repository.get("issue_count");
            final int pullCount = (int) repository.get("pull_count");
            final List<String> roleList = createRoleList(owner, repository);
            logger.info("Crawl " + owner + "/" + name);
            // crawl and store file contents recursively
            crawlFileContents(rootURL, authToken, owner, name, refStr, StringUtil.EMPTY, 0, readInterval, path -> {
                storeFileContent(rootURL, authToken, sourceLabel, owner, name, refStr, roleList, path, crawlingConfig, callback, paramMap, scriptMap, defaultDataMap);
                if (readInterval > 0) {
                    sleep(readInterval);
                }
            });
            logger.info("Crawl issues in " + owner + "/" + name);
            // store issues
            for (int issueId = 1; issueId <= issueCount + pullCount; issueId++) {
                storeIssueById(rootURL, authToken, issueLabel, owner, name, Integer.valueOf(issueId), roleList, crawlingConfig, callback, paramMap, scriptMap, defaultDataMap);
                if (readInterval > 0) {
                    sleep(readInterval);
                }
            }
            logger.info("Crawl Wiki in " + owner + "/" + name);
            // crawl Wiki
            storeWikiContents(rootURL, authToken, wikiLabel, owner, name, roleList, crawlingConfig, callback, paramMap, scriptMap, defaultDataMap, readInterval);
        } catch (final Exception e) {
            logger.warn("Failed to access to " + repository, e);
        }
    }
}
Also used : CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) CrawlingConfigWrapper(org.codelibs.fess.es.config.exentity.CrawlingConfigWrapper) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ArrayList(java.util.ArrayList) URISyntaxException(java.net.URISyntaxException) RequestHeader(org.codelibs.fess.crawler.client.http.RequestHeader) HashMap(java.util.HashMap) Map(java.util.Map)
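The anonymous CrawlingConfigWrapper above exists only to append two authentication headers to whatever request headers the base data config already defines. As a sketch, the append step could be factored out as below; appendRequestHeaders is a hypothetical helper, not part of fess, and Collections, List, and ArrayList come from java.util.

// Hypothetical helper (not in fess): merge extra headers into the
// RequestHeader[] stored under HcHttpClient.REQUERT_HEADERS_PROPERTY,
// preserving any headers the base configuration already set.
private static void appendRequestHeaders(final Map<String, Object> paramMap, final RequestHeader... extra) {
    final List<RequestHeader> headerList = new ArrayList<>();
    final RequestHeader[] headers = (RequestHeader[]) paramMap.get(HcHttpClient.REQUERT_HEADERS_PROPERTY);
    if (headers != null) {
        Collections.addAll(headerList, headers);
    }
    Collections.addAll(headerList, extra);
    paramMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY, headerList.toArray(new RequestHeader[0]));
}

The wrapper body would then reduce to a single call: appendRequestHeaders(paramMap, new RequestHeader("Authorization", "token " + authToken), new RequestHeader("Accept", "application/vnd.github.v3.raw"));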

Example 4 with CrawlerClientFactory

Use of org.codelibs.fess.crawler.client.CrawlerClientFactory in project fess by codelibs.

From class ViewHelper, method asContentResponse:

public StreamResponse asContentResponse(final Map<String, Object> doc) {
    if (logger.isDebugEnabled()) {
        logger.debug("writing the content of: " + doc);
    }
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    final String configId = DocumentUtil.getValue(doc, fessConfig.getIndexFieldConfigId(), String.class);
    if (configId == null) {
        throw new FessSystemException("configId is null.");
    }
    if (configId.length() < 2) {
        throw new FessSystemException("Invalid configId: " + configId);
    }
    final ConfigType configType = crawlingConfigHelper.getConfigType(configId);
    CrawlingConfig config = null;
    if (logger.isDebugEnabled()) {
        logger.debug("configType: " + configType + ", configId: " + configId);
    }
    if (ConfigType.WEB == configType) {
        final WebConfigService webConfigService = ComponentUtil.getComponent(WebConfigService.class);
        config = webConfigService.getWebConfig(crawlingConfigHelper.getId(configId)).get();
    } else if (ConfigType.FILE == configType) {
        final FileConfigService fileConfigService = ComponentUtil.getComponent(FileConfigService.class);
        config = fileConfigService.getFileConfig(crawlingConfigHelper.getId(configId)).get();
    } else if (ConfigType.DATA == configType) {
        final DataConfigService dataConfigService = ComponentUtil.getComponent(DataConfigService.class);
        config = dataConfigService.getDataConfig(crawlingConfigHelper.getId(configId)).get();
    }
    if (config == null) {
        throw new FessSystemException("No crawlingConfig: " + configId);
    }
    final String url = DocumentUtil.getValue(doc, fessConfig.getIndexFieldUrl(), String.class);
    final CrawlerClientFactory crawlerClientFactory = ComponentUtil.getComponent(CrawlerClientFactory.class);
    config.initializeClientFactory(crawlerClientFactory);
    final CrawlerClient client = crawlerClientFactory.getClient(url);
    if (client == null) {
        throw new FessSystemException("No CrawlerClient: " + configId + ", url: " + url);
    }
    return writeContent(configId, url, client);
}
Also used : DataConfigService(org.codelibs.fess.app.service.DataConfigService) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) WebConfigService(org.codelibs.fess.app.service.WebConfigService) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) ConfigType(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigType) FessSystemException(org.codelibs.fess.exception.FessSystemException) FileConfigService(org.codelibs.fess.app.service.FileConfigService)
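The examples on this page reduce to the same three-step pattern: obtain a CrawlerClientFactory, let the crawling configuration initialize it, then ask the factory for a client that can handle a concrete URL. A condensed sketch, with config and url standing in for any CrawlingConfig and document URL from the surrounding context:

// The recurring pattern: the configuration contributes client settings
// (authentication, request headers, etc.), and the factory then selects
// a CrawlerClient matching the URL.
final CrawlerClientFactory crawlerClientFactory = ComponentUtil.getComponent(CrawlerClientFactory.class);
config.initializeClientFactory(crawlerClientFactory);
final CrawlerClient client = crawlerClientFactory.getClient(url);
if (client == null) {
    throw new FessSystemException("No CrawlerClient for url: " + url);
}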

Example 5 with CrawlerClientFactory

Use of org.codelibs.fess.crawler.client.CrawlerClientFactory in project fess by codelibs.

From class DocumentHelper, method processRequest:

public Map<String, Object> processRequest(final CrawlingConfig crawlingConfig, final String crawlingInfoId, final String url) {
    if (StringUtil.isBlank(crawlingInfoId)) {
        throw new CrawlingAccessException("sessionId is null.");
    }
    final CrawlerClientFactory crawlerClientFactory = ComponentUtil.getCrawlerClientFactory();
    crawlingConfig.initializeClientFactory(crawlerClientFactory);
    final CrawlerClient client = crawlerClientFactory.getClient(url);
    if (client == null) {
        throw new CrawlingAccessException("CrawlerClient is null for " + url);
    }
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        // a redirect is not followed here; it is reported back to the caller as a child URL
        if (responseData.getRedirectLocation() != null) {
            final Set<RequestData> childUrlList = new HashSet<>();
            childUrlList.add(RequestDataBuilder.newRequestData().get().url(responseData.getRedirectLocation()).build());
            throw new ChildUrlsException(childUrlList, "Redirected from " + url);
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        responseData.setSessionId(crawlingInfoId);
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            throw new CrawlingAccessException("No url rule for " + url);
        } else {
            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (responseProcessor instanceof DefaultResponseProcessor) {
                final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                final ResultData resultData = transformer.transform(responseData);
                final byte[] data = resultData.getData();
                if (data != null) {
                    try {
                        @SuppressWarnings("unchecked") final Map<String, Object> result = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                        return result;
                    } catch (final Exception e) {
                        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                    }
                }
            } else {
                throw new CrawlingAccessException("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor + ", url: " + url);
            }
        }
        return null;
    } catch (final Exception e) {
        throw new CrawlingAccessException("Failed to parse " + url, e);
    }
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) IOException(java.io.IOException) ResultData(org.codelibs.fess.crawler.entity.ResultData) RequestData(org.codelibs.fess.crawler.entity.RequestData) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) Rule(org.codelibs.fess.crawler.rule.Rule) Map(java.util.Map) HashSet(java.util.HashSet)
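A hedged usage sketch for processRequest: the caller supplies an already-chosen CrawlingConfig plus the crawling-info id that becomes the response's session id, and receives the transformed document as a map. Here documentHelper, crawlingConfig, and crawlingInfoId are assumed from the surrounding context, and the field name "title" is purely illustrative.

// Hypothetical caller: fetch one URL through the configured client chain
// and inspect the transformed result. A null return means the transformer
// produced no data; fetch or parse failures surface as CrawlingAccessException.
final Map<String, Object> doc = documentHelper.processRequest(crawlingConfig, crawlingInfoId, "https://example.com/page.html");
if (doc != null) {
    final Object title = doc.get("title");
    // ... index or log the extracted fields here
}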

Aggregations

CrawlerClientFactory (org.codelibs.fess.crawler.client.CrawlerClientFactory): 5
Map (java.util.Map): 2
CrawlerClient (org.codelibs.fess.crawler.client.CrawlerClient): 2
CrawlingConfig (org.codelibs.fess.es.config.exentity.CrawlingConfig): 2
DataStoreException (org.codelibs.fess.exception.DataStoreException): 2
IOException (java.io.IOException): 1
URISyntaxException (java.net.URISyntaxException): 1
ArrayList (java.util.ArrayList): 1
HashMap (java.util.HashMap): 1
HashSet (java.util.HashSet): 1
DataConfigService (org.codelibs.fess.app.service.DataConfigService): 1
FileConfigService (org.codelibs.fess.app.service.FileConfigService): 1
WebConfigService (org.codelibs.fess.app.service.WebConfigService): 1
RequestHeader (org.codelibs.fess.crawler.client.http.RequestHeader): 1
RequestData (org.codelibs.fess.crawler.entity.RequestData): 1
ResponseData (org.codelibs.fess.crawler.entity.ResponseData): 1
ResultData (org.codelibs.fess.crawler.entity.ResultData): 1
ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException): 1
CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException): 1
CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException): 1