Use of org.codelibs.fess.crawler.client.CrawlerClientFactory in project fess by codelibs.
The class CsvListDataStoreImpl, method storeData:
@Override
protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
    // Read the optional thread-count parameter, falling back to a single thread.
    int nThreads = 1;
    if (paramMap.containsKey(Constants.NUM_OF_THREADS)) {
        try {
            nThreads = Integer.parseInt(paramMap.get(Constants.NUM_OF_THREADS));
        } catch (final NumberFormatException e) {
            logger.warn(Constants.NUM_OF_THREADS + " is not int value.", e);
        }
    }
    // Prepare a crawler client factory configured by this data config.
    final CrawlerClientFactory crawlerClientFactory = ComponentUtil.getCrawlerClientFactory();
    dataConfig.initializeClientFactory(crawlerClientFactory);
    try {
        // Wrap the callback so listed files are fetched and indexed by nThreads workers.
        final FileListIndexUpdateCallbackImpl fileListIndexUpdateCallback = new FileListIndexUpdateCallbackImpl(callback, crawlerClientFactory, nThreads);
        super.storeData(dataConfig, fileListIndexUpdateCallback, paramMap, scriptMap, defaultDataMap);
        fileListIndexUpdateCallback.commit();
    } catch (final Exception e) {
        throw new DataStoreException(e);
    }
}
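The parsing above tolerates bad input: a non-numeric value logs a warning and leaves the single-thread default in place. A minimal sketch of the same resolution logic in isolation (the literal key "numOfThreads" is an assumption about the value of Constants.NUM_OF_THREADS):

// Sketch: resolve the thread count the way storeData does, defaulting
// to 1 when the parameter is absent or not a valid integer.
static int resolveThreadCount(final Map<String, String> paramMap) {
    final String value = paramMap.get("numOfThreads"); // assumed value of Constants.NUM_OF_THREADS
    if (value != null) {
        try {
            return Integer.parseInt(value);
        } catch (final NumberFormatException ignored) {
            // fall through to the single-thread default
        }
    }
    return 1;
}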
Use of org.codelibs.fess.crawler.client.CrawlerClientFactory in project fess by codelibs.
The class EsListDataStoreImpl, method storeData:
@Override
protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
    // Read the optional thread-count parameter, falling back to a single thread.
    int nThreads = 1;
    if (paramMap.containsKey(Constants.NUM_OF_THREADS)) {
        try {
            nThreads = Integer.parseInt(paramMap.get(Constants.NUM_OF_THREADS));
        } catch (final NumberFormatException e) {
            logger.warn(Constants.NUM_OF_THREADS + " is not int value.", e);
        }
    }
    // Prepare a crawler client factory configured by this data config.
    final CrawlerClientFactory crawlerClientFactory = ComponentUtil.getCrawlerClientFactory();
    dataConfig.initializeClientFactory(crawlerClientFactory);
    try {
        // Wrap the callback so listed files are fetched and indexed by nThreads workers.
        final FileListIndexUpdateCallbackImpl fileListIndexUpdateCallback = new FileListIndexUpdateCallbackImpl(callback, crawlerClientFactory, nThreads);
        super.storeData(dataConfig, fileListIndexUpdateCallback, paramMap, scriptMap, defaultDataMap);
        fileListIndexUpdateCallback.commit();
    } catch (final Exception e) {
        throw new DataStoreException(e);
    }
}
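This body is identical to the CsvListDataStoreImpl version above; the two data stores differ only in how the parent class reads rows from the source. A hedged sketch of how the duplicated wiring could be pulled into a shared helper (createFileListCallback is a hypothetical name, not part of fess):

// Hypothetical shared helper; both storeData overrides could delegate here.
private FileListIndexUpdateCallbackImpl createFileListCallback(final DataConfig dataConfig, final IndexUpdateCallback callback, final int nThreads) {
    final CrawlerClientFactory crawlerClientFactory = ComponentUtil.getCrawlerClientFactory();
    dataConfig.initializeClientFactory(crawlerClientFactory);
    return new FileListIndexUpdateCallbackImpl(callback, crawlerClientFactory, nThreads);
}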
Use of org.codelibs.fess.crawler.client.CrawlerClientFactory in project fess by codelibs.
The class GitBucketDataStoreImpl, method storeData:
@Override
protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
    final String rootURL = getRootURL(paramMap);
    final String authToken = getAuthToken(paramMap);
    final long readInterval = getReadInterval(paramMap);
    // Both the GitBucket URL and the auth token are required.
    if (rootURL.isEmpty() || authToken.isEmpty()) {
        logger.warn("parameters \"" + TOKEN_PARAM + "\" and \"" + GITBUCKET_URL_PARAM + "\" are required");
        return;
    }
    // Get the list of repositories visible to this token.
    final List<Map<String, Object>> repositoryList = getRepositoryList(rootURL, authToken);
    if (repositoryList.isEmpty()) {
        logger.warn("Token is invalid or no Repository");
        return;
    }
    // Get the labels used to tag source, issue, and wiki documents.
    final Map<String, String> pluginInfo = getFessPluginInfo(rootURL, authToken);
    final String sourceLabel = pluginInfo.get("source_label");
    final String issueLabel = pluginInfo.get("issue_label");
    final String wikiLabel = pluginInfo.get("wiki_label");
    // Wrap the data config so every crawler request carries the GitBucket auth headers.
    final CrawlingConfig crawlingConfig = new CrawlingConfigWrapper(dataConfig) {
        @Override
        public Map<String, Object> initializeClientFactory(final CrawlerClientFactory crawlerClientFactory) {
            final Map<String, Object> paramMap = super.initializeClientFactory(crawlerClientFactory);
            final List<RequestHeader> headerList = new ArrayList<>();
            final RequestHeader[] headers = (RequestHeader[]) paramMap.get(HcHttpClient.REQUERT_HEADERS_PROPERTY);
            if (headers != null) {
                for (final RequestHeader header : headers) {
                    headerList.add(header);
                }
            }
            headerList.add(new RequestHeader("Authorization", "token " + authToken));
            headerList.add(new RequestHeader("Accept", "application/vnd.github.v3.raw"));
            paramMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY, headerList.toArray(new RequestHeader[headerList.size()]));
            return paramMap;
        }
    };
    // Crawl each repository.
    for (final Map<String, Object> repository : repositoryList) {
        try {
            final String owner = (String) repository.get("owner");
            final String name = (String) repository.get("name");
            final String refStr = getGitRef(rootURL, authToken, owner, name, "master");
            final int issueCount = (int) repository.get("issue_count");
            final int pullCount = (int) repository.get("pull_count");
            final List<String> roleList = createRoleList(owner, repository);
            logger.info("Crawl " + owner + "/" + name);
            // Crawl and store file contents recursively.
            crawlFileContents(rootURL, authToken, owner, name, refStr, StringUtil.EMPTY, 0, readInterval, path -> {
                storeFileContent(rootURL, authToken, sourceLabel, owner, name, refStr, roleList, path, crawlingConfig, callback, paramMap, scriptMap, defaultDataMap);
                if (readInterval > 0) {
                    sleep(readInterval);
                }
            });
            logger.info("Crawl issues in " + owner + "/" + name);
            // Store issues; issue and pull request ids share one number sequence.
            for (int issueId = 1; issueId <= issueCount + pullCount; issueId++) {
                storeIssueById(rootURL, authToken, issueLabel, owner, name, issueId, roleList, crawlingConfig, callback, paramMap, scriptMap, defaultDataMap);
                if (readInterval > 0) {
                    sleep(readInterval);
                }
            }
            logger.info("Crawl Wiki in " + owner + "/" + name);
            // Crawl the wiki pages.
            storeWikiContents(rootURL, authToken, wikiLabel, owner, name, roleList, crawlingConfig, callback, paramMap, scriptMap, defaultDataMap, readInterval);
        } catch (final Exception e) {
            logger.warn("Failed to access " + repository, e);
        }
    }
}
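The anonymous CrawlingConfigWrapper is the key CrawlerClientFactory interaction here: it augments whatever request headers the base DataConfig already configures with GitBucket credentials. A standalone sketch of that header-merging pattern (the misspelled REQUERT_HEADERS_PROPERTY is the constant's actual name in fess-crawler's HcHttpClient; the method name addAuthHeaders is illustrative):

// Sketch: append auth headers to the client factory's parameter map while
// preserving any request headers the base configuration already defined.
static void addAuthHeaders(final Map<String, Object> paramMap, final String authToken) {
    final List<RequestHeader> headerList = new ArrayList<>();
    final RequestHeader[] headers = (RequestHeader[]) paramMap.get(HcHttpClient.REQUERT_HEADERS_PROPERTY);
    if (headers != null) {
        Collections.addAll(headerList, headers);
    }
    headerList.add(new RequestHeader("Authorization", "token " + authToken));
    headerList.add(new RequestHeader("Accept", "application/vnd.github.v3.raw"));
    paramMap.put(HcHttpClient.REQUERT_HEADERS_PROPERTY, headerList.toArray(new RequestHeader[0]));
}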
Use of org.codelibs.fess.crawler.client.CrawlerClientFactory in project fess by codelibs.
The class ViewHelper, method asContentResponse:
public StreamResponse asContentResponse(final Map<String, Object> doc) {
    if (logger.isDebugEnabled()) {
        logger.debug("writing the content of: " + doc);
    }
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    final String configId = DocumentUtil.getValue(doc, fessConfig.getIndexFieldConfigId(), String.class);
    if (configId == null) {
        throw new FessSystemException("configId is null.");
    }
    if (configId.length() < 2) {
        throw new FessSystemException("Invalid configId: " + configId);
    }
    final ConfigType configType = crawlingConfigHelper.getConfigType(configId);
    CrawlingConfig config = null;
    if (logger.isDebugEnabled()) {
        logger.debug("configType: " + configType + ", configId: " + configId);
    }
    // Resolve the crawling configuration that produced this document.
    if (ConfigType.WEB == configType) {
        final WebConfigService webConfigService = ComponentUtil.getComponent(WebConfigService.class);
        config = webConfigService.getWebConfig(crawlingConfigHelper.getId(configId)).get();
    } else if (ConfigType.FILE == configType) {
        final FileConfigService fileConfigService = ComponentUtil.getComponent(FileConfigService.class);
        config = fileConfigService.getFileConfig(crawlingConfigHelper.getId(configId)).get();
    } else if (ConfigType.DATA == configType) {
        final DataConfigService dataConfigService = ComponentUtil.getComponent(DataConfigService.class);
        config = dataConfigService.getDataConfig(crawlingConfigHelper.getId(configId)).get();
    }
    if (config == null) {
        throw new FessSystemException("No crawlingConfig: " + configId);
    }
    final String url = DocumentUtil.getValue(doc, fessConfig.getIndexFieldUrl(), String.class);
    // Build a crawler client for the document's URL using the config's settings.
    final CrawlerClientFactory crawlerClientFactory = ComponentUtil.getComponent(CrawlerClientFactory.class);
    config.initializeClientFactory(crawlerClientFactory);
    final CrawlerClient client = crawlerClientFactory.getClient(url);
    if (client == null) {
        throw new FessSystemException("No CrawlerClient: " + configId + ", url: " + url);
    }
    return writeContent(configId, url, client);
}
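The three-way ConfigType dispatch could equivalently be written as a switch; a behavior-preserving sketch, assuming configType is non-null (the original reaches the same "No crawlingConfig" failure through the null check rather than a default branch):

// Sketch: the same config lookup as the if/else chain above.
final CrawlingConfig config;
switch (configType) {
case WEB:
    config = ComponentUtil.getComponent(WebConfigService.class).getWebConfig(crawlingConfigHelper.getId(configId)).get();
    break;
case FILE:
    config = ComponentUtil.getComponent(FileConfigService.class).getFileConfig(crawlingConfigHelper.getId(configId)).get();
    break;
case DATA:
    config = ComponentUtil.getComponent(DataConfigService.class).getDataConfig(crawlingConfigHelper.getId(configId)).get();
    break;
default:
    throw new FessSystemException("No crawlingConfig: " + configId);
}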
Use of org.codelibs.fess.crawler.client.CrawlerClientFactory in project fess by codelibs.
The class DocumentHelper, method processRequest:
public Map<String, Object> processRequest(final CrawlingConfig crawlingConfig, final String crawlingInfoId, final String url) {
    if (StringUtil.isBlank(crawlingInfoId)) {
        throw new CrawlingAccessException("sessionId is null.");
    }
    // Build a crawler client for this URL from the crawling config's settings.
    final CrawlerClientFactory crawlerClientFactory = ComponentUtil.getCrawlerClientFactory();
    crawlingConfig.initializeClientFactory(crawlerClientFactory);
    final CrawlerClient client = crawlerClientFactory.getClient(url);
    if (client == null) {
        throw new CrawlingAccessException("CrawlerClient is null for " + url);
    }
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        // A redirect is reported to the caller as a child URL rather than followed here.
        if (responseData.getRedirectLocation() != null) {
            final Set<RequestData> childUrlList = new HashSet<>();
            childUrlList.add(RequestDataBuilder.newRequestData().get().url(responseData.getRedirectLocation()).build());
            throw new ChildUrlsException(childUrlList, "Redirected from " + url);
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        responseData.setSessionId(crawlingInfoId);
        // Pick the rule that matches this response and run its transformer.
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            throw new CrawlingAccessException("No url rule for " + url);
        } else {
            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (responseProcessor instanceof DefaultResponseProcessor) {
                final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                final ResultData resultData = transformer.transform(responseData);
                final byte[] data = resultData.getData();
                if (data != null) {
                    try {
                        // Deserialize the transformer's output back into a document map.
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> result = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                        return result;
                    } catch (final Exception e) {
                        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                    }
                }
            } else {
                throw new CrawlingAccessException("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor + ", url: " + url);
            }
        }
        return null;
    } catch (final Exception e) {
        throw new CrawlingAccessException("Failed to parse " + url, e);
    }
}
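A hedged usage sketch (documentHelper and crawlingConfig are illustrative names; they would come from the surrounding component). Note that, as written above, the catch-all wraps every failure, including the redirect's ChildUrlsException, in a CrawlingAccessException, so a redirect surfaces to the caller through the exception's cause:

final String url = "https://example.com/doc.pdf";
try {
    final Map<String, Object> dataMap = documentHelper.processRequest(crawlingConfig, "20170401120000-1", url);
    if (dataMap != null) {
        // dataMap holds the transformer's output fields, ready for indexing.
    }
} catch (final CrawlingAccessException e) {
    // Raised when no client or rule matches, parsing fails, or the URL
    // redirected; in the redirect case e.getCause() is the ChildUrlsException
    // carrying the redirect target as RequestData entries.
    logger.warn("Could not process " + url, e);
}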