Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
The class WebDriverClient, method execute:
@Override
public ResponseData execute(final RequestData request) {
    WebDriver webDriver = null;
    try {
        webDriver = webDriverPool.borrowObject();

        // Optional per-request parameters are passed through the meta data string.
        Map<String, String> paramMap = null;
        final String url = request.getUrl();
        final String metaData = request.getMetaData();
        if (StringUtil.isNotBlank(metaData)) {
            paramMap = parseParamMap(metaData);
        }

        if (!url.equals(webDriver.getCurrentUrl())) {
            webDriver.get(url);
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Base URL: " + url + "\nContent: " + webDriver.getPageSource());
        }

        // If a URL action is requested, let it drive the browser before capturing the page.
        if (paramMap != null) {
            final String processorName = paramMap.get(UrlAction.URL_ACTION);
            final UrlAction urlAction = urlActionMap.get(processorName);
            if (urlAction == null) {
                throw new CrawlerSystemException("Unknown processor: " + processorName);
            }
            urlAction.navigate(webDriver, paramMap);
        }

        final String source = webDriver.getPageSource();
        final ResponseData responseData = new ResponseData();
        responseData.setUrl(webDriver.getCurrentUrl());
        responseData.setMethod(request.getMethod().name());
        responseData.setContentLength(source.length());
        final String charSet = getCharSet(webDriver);
        responseData.setCharSet(charSet);
        responseData.setHttpStatusCode(getStatusCode(webDriver));
        responseData.setLastModified(getLastModified(webDriver));
        responseData.setMimeType(getContentType(webDriver));
        responseData.setResponseBody(source.getBytes(charSet));

        // Give every registered URL action a chance to collect child URLs and data.
        for (final UrlAction urlAction : urlActionMap.values()) {
            urlAction.collect(url, webDriver, responseData);
        }

        return responseData;
    } catch (final Exception e) {
        throw new CrawlerSystemException("Failed to access " + request.getUrl(), e);
    } finally {
        if (webDriver != null) {
            try {
                webDriverPool.returnObject(webDriver);
            } catch (final Exception e) {
                logger.warn("Failed to return a borrowed object.", e);
            }
        }
    }
}
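For orientation, a minimal sketch of calling this client follows. The RequestDataBuilder chain mirrors the processRequest example further down; the webDriverClient variable and the URL are assumptions for illustration, not the project's actual wiring.

// Hedged sketch: fetch one page through a configured WebDriverClient.
final RequestData requestData = RequestDataBuilder.newRequestData()
        .get()
        .url("https://example.com/")
        .build();
try (final ResponseData responseData = webDriverClient.execute(requestData)) {
    // Status code and MIME type were populated by execute above.
    System.out.println(responseData.getHttpStatusCode() + " " + responseData.getMimeType());
}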
Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
The class ZipExtractor, method getText:
@Override
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The input stream is null.");
    }
    final MimeTypeHelper mimeTypeHelper = getMimeTypeHelper();
    final ExtractorFactory extractorFactory = getExtractorFactory();
    final StringBuilder buf = new StringBuilder(1000);
    try (final ArchiveInputStream ais = archiveStreamFactory
            .createArchiveInputStream(in.markSupported() ? in : new BufferedInputStream(in))) {
        ZipArchiveEntry entry = null;
        long contentSize = 0;
        while ((entry = (ZipArchiveEntry) ais.getNextEntry()) != null) {
            contentSize += entry.getSize();
            if (maxContentSize != -1 && contentSize > maxContentSize) {
                throw new MaxLengthExceededException("Extracted size is " + contentSize + " > " + maxContentSize);
            }
            final String filename = entry.getName();
            final String mimeType = mimeTypeHelper.getContentType(null, filename);
            if (mimeType != null) {
                final Extractor extractor = extractorFactory.getExtractor(mimeType);
                if (extractor != null) {
                    try {
                        final Map<String, String> map = new HashMap<>();
                        map.put(TikaMetadataKeys.RESOURCE_NAME_KEY, filename);
                        // Wrap the archive stream so the delegate extractor cannot close it.
                        buf.append(extractor.getText(new IgnoreCloseInputStream(ais), map).getContent());
                        buf.append('\n');
                    } catch (final Exception e) {
                        // A single broken entry should not abort the whole archive.
                        if (logger.isDebugEnabled()) {
                            logger.debug("Exception in an internal extractor.", e);
                        }
                    }
                }
            }
        }
    } catch (final MaxLengthExceededException e) {
        throw e;
    } catch (final Exception e) {
        if (buf.length() == 0) {
            throw new ExtractException("Could not extract the content.", e);
        }
    }
    return new ExtractData(buf.toString().trim());
}
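As a usage sketch, assuming a zipExtractor instance whose MimeTypeHelper and ExtractorFactory have been wired by the DI container (the file name sample.zip is illustrative):

// Hedged sketch: extract the concatenated text of every extractable zip entry.
try (final InputStream in = new FileInputStream("sample.zip")) {
    final ExtractData extractData = zipExtractor.getText(in, new HashMap<>());
    System.out.println(extractData.getContent());
}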
Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
The class AbstractXmlExtractor, method getText:
public ExtractData getText(final InputStream in, final Map<String, String> params) {
    if (in == null) {
        throw new CrawlerSystemException("The input stream is null.");
    }
    try {
        final BufferedInputStream bis = new BufferedInputStream(in);
        // Detect the declared encoding first, then decode and unescape the whole document.
        final String enc = getEncoding(bis);
        final String content = UNESCAPE_HTML4.translate(new String(InputStreamUtil.getBytes(bis), enc));
        return new ExtractData(extractString(content));
    } catch (final Exception e) {
        throw new ExtractException(e);
    }
}
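A usage sketch, assuming xmlExtractor is one of the concrete AbstractXmlExtractor implementations in fess-crawler and data.xml is an illustrative file name:

// Hedged sketch: extract text from an XML document.
try (final InputStream in = new FileInputStream("data.xml")) {
    // params is unused by the method body shown above, so null is acceptable here.
    final ExtractData extractData = xmlExtractor.getText(in, null);
    System.out.println(extractData.getContent());
}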
Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess by codelibs.
The class FileListIndexUpdateCallbackImpl, method processRequest:
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap,
        final String url, final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            return responseData.getRedirectLocation();
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        if (dataMap.containsKey(Constants.SESSION_ID)) {
            responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
        } else {
            responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
        }

        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            logger.warn("No URL rule. Data: {}", dataMap);
        } else {
            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (responseProcessor instanceof DefaultResponseProcessor) {
                final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                final ResultData resultData = transformer.transform(responseData);
                final byte[] data = resultData.getData();
                if (data != null) {
                    try {
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                        dataMap.putAll(responseDataMap);
                    } catch (final Exception e) {
                        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                    }
                }

                // Remove fields that should not be indexed.
                String[] ignoreFields;
                if (paramMap.containsKey("ignore.field.names")) {
                    ignoreFields = paramMap.get("ignore.field.names").split(",");
                } else {
                    ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
                }
                stream(ignoreFields).of(stream -> stream.map(String::trim).forEach(s -> dataMap.remove(s)));

                indexUpdateCallback.store(paramMap, dataMap);
            } else {
                logger.warn("The response processor is not a DefaultResponseProcessor. responseProcessor: {}, Data: {}",
                        responseProcessor, dataMap);
            }
        }
        return null;
    } catch (final ChildUrlsException e) {
        throw new DataStoreCrawlingException(url,
                "Redirected to " + e.getChildUrlList().stream().map(RequestData::getUrl).collect(Collectors.joining(", ")), e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
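The stream(ignoreFields).of(...) call above is codelibs' StreamUtil idiom; an equivalent with plain java.util streams, shown only to clarify what the removal step does, would be:

// Equivalent of StreamUtil.stream(ignoreFields).of(...): trim each
// configured field name and drop it from dataMap before indexing.
Arrays.stream(ignoreFields)
        .map(String::trim)
        .forEach(dataMap::remove);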
Use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
The class EsClient, method createTransportClient:
protected TransportClient createTransportClient() {
    final Settings settings = Settings.builder()
            .put("cluster.name", StringUtil.isBlank(clusterName) ? "elasticsearch" : clusterName)
            .build();
    final TransportClient transportClient = new PreBuiltTransportClient(settings);
    Arrays.stream(addresses).forEach(address -> {
        // Each address is "host" or "host:port"; the port defaults to 9300.
        final String[] values = address.split(":");
        String hostname;
        int port = 9300;
        if (values.length == 1) {
            hostname = values[0];
        } else if (values.length == 2) {
            hostname = values[0];
            port = Integer.parseInt(values[1]);
        } else {
            throw new CrawlerSystemException("Invalid address: " + address);
        }
        try {
            transportClient.addTransportAddress(new TransportAddress(InetAddress.getByName(hostname), port));
        } catch (final Exception e) {
            throw new CrawlerSystemException("Unknown host: " + address, e);
        }
        logger.info("Connected to " + hostname + ":" + port);
    });
    return transportClient;
}
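To make the accepted address format explicit, here is the same host:port parsing in isolation; this helper is a hedged sketch for illustration, not part of EsClient:

// Parses "host" or "host:port" (default port 9300), mirroring the
// validation in createTransportClient above.
static TransportAddress parseAddress(final String address) throws UnknownHostException {
    final String[] values = address.split(":");
    if (values.length > 2) {
        throw new CrawlerSystemException("Invalid address: " + address);
    }
    final int port = values.length == 2 ? Integer.parseInt(values[1]) : 9300;
    return new TransportAddress(InetAddress.getByName(values[0]), port);
}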