use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess by codelibs.
the class FileListIndexUpdateCallbackImpl method processRequest.
/**
 * Fetches {@code url} with the given client, transforms the response with the
 * transformer of the matching rule, merges the deserialized result into
 * {@code dataMap} and passes it to {@code indexUpdateCallback.store}.
 *
 * @param paramMap crawl parameters; read for {@code Constants.CRAWLING_INFO_ID}
 *                 and the optional {@code "ignore.field.names"} entry
 * @param dataMap  document data; updated in place with the transformed fields
 * @param url      the URL to request
 * @param client   the client used to execute the GET request
 * @return the redirect location when the response carries one, otherwise {@code null}
 * @throws DataStoreCrawlingException if the request or any later step fails
 */
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url,
        final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        // A redirect is reported to the caller; nothing is stored for it.
        final String redirectLocation = responseData.getRedirectLocation();
        if (redirectLocation != null) {
            return redirectLocation;
        }

        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        // Prefer a session id carried by the data itself; fall back to the crawling info id.
        final String sessionId = dataMap.containsKey(Constants.SESSION_ID) ? (String) dataMap.get(Constants.SESSION_ID)
                : paramMap.get(Constants.CRAWLING_INFO_ID);
        responseData.setSessionId(sessionId);

        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            logger.warn("No url rule. Data: " + dataMap);
            return null;
        }

        responseData.setRuleId(rule.getRuleId());
        final ResponseProcessor responseProcessor = rule.getResponseProcessor();
        if (!(responseProcessor instanceof DefaultResponseProcessor)) {
            logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor + ", Data: "
                    + dataMap);
            return null;
        }

        final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
        final ResultData resultData = transformer.transform(responseData);
        final byte[] serialized = resultData.getData();
        if (serialized != null) {
            try {
                @SuppressWarnings("unchecked")
                final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(serialized);
                dataMap.putAll(responseDataMap);
            } catch (final Exception e) {
                throw new CrawlerSystemException("Could not create an instance from bytes.", e);
            }
        }

        // Drop the fields that must not be stored: either the configured
        // comma-separated list or the two default bookkeeping keys.
        final String[] ignoreFields = paramMap.containsKey("ignore.field.names") ? paramMap.get("ignore.field.names").split(",")
                : new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
        stream(ignoreFields).of(fields -> fields.map(String::trim).forEach(dataMap::remove));

        indexUpdateCallback.store(paramMap, dataMap);
        return null;
    } catch (final ChildUrlsException e) {
        throw new DataStoreCrawlingException(url,
                "Redirected to " + e.getChildUrlList().stream().map(r -> r.getUrl()).collect(Collectors.joining(", ")), e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
the class FileTransformer method storeData.
/**
 * Copies the response body into a file obtained from {@code createFile} and
 * records the file path (as bytes in the configured charset) in the result data.
 *
 * @param responseData the response whose body is written to disk
 * @param resultData   receives the transformer name, the encoded path and the encoding
 * @throws CrawlerSystemException if the body cannot be written to the file
 */
@Override
public void storeData(final ResponseData responseData, final ResultData resultData) {
    resultData.setTransformerName(getName());
    initBaseDir();

    final String path = getFilePath(responseData.getUrl());

    // File creation and the copy run under this instance's monitor.
    synchronized (this) {
        final File destination = createFile(path);
        try (final InputStream body = responseData.getResponseBody(); final OutputStream sink = new FileOutputStream(destination)) {
            CopyUtil.copy(body, sink);
        } catch (final IOException e) {
            throw new CrawlerSystemException("Could not store " + destination.getAbsolutePath(), e);
        }
    }

    byte[] encodedPath;
    try {
        encodedPath = path.getBytes(charsetName);
    } catch (final UnsupportedEncodingException e) {
        if (logger.isInfoEnabled()) {
            logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
        }
        // The configured charset is unusable: switch this transformer to UTF-8 from now on.
        charsetName = Constants.UTF_8_CHARSET.name();
        encodedPath = path.getBytes(Constants.UTF_8_CHARSET);
    }
    resultData.setData(encodedPath);
    resultData.setEncoding(charsetName);
}
use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
the class FileTransformer method createFile.
/**
 * Resolves a slash-separated path to a not-yet-existing file location under
 * {@code baseDir}, creating the intermediate directories on the way.
 *
 * <p>When an intermediate segment already exists as a non-directory, the
 * candidates {@code segment_0 .. segment_(maxDuplicatedPath-1)} are tried in
 * order: the first one that is a directory is reused, or the first one that
 * does not exist is created. When the final segment already exists, the first
 * non-existing {@code name_0 .. name_(maxDuplicatedPath-1)} is returned.</p>
 *
 * @param path slash-separated path relative to {@code baseDir}
 * @return the file location to write to; the file itself is not created here
 * @throws CrawlerSystemException if the path has no segments, a directory
 *         cannot be created, or all duplicate candidates are exhausted
 */
protected File createFile(final String path) {
    final String[] paths = path.split("/");
    if (paths.length == 0) {
        // e.g. "/" splits into an empty array; indexing it below would fail.
        throw new CrawlerSystemException("Could not create a file for path: " + path);
    }

    File targetFile = baseDir;
    for (int i = 0; i < paths.length - 1; i++) {
        File file = new File(targetFile, paths[i]);
        if (!file.exists()) {
            if (!file.mkdirs()) {
                throw new CrawlerSystemException("Could not create " + file.getAbsolutePath());
            }
        } else if (!file.isDirectory()) {
            // The segment name is taken by a regular file: look for a suffixed directory.
            File directory = null;
            for (int j = 0; j < maxDuplicatedPath; j++) {
                final File candidate = new File(targetFile, paths[i] + "_" + j);
                if (!candidate.exists()) {
                    if (!candidate.mkdirs()) {
                        throw new CrawlerSystemException("Could not create " + candidate.getAbsolutePath());
                    }
                    directory = candidate;
                    break;
                }
                if (candidate.isDirectory()) {
                    directory = candidate;
                    break;
                }
            }
            if (directory == null) {
                // Previously the last (non-directory) candidate was silently used as the parent.
                throw new CrawlerSystemException("Could not create " + file.getAbsolutePath());
            }
            file = directory;
        }
        targetFile = file;
    }

    final String fileName = paths[paths.length - 1];
    final File file = new File(targetFile, fileName);
    if (!file.exists()) {
        return file;
    }
    for (int i = 0; i < maxDuplicatedPath; i++) {
        final File candidate = new File(targetFile, fileName + "_" + i);
        if (!candidate.exists()) {
            return candidate;
        }
    }
    // Previously the parent directory was returned here, which is not a writable file target.
    throw new CrawlerSystemException("Could not create " + file.getAbsolutePath());
}
use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
the class HtmlTransformer method storeChildUrls.
/**
 * Parses the response body as a DOM document, collects child URLs according
 * to {@code childUrlRuleMap} and adds them to the result data, then removes
 * the request's own URL and its duplicate form from the result.
 *
 * @param responseData the response whose body is parsed
 * @param resultData   receives the collected child URLs
 * @throws CrawlerSystemException if parsing or URL collection fails
 */
protected void storeChildUrls(final ResponseData responseData, final ResultData resultData) {
    try (final InputStream body = responseData.getResponseBody()) {
        final DOMParser domParser = getDomParser();
        domParser.parse(new InputSource(body));
        final Document doc = domParser.getDocument();

        // Resolve links against the document's base href when present and
        // well-formed; otherwise against the response URL.
        final String baseHref = getBaseHref(doc);
        final String baseLocation = baseHref != null ? baseHref : responseData.getUrl();
        URL baseUrl;
        try {
            baseUrl = new URL(baseLocation);
        } catch (final MalformedURLException e) {
            baseUrl = new URL(responseData.getUrl());
        }

        final List<RequestData> extracted = new ArrayList<>();
        for (final Map.Entry<String, String> rule : childUrlRuleMap.entrySet()) {
            final String ruleKey = rule.getKey();
            final String ruleValue = rule.getValue();
            for (final String childUrl : getUrlFromTagAttribute(baseUrl, doc, ruleKey, ruleValue, responseData.getCharSet())) {
                extracted.add(RequestDataBuilder.newRequestData().get().url(childUrl).build());
            }
        }

        final List<RequestData> converted = convertChildUrlList(extracted);
        resultData.addAllUrl(converted);
        resultData.addAllUrl(responseData.getChildUrlSet());

        // The page must not list itself as a child.
        final RequestData self = responseData.getRequestData();
        resultData.removeUrl(self);
        resultData.removeUrl(getDuplicateUrl(self));
    } catch (final CrawlerSystemException e) {
        // Already the right type: propagate without wrapping.
        throw e;
    } catch (final Exception e) {
        throw new CrawlerSystemException("Could not store data.", e);
    }
}
use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
the class TextTransformer method transform.
/**
 * Extracts text from the response body with the extractor registered for the
 * response's MIME type and returns it as bytes in the configured charset.
 *
 * @param responseData the response to transform; must have a response body
 * @return result data holding the transformer name, the encoded text and the encoding
 * @throws CrawlingAccessException if there is no response body or extraction fails
 * @throws CrawlerSystemException  if the {@code extractorFactory} component is missing
 */
@Override
public ResultData transform(final ResponseData responseData) {
    if (responseData == null || !responseData.hasResponseBody()) {
        throw new CrawlingAccessException("No response body.");
    }

    final ExtractorFactory extractorFactory = crawlerContainer.getComponent("extractorFactory");
    if (extractorFactory == null) {
        throw new CrawlerSystemException("Could not find extractorFactory.");
    }
    final Extractor extractor = extractorFactory.getExtractor(responseData.getMimeType());

    // Parameters passed to the extractor along with the body stream.
    final Map<String, String> extractorParams = new HashMap<>();
    extractorParams.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    extractorParams.put(HttpHeaders.CONTENT_TYPE, responseData.getMimeType());

    final String content;
    try (final InputStream body = responseData.getResponseBody()) {
        content = extractor.getText(body, extractorParams).getContent();
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not extract data.", e);
    }

    byte[] encoded;
    try {
        encoded = content.getBytes(charsetName);
    } catch (final UnsupportedEncodingException e) {
        if (logger.isInfoEnabled()) {
            logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
        }
        // The configured charset is unusable: switch this transformer to UTF-8 from now on.
        charsetName = Constants.UTF_8_CHARSET.name();
        encoded = content.getBytes(Constants.UTF_8_CHARSET);
    }

    final ResultData resultData = new ResultData();
    resultData.setTransformerName(getName());
    resultData.setData(encoded);
    resultData.setEncoding(charsetName);
    return resultData;
}
Aggregations