Use of org.codelibs.fess.exception.DataStoreCrawlingException in project fess by codelibs.
The class FileListIndexUpdateCallbackImpl, method processRequest.
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url,
        final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            return responseData.getRedirectLocation();
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        if (dataMap.containsKey(Constants.SESSION_ID)) {
            responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
        } else {
            responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
        }
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            logger.warn("No url rule. Data: " + dataMap);
        } else {
            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (responseProcessor instanceof DefaultResponseProcessor) {
                final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                final ResultData resultData = transformer.transform(responseData);
                final byte[] data = resultData.getData();
                if (data != null) {
                    try {
                        @SuppressWarnings("unchecked")
                        final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                        dataMap.putAll(responseDataMap);
                    } catch (final Exception e) {
                        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                    }
                }
                // remove fields that should not be indexed
                String[] ignoreFields;
                if (paramMap.containsKey("ignore.field.names")) {
                    ignoreFields = paramMap.get("ignore.field.names").split(",");
                } else {
                    ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
                }
                stream(ignoreFields).of(stream -> stream.map(s -> s.trim()).forEach(s -> dataMap.remove(s)));
                indexUpdateCallback.store(paramMap, dataMap);
            } else {
                logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor
                        + ", Data: " + dataMap);
            }
        }
        return null;
    } catch (final ChildUrlsException e) {
        throw new DataStoreCrawlingException(url,
                "Redirected to " + e.getChildUrlList().stream().map(r -> r.getUrl()).collect(Collectors.joining(", ")), e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
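The transformer hands its result back as a serialized byte array, which processRequest restores into a map with SerializeUtil.fromBinaryToObject. A minimal sketch of that round-trip, assuming the fromObjectToBinary/fromBinaryToObject pair in org.codelibs.core.io.SerializeUtil; the map contents here are hypothetical stand-ins for a transformer's ResultData:

import java.util.HashMap;
import java.util.Map;

import org.codelibs.core.io.SerializeUtil;

public class SerializeRoundTripSketch {
    public static void main(final String[] args) {
        // Hypothetical document fields standing in for transformer output.
        final Map<String, Object> original = new HashMap<>();
        original.put("url", "http://example.com/doc");
        original.put("title", "Example Document");
        // Serialize to bytes, as a Transformer would store them in ResultData...
        final byte[] data = SerializeUtil.fromObjectToBinary(original);
        // ...and restore them on the indexing side, as processRequest does.
        @SuppressWarnings("unchecked")
        final Map<String, Object> restored = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
        System.out.println(restored);
    }
}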
Use of org.codelibs.fess.exception.DataStoreCrawlingException in project fess by codelibs.
The class CsvDataStoreImpl, method processCsv.
protected void processCsv(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap,
        final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap, final CsvConfig csvConfig, final File csvFile,
        final long readInterval, final String csvFileEncoding, final boolean hasHeaderLine) {
    logger.info("Loading " + csvFile.getAbsolutePath());
    CsvReader csvReader = null;
    try {
        csvReader = new CsvReader(new BufferedReader(new InputStreamReader(new FileInputStream(csvFile), csvFileEncoding)), csvConfig);
        List<String> headerList = null;
        if (hasHeaderLine) {
            headerList = csvReader.readValues();
        }
        List<String> list;
        boolean loop = true;
        while ((list = csvReader.readValues()) != null && loop && alive) {
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.putAll(defaultDataMap);
            final Map<String, Object> resultMap = new LinkedHashMap<>();
            resultMap.putAll(paramMap);
            resultMap.put("csvfile", csvFile.getAbsolutePath());
            resultMap.put("csvfilename", csvFile.getName());
            resultMap.put("crawlingConfig", dataConfig);
            boolean foundValues = false;
            for (int i = 0; i < list.size(); i++) {
                String key = null;
                String value = list.get(i);
                if (value == null) {
                    value = StringUtil.EMPTY;
                }
                if (StringUtil.isNotBlank(value)) {
                    foundValues = true;
                }
                if (headerList != null && headerList.size() > i) {
                    key = headerList.get(i);
                    if (StringUtil.isNotBlank(key)) {
                        resultMap.put(key, value);
                    }
                }
                key = CELL_PREFIX + Integer.toString(i + 1);
                resultMap.put(key, value);
            }
            if (!foundValues) {
                logger.debug("No data in line: {}", resultMap);
                continue;
            }
            if (logger.isDebugEnabled()) {
                for (final Map.Entry<String, Object> entry : resultMap.entrySet()) {
                    logger.debug(entry.getKey() + "=" + entry.getValue());
                }
            }
            final Map<String, Object> crawlingContext = new HashMap<>();
            crawlingContext.put("doc", dataMap);
            resultMap.put("crawlingContext", crawlingContext);
            for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                final Object convertValue = convertValue(entry.getValue(), resultMap);
                if (convertValue != null) {
                    dataMap.put(entry.getKey(), convertValue);
                }
            }
            if (logger.isDebugEnabled()) {
                for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
                    logger.debug(entry.getKey() + "=" + entry.getValue());
                }
            }
            try {
                callback.store(paramMap, dataMap);
            } catch (final CrawlingAccessException e) {
                logger.warn("Crawling Access Exception at : " + dataMap, e);
                Throwable target = e;
                if (target instanceof MultipleCrawlingAccessException) {
                    final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                    if (causes.length > 0) {
                        target = causes[causes.length - 1];
                    }
                }
                String errorName;
                final Throwable cause = target.getCause();
                if (cause != null) {
                    errorName = cause.getClass().getCanonicalName();
                } else {
                    errorName = target.getClass().getCanonicalName();
                }
                String url;
                if (target instanceof DataStoreCrawlingException) {
                    final DataStoreCrawlingException dce = (DataStoreCrawlingException) target;
                    url = dce.getUrl();
                    if (dce.aborted()) {
                        loop = false;
                    }
                } else {
                    url = csvFile.getAbsolutePath() + ":" + csvReader.getLineNumber();
                }
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(dataConfig, errorName, url, target);
            } catch (final Throwable t) {
                logger.warn("Crawling Access Exception at : " + dataMap, t);
                final String url = csvFile.getAbsolutePath() + ":" + csvReader.getLineNumber();
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(dataConfig, t.getClass().getCanonicalName(), url, t);
            }
            if (readInterval > 0) {
                sleep(readInterval);
            }
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data when reading csv file.", e);
    } finally {
        IOUtils.closeQuietly(csvReader);
    }
}
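Each CSV cell lands in resultMap twice: under its header name (when a header line exists and the name is non-blank) and under a positional cellN key, so scripts can address columns either way. A self-contained sketch of that mapping, with hypothetical header and row values:

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class CsvRowMappingSketch {
    // Mirrors the CELL_PREFIX constant referenced in processCsv above.
    private static final String CELL_PREFIX = "cell";

    public static void main(final String[] args) {
        final List<String> headerList = Arrays.asList("id", "title"); // hypothetical header line
        final List<String> row = Arrays.asList("42", "Hello");        // hypothetical data line
        final Map<String, Object> resultMap = new LinkedHashMap<>();
        for (int i = 0; i < row.size(); i++) {
            final String value = row.get(i) == null ? "" : row.get(i);
            if (headerList.size() > i && !headerList.get(i).trim().isEmpty()) {
                resultMap.put(headerList.get(i), value); // named access: id, title
            }
            resultMap.put(CELL_PREFIX + (i + 1), value); // positional access: cell1, cell2
        }
        // Prints {id=42, cell1=42, title=Hello, cell2=Hello}
        System.out.println(resultMap);
    }
}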
Use of org.codelibs.fess.exception.DataStoreCrawlingException in project fess by codelibs.
The class FileListIndexUpdateCallbackImpl, method addDocument.
protected void addDocument(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    synchronized (indexUpdateCallback) {
        // required check
        if (!dataMap.containsKey(fessConfig.getIndexFieldUrl()) || dataMap.get(fessConfig.getIndexFieldUrl()) == null) {
            logger.warn("Could not add a doc. Invalid data: {}", dataMap);
            return;
        }
        final String url = dataMap.get(fessConfig.getIndexFieldUrl()).toString();
        final CrawlerClient client = crawlerClientFactory.getClient(url);
        if (client == null) {
            logger.warn("CrawlerClient is null. Data: {}", dataMap);
            return;
        }
        final long maxAccessCount = getMaxAccessCount(paramMap, dataMap);
        long counter = 0;
        final Deque<String> urlQueue = new LinkedList<>();
        urlQueue.offer(url);
        while (!urlQueue.isEmpty() && (maxAccessCount < 0 || counter < maxAccessCount)) {
            final Map<String, Object> localDataMap =
                    dataMap.entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
            String processingUrl = urlQueue.poll();
            if (deleteUrlList.contains(processingUrl)) {
                // delete before indexing
                deleteDocuments();
            }
            try {
                for (int i = 0; i < maxRedirectCount; i++) {
                    processingUrl = processRequest(paramMap, localDataMap, processingUrl, client);
                    if (processingUrl == null) {
                        break;
                    }
                    counter++;
                    localDataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
                }
            } catch (final ChildUrlsException e) {
                e.getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
            } catch (final DataStoreCrawlingException e) {
                final Throwable cause = e.getCause();
                if (cause instanceof ChildUrlsException) {
                    ((ChildUrlsException) cause).getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
                } else if (maxAccessCount != 1L) {
                    throw e;
                } else {
                    logger.warn("Failed to access {}.", processingUrl, e);
                }
            }
        }
    }
}
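addDocument treats processRequest's return value as a contract: a non-null result is a redirect target to follow, null means the document was handled. A simplified sketch of that bounded redirect loop, where fetchOnce and MAX_REDIRECTS are hypothetical stand-ins for processRequest and maxRedirectCount:

public class RedirectLoopSketch {
    private static final int MAX_REDIRECTS = 10; // stand-in for maxRedirectCount

    // Hypothetical stand-in for processRequest: returns the redirect target,
    // or null once the document has been handled.
    static String fetchOnce(final String url) {
        return url.startsWith("http://") ? url.replaceFirst("http://", "https://") : null;
    }

    public static void main(final String[] args) {
        String url = "http://example.com/doc";
        for (int i = 0; i < MAX_REDIRECTS; i++) {
            final String next = fetchOnce(url);
            if (next == null) {
                break; // handled; no further redirect
            }
            System.out.println("Redirected to " + next);
            url = next;
        }
    }
}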
Use of org.codelibs.fess.exception.DataStoreCrawlingException in project fess by codelibs.
The class DatabaseDataStoreImpl, method storeData.
@Override
protected void storeData(final DataConfig config, final IndexUpdateCallback callback, final Map<String, String> paramMap,
        final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
    final long readInterval = getReadInterval(paramMap);
    Connection con = null;
    Statement stmt = null;
    ResultSet rs = null;
    try {
        Class.forName(getDriverClass(paramMap));
        final String jdbcUrl = getUrl(paramMap);
        final String username = getUsername(paramMap);
        final String password = getPassword(paramMap);
        if (StringUtil.isNotEmpty(username)) {
            con = DriverManager.getConnection(jdbcUrl, username, password);
        } else {
            con = DriverManager.getConnection(jdbcUrl);
        }
        final String sql = getSql(paramMap);
        stmt = con.createStatement();
        // SQL generated by an administrator
        rs = stmt.executeQuery(sql);
        boolean loop = true;
        while (rs.next() && loop && alive) {
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.putAll(defaultDataMap);
            final Map<String, Object> crawlingContext = new HashMap<>();
            crawlingContext.put("doc", dataMap);
            for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                final Object convertValue =
                        convertValue(entry.getValue(), new ResultSetParamMap(config, crawlingContext, rs, paramMap));
                if (convertValue != null) {
                    dataMap.put(entry.getKey(), convertValue);
                }
            }
            try {
                callback.store(paramMap, dataMap);
            } catch (final CrawlingAccessException e) {
                logger.warn("Crawling Access Exception at : " + dataMap, e);
                Throwable target = e;
                if (target instanceof MultipleCrawlingAccessException) {
                    final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                    if (causes.length > 0) {
                        target = causes[causes.length - 1];
                    }
                }
                String errorName;
                final Throwable cause = target.getCause();
                if (cause != null) {
                    errorName = cause.getClass().getCanonicalName();
                } else {
                    errorName = target.getClass().getCanonicalName();
                }
                String url;
                if (target instanceof DataStoreCrawlingException) {
                    final DataStoreCrawlingException dce = (DataStoreCrawlingException) target;
                    url = dce.getUrl();
                    if (dce.aborted()) {
                        loop = false;
                    }
                } else {
                    url = sql + ":" + rs.getRow();
                }
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(config, errorName, url, target);
            } catch (final Throwable t) {
                logger.warn("Crawling Access Exception at : " + dataMap, t);
                final String url = sql + ":" + rs.getRow();
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(config, t.getClass().getCanonicalName(), url, t);
            }
            if (readInterval > 0) {
                sleep(readInterval);
            }
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data in DB.", e);
    } finally {
        try {
            if (rs != null) {
                rs.close();
            }
        } catch (final SQLException e) {
            logger.warn("Failed to close a result set.", e);
        } finally {
            try {
                if (stmt != null) {
                    stmt.close();
                }
            } catch (final SQLException e) {
                logger.warn("Failed to close a statement.", e);
            } finally {
                try {
                    if (con != null) {
                        con.close();
                    }
                } catch (final SQLException e) {
                    logger.warn("Failed to close a db connection.", e);
                }
            }
        }
    }
}
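The triple-nested finally above is the pre-Java-7 idiom for guaranteeing that the ResultSet, Statement, and Connection all close even if an earlier close fails. A sketch of the same cleanup with try-with-resources, which closes resources in reverse declaration order automatically; the in-memory JDBC URL and query are hypothetical:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

public class JdbcCleanupSketch {
    public static void main(final String[] args) throws SQLException {
        final String jdbcUrl = "jdbc:h2:mem:test"; // hypothetical in-memory database URL
        try (Connection con = DriverManager.getConnection(jdbcUrl);
                Statement stmt = con.createStatement();
                ResultSet rs = stmt.executeQuery("SELECT 1")) { // hypothetical query
            while (rs.next()) {
                System.out.println(rs.getInt(1));
            }
        } // rs, stmt, and con close here, in that order, even on exception
    }
}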