use of org.codelibs.fess.crawler.entity.ResultData in project fess by codelibs.
the class AbstractFessFileTransformer method transform.
@Override
public ResultData transform(final ResponseData responseData) {
if (responseData == null || !responseData.hasResponseBody()) {
throw new CrawlingAccessException("No response body.");
}
final ResultData resultData = new ResultData();
resultData.setTransformerName(getName());
try {
resultData.setData(SerializeUtil.fromObjectToBinary(generateData(responseData)));
} catch (final Exception e) {
throw new CrawlingAccessException("Could not serialize object", e);
}
resultData.setEncoding(fessConfig.getCrawlerCrawlingDataEncoding());
return resultData;
}
use of org.codelibs.fess.crawler.entity.ResultData in project fess by codelibs.
the class FileListIndexUpdateCallbackImpl method processRequest.
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url, final CrawlerClient client) {
final long startTime = System.currentTimeMillis();
try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
if (responseData.getRedirectLocation() != null) {
return responseData.getRedirectLocation();
}
responseData.setExecutionTime(System.currentTimeMillis() - startTime);
if (dataMap.containsKey(Constants.SESSION_ID)) {
responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
} else {
responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
}
final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
final Rule rule = ruleManager.getRule(responseData);
if (rule == null) {
logger.warn("No url rule. Data: " + dataMap);
} else {
responseData.setRuleId(rule.getRuleId());
final ResponseProcessor responseProcessor = rule.getResponseProcessor();
if (responseProcessor instanceof DefaultResponseProcessor) {
final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
final ResultData resultData = transformer.transform(responseData);
final byte[] data = resultData.getData();
if (data != null) {
try {
@SuppressWarnings("unchecked") final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
dataMap.putAll(responseDataMap);
} catch (final Exception e) {
throw new CrawlerSystemException("Could not create an instance from bytes.", e);
}
}
// remove
String[] ignoreFields;
if (paramMap.containsKey("ignore.field.names")) {
ignoreFields = paramMap.get("ignore.field.names").split(",");
} else {
ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
}
stream(ignoreFields).of(stream -> stream.map(s -> s.trim()).forEach(s -> dataMap.remove(s)));
indexUpdateCallback.store(paramMap, dataMap);
} else {
logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor + ", Data: " + dataMap);
}
}
return null;
} catch (final ChildUrlsException e) {
throw new DataStoreCrawlingException(url, "Redirected to " + e.getChildUrlList().stream().map(r -> r.getUrl()).collect(Collectors.joining(", ")), e);
} catch (final Exception e) {
throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
}
}
use of org.codelibs.fess.crawler.entity.ResultData in project fess by codelibs.
the class FessXpathTransformerTest method test_processMetaRobots_noindex.
public void test_processMetaRobots_noindex() throws Exception {
final String data = "<meta name=\"robots\" content=\"noindex\" /><a href=\"index.html\">aaa</a>";
final Document document = getDocument(data);
final FessXpathTransformer transformer = new FessXpathTransformer();
final ResponseData responseData = new ResponseData();
responseData.setUrl("http://example.com/");
responseData.setResponseBody(data.getBytes());
try {
transformer.processMetaRobots(responseData, new ResultData(), document);
fail();
} catch (ChildUrlsException e) {
assertTrue(e.getChildUrlList().isEmpty());
} catch (Exception e) {
fail();
}
}
use of org.codelibs.fess.crawler.entity.ResultData in project fess by codelibs.
the class FessXpathTransformerTest method test_processMetaRobots_noindexnofollow.
public void test_processMetaRobots_noindexnofollow() throws Exception {
final String data = "<meta name=\"ROBOTS\" content=\"NOINDEX,NOFOLLOW\" />";
final Document document = getDocument(data);
final FessXpathTransformer transformer = new FessXpathTransformer();
final ResponseData responseData = new ResponseData();
responseData.setUrl("http://example.com/");
try {
transformer.processMetaRobots(responseData, new ResultData(), document);
fail();
} catch (ChildUrlsException e) {
assertTrue(e.getChildUrlList().isEmpty());
} catch (Exception e) {
fail();
}
}
use of org.codelibs.fess.crawler.entity.ResultData in project fess by codelibs.
the class FessXpathTransformerTest method test_processMetaRobots_no.
public void test_processMetaRobots_no() throws Exception {
final String data = "<html><body>foo</body></html>";
final Document document = getDocument(data);
final FessXpathTransformer transformer = new FessXpathTransformer();
final ResponseData responseData = new ResponseData();
responseData.setUrl("http://example.com/");
transformer.processMetaRobots(responseData, new ResultData(), document);
assertFalse(responseData.isNoFollow());
}
Aggregations