use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.
the class FtpClientTest method test_doGet_dir1.
/**
 * Requests the FTP directory {@code /dir1} and expects the client to answer
 * with a {@link ChildUrlsException} instead of a normal response.
 *
 * The exception must carry exactly one child URL, and that URL must contain
 * {@code dir1/test3.txt}. The embedded FTP server is stopped in all cases.
 *
 * @throws FtpException declared for the embedded FTP server start/stop calls
 */
public void test_doGet_dir1() throws FtpException {
    final String user = "testuser";
    final String pass = "testpass";
    FtpServer ftpServer = null;
    try {
        ftpServer = startFtpServer(FTP_PORT, user, pass);

        // Register the single credential the client should use.
        final FtpAuthentication credential = new FtpAuthentication();
        credential.setUsername(user);
        credential.setPassword(pass);
        final Map<String, Object> initParams = new HashMap<String, Object>();
        initParams.put(FtpClient.FTP_AUTHENTICATIONS_PROPERTY, new FtpAuthentication[] { credential });
        ftpClient.setInitParameterMap(initParams);

        ftpClient.doGet("ftp://localhost:" + FTP_PORT + "/dir1");
        // Reaching this line means no ChildUrlsException was thrown.
        fail();
    } catch (final ChildUrlsException e) {
        final Set<RequestData> children = e.getChildUrlList();
        assertEquals(1, children.size());
        for (final RequestData child : children) {
            assertTrue(child.getUrl().contains("dir1/test3.txt"));
        }
    } finally {
        if (ftpServer != null) {
            ftpServer.stop();
        }
    }
}
use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess by codelibs.
the class DocumentHelper method processRequest.
/**
 * Fetches {@code url} with the crawler client resolved from the given
 * crawling configuration, runs the matching rule's transformer on the
 * response, and returns the map deserialized from the transformer's output.
 *
 * @param crawlingConfig configuration used to initialize the client factory
 * @param crawlingInfoId value stored on the response as its session id; must not be blank
 * @param url the URL to fetch
 * @return the deserialized map, or {@code null} when the transformer produced no data
 * @throws CrawlingAccessException if {@code crawlingInfoId} is blank, if no client is
 *         available for {@code url}, or if anything fails while fetching or
 *         transforming (the original failure is attached as the cause)
 */
public Map<String, Object> processRequest(final CrawlingConfig crawlingConfig, final String crawlingInfoId, final String url) {
    if (StringUtil.isBlank(crawlingInfoId)) {
        throw new CrawlingAccessException("sessionId is null.");
    }

    final CrawlerClient client = crawlingConfig.initializeClientFactory(ComponentUtil::getCrawlerClientFactory).getClient(url);
    if (client == null) {
        throw new CrawlingAccessException("CrawlerClient is null for " + url);
    }

    final long startTime = System.currentTimeMillis();
    // The request is built inside the resource expression on purpose: a failure
    // there is then wrapped by the catch below, like every other failure.
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            // NOTE(review): this exception is caught by the catch-all below, so
            // callers see it as the cause of a CrawlingAccessException.
            final Set<RequestData> redirectTargets = new HashSet<>();
            redirectTargets.add(RequestDataBuilder.newRequestData().get().url(responseData.getRedirectLocation()).build());
            throw new ChildUrlsException(redirectTargets, getClass().getName() + "#RedirectedFrom:" + url);
        }

        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        responseData.setSessionId(crawlingInfoId);

        final Rule rule = SingletonLaContainer.getComponent(RuleManager.class).getRule(responseData);
        if (rule == null) {
            throw new CrawlingAccessException("No url rule for " + url);
        }
        responseData.setRuleId(rule.getRuleId());

        final ResponseProcessor responseProcessor = rule.getResponseProcessor();
        if (!(responseProcessor instanceof DefaultResponseProcessor)) {
            throw new CrawlingAccessException("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor + ", url: " + url);
        }

        final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
        final byte[] data = transformer.transform(responseData).getData();
        if (data == null) {
            return null;
        }
        try {
            return (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
        } catch (final Exception e) {
            throw new CrawlerSystemException("Could not create an instance from bytes.", e);
        }
    } catch (final Exception e) {
        throw new CrawlingAccessException("Failed to parse " + url, e);
    }
}
use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess by codelibs.
the class FessXpathTransformerTest method test_processXRobotsTag_noindex.
/**
 * With robots tags not ignored, a response carrying the metadata
 * {@code X-Robots-Tag: noindex} must make {@code processXRobotsTag} throw a
 * {@link ChildUrlsException} whose child URL list is empty.
 *
 * Any other exception propagates out of the test, so the runner reports its
 * type, message and stack trace rather than an uninformative {@code fail()}.
 *
 * @throws Exception if the transformer fails in an unexpected way
 */
public void test_processXRobotsTag_noindex() throws Exception {
    final String data = "<meta name=\"robots\" content=\"noindex\" /><a href=\"index.html\">aaa</a>";

    // The stub returns an empty config parameter map for every request.
    final FessXpathTransformer transformer = new FessXpathTransformer() {
        protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
            return Collections.emptyMap();
        }
    };
    // Robots tags must be honoured for this test.
    transformer.fessConfig = new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean isCrawlerIgnoreRobotsTags() {
            return false;
        }
    };

    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/");
    responseData.setResponseBody(data.getBytes());
    responseData.addMetaData("X-Robots-Tag", "noindex");

    try {
        transformer.processXRobotsTag(responseData, new ResultData());
        // Reaching this line means no ChildUrlsException was thrown.
        fail();
    } catch (final ChildUrlsException e) {
        assertTrue(e.getChildUrlList().isEmpty());
    }
}
use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess by codelibs.
the class FessXpathTransformerTest method test_processMetaRobots_none.
/**
 * With robots tags not ignored, a document containing
 * {@code <meta name="robots" content="none" />} must make
 * {@code processMetaRobots} throw a {@link ChildUrlsException} whose child
 * URL list is empty.
 *
 * Any other exception propagates out of the test, so the runner reports its
 * type, message and stack trace rather than an uninformative {@code fail()}.
 *
 * @throws Exception if parsing the document or the transformer fails unexpectedly
 */
public void test_processMetaRobots_none() throws Exception {
    final String data = "<meta name=\"robots\" content=\"none\" />";
    final Document document = getDocument(data);

    // The stub returns an empty config parameter map for every request.
    final FessXpathTransformer transformer = new FessXpathTransformer() {
        protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
            return Collections.emptyMap();
        }
    };
    // Robots tags must be honoured for this test.
    transformer.fessConfig = new FessConfig.SimpleImpl() {
        private static final long serialVersionUID = 1L;

        @Override
        public boolean isCrawlerIgnoreRobotsTags() {
            return false;
        }
    };

    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/");

    try {
        transformer.processMetaRobots(responseData, new ResultData(), document);
        // Reaching this line means no ChildUrlsException was thrown.
        fail();
    } catch (final ChildUrlsException e) {
        assertTrue(e.getChildUrlList().isEmpty());
    }
}
Aggregations