use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
the class XpathTransformerTest method test_getData_wrongName.
public void test_getData_wrongName() throws Exception {
    // Japanese sample text: "title" and "Chapter 1, Section 1 ... Chapter 2, Section 2"
    final String value = //
            "<?xml version=\"1.0\"?>\n" + //
            "<doc>\n" + //
            "<field name=\"title\">タイトル</field>\n" + //
            "<field name=\"body\">第一章 第一節 ほげほげふがふが LINK 第2章 第2節</field>\n" + //
            "</doc>";
    final AccessResultDataImpl accessResultDataImpl = new AccessResultDataImpl();
    accessResultDataImpl.setData(value.getBytes(Constants.UTF_8));
    accessResultDataImpl.setEncoding(Constants.UTF_8);
    // "transformer" does not match this XpathTransformer's registered name
    accessResultDataImpl.setTransformerName("transformer");
    try {
        xpathTransformer.getData(accessResultDataImpl);
        fail();
    } catch (final CrawlerSystemException e) {
        // expected: the data was stored under a different transformer's name
    }
}
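The test relies on getData() validating the transformer name before parsing. A minimal sketch of that guard, assuming (this is not the verbatim fess-crawler source) that the stored name must match the transformer's own name:

// Hypothetical guard in an XpathTransformer-like class; the message text
// and the fallback parsing are assumptions for illustration.
public Object getData(final AccessResultData accessResultData) {
    if (!getName().equals(accessResultData.getTransformerName())) {
        throw new CrawlerSystemException("Transformer is invalid. Use "
                + accessResultData.getTransformerName() + ".");
    }
    try {
        // interpret the stored bytes with the recorded encoding
        return new String(accessResultData.getData(), accessResultData.getEncoding());
    } catch (final UnsupportedEncodingException e) {
        throw new CrawlerSystemException("Invalid encoding.", e);
    }
}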
use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
the class CrawlerWebServer method stop.
public void stop() {
    try {
        server.stop();
        server.join();
    } catch (final Exception e) {
        throw new CrawlerSystemException(e);
    } finally {
        if (tempDocRoot) {
            docRoot.delete();
        }
    }
}
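For context, a hedged usage sketch pairing stop() with its counterpart; the port-taking constructor and start() method are assumptions inferred from this class, not a verified API:

// Assumed API: CrawlerWebServer(int port) plus a start() mirroring stop() above.
final CrawlerWebServer server = new CrawlerWebServer(7070);
server.start();
try {
    // exercise the crawler against http://localhost:7070/ here
} finally {
    server.stop(); // stops and joins the underlying server thread
}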
use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.
the class CrawlerWebServer method createDocRoot.
protected static File createDocRoot(final int count) {
    try {
        // create a unique temporary directory to serve as the document root
        final File tempDir = File.createTempFile("crawlerDocRoot", "");
        tempDir.delete();
        tempDir.mkdirs();
        // robots.txt
        StringBuilder buf = new StringBuilder();
        buf.append("User-agent: *").append('\n');
        buf.append("Disallow: /admin/").append('\n');
        buf.append("Disallow: /websvn/").append('\n');
        final File robotTxtFile = new File(tempDir, "robots.txt");
        FileUtil.writeBytes(robotTxtFile.getAbsolutePath(), buf.toString().getBytes("UTF-8"));
        robotTxtFile.deleteOnExit();
        // sitemaps.xml (one <loc> per <url>, as the sitemap protocol requires)
        buf = new StringBuilder();
        buf.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append('\n');
        buf.append("<urlset ").append("xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append('\n');
        buf.append("<url>").append('\n');
        buf.append("<loc>http://localhost:7070/index.html</loc>").append('\n');
        buf.append("</url>").append('\n');
        buf.append("<url>").append('\n');
        buf.append("<loc>http://localhost:7070/file").append(count).append("-1.html").append("</loc>").append('\n');
        buf.append("</url>").append('\n');
        buf.append("</urlset>").append('\n');
        File sitemapsFile = new File(tempDir, "sitemaps.xml");
        FileUtil.writeBytes(sitemapsFile.getAbsolutePath(), buf.toString().getBytes("UTF-8"));
        sitemapsFile.deleteOnExit();
        // sitemaps.txt
        buf = new StringBuilder();
        buf.append("http://localhost:7070/index.html").append('\n');
        buf.append("http://localhost:7070/file").append(count).append("-1.html").append('\n');
        sitemapsFile = new File(tempDir, "sitemaps.txt");
        FileUtil.writeBytes(sitemapsFile.getAbsolutePath(), buf.toString().getBytes("UTF-8"));
        sitemapsFile.deleteOnExit();
        generateContents(tempDir, count);
        return tempDir;
    } catch (final Exception e) {
        throw new CrawlerSystemException(e);
    }
}
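generateContents(tempDir, count) is referenced but not shown; a hypothetical sketch of such a helper, using the fileN-1.html naming the sitemaps above point at (the project's real implementation may differ):

// Hypothetical helper: writes count simple, crawlable HTML pages into the doc root.
protected static void generateContents(final File dir, final int count) throws Exception {
    for (int i = 1; i <= count; i++) {
        final File file = new File(dir, "file" + count + "-" + i + ".html");
        final String html = "<html><body><a href=\"index.html\">file" + count + "-" + i + "</a></body></html>";
        FileUtil.writeBytes(file.getAbsolutePath(), html.getBytes("UTF-8"));
        file.deleteOnExit();
    }
}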
use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess by codelibs.
the class DocumentHelper method processRequest.
public Map<String, Object> processRequest(final CrawlingConfig crawlingConfig, final String crawlingInfoId, final String url) {
    if (StringUtil.isBlank(crawlingInfoId)) {
        throw new CrawlingAccessException("sessionId is blank.");
    }
    final CrawlerClientFactory crawlerClientFactory = crawlingConfig.initializeClientFactory(ComponentUtil::getCrawlerClientFactory);
    final CrawlerClient client = crawlerClientFactory.getClient(url);
    if (client == null) {
        throw new CrawlingAccessException("CrawlerClient is null for " + url);
    }
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            // hand the redirect target back to the crawler as a child URL
            final Set<RequestData> childUrlList = new HashSet<>();
            childUrlList.add(RequestDataBuilder.newRequestData().get().url(responseData.getRedirectLocation()).build());
            throw new ChildUrlsException(childUrlList, this.getClass().getName() + "#RedirectedFrom:" + url);
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        responseData.setSessionId(crawlingInfoId);
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            throw new CrawlingAccessException("No url rule for " + url);
        }
        responseData.setRuleId(rule.getRuleId());
        final ResponseProcessor responseProcessor = rule.getResponseProcessor();
        if (!(responseProcessor instanceof DefaultResponseProcessor)) {
            throw new CrawlingAccessException("The response processor is not DefaultResponseProcessor. responseProcessor: "
                    + responseProcessor + ", url: " + url);
        }
        final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
        final ResultData resultData = transformer.transform(responseData);
        final byte[] data = resultData.getData();
        if (data != null) {
            try {
                @SuppressWarnings("unchecked")
                final Map<String, Object> map = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                return map;
            } catch (final Exception e) {
                throw new CrawlerSystemException("Could not create an instance from bytes.", e);
            }
        }
        return null;
    } catch (final Exception e) {
        throw new CrawlingAccessException("Failed to parse " + url, e);
    }
}
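A hedged caller sketch; the component lookup and the variable wiring are assumptions for illustration:

// Hypothetical caller: crawlingConfig and sessionId come from the surrounding crawl job.
final DocumentHelper documentHelper = ComponentUtil.getComponent(DocumentHelper.class);
final Map<String, Object> doc = documentHelper.processRequest(crawlingConfig, sessionId, "http://localhost:7070/index.html");
if (doc != null) {
    logger.info("Parsed fields: {}", doc.keySet());
}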
use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess by codelibs.
the class CommandChain method executeCommand.
protected int executeCommand(final String[] commands, final String username, final String password) {
    if (commands == null || commands.length == 0) {
        throw new CommandExecutionException("command is empty.");
    }
    if (logger.isInfoEnabled()) {
        logger.info("Command: {}", String.join(" ", commands));
    }
    // replace credential placeholders before starting the process
    final String[] cmds = stream(commands).get(stream -> stream.map(s -> {
        if ("$USERNAME".equals(s)) {
            return username;
        }
        if ("$PASSWORD".equals(s)) {
            return password;
        }
        return s;
    }).toArray(String[]::new));
    final ProcessBuilder pb = new ProcessBuilder(cmds);
    if (workingDirectory != null) {
        pb.directory(workingDirectory);
    }
    pb.redirectErrorStream(true);
    Process currentProcess = null;
    MonitorThread mt = null;
    try {
        currentProcess = pb.start();
        // watchdog that terminates the process when executionTimeout elapses
        mt = new MonitorThread(currentProcess, executionTimeout);
        mt.start();
        final InputStreamThread it = new InputStreamThread(currentProcess.getInputStream(), commandOutputEncoding, maxOutputLine);
        it.start();
        currentProcess.waitFor();
        it.join(5000);
        if (mt.isTeminated()) {
            throw new CommandExecutionException("The command execution is timeout: " + String.join(" ", commands));
        }
        final int exitValue = currentProcess.exitValue();
        if (logger.isInfoEnabled()) {
            logger.info("Exit Code: {} - Process Output:\n{}", exitValue, it.getOutput());
        }
        // 143 = SIGTERM; re-check in case the watchdog fired after the first check
        if (exitValue == 143 && mt.isTeminated()) {
            throw new CommandExecutionException("The command execution is timeout: " + String.join(" ", commands));
        }
        return exitValue;
    } catch (final CrawlerSystemException e) {
        throw e;
    } catch (final InterruptedException e) {
        if (mt != null && mt.isTeminated()) {
            throw new CommandExecutionException("The command execution is timeout: " + String.join(" ", commands), e);
        }
        throw new CommandExecutionException("Process terminated.", e);
    } catch (final Exception e) {
        throw new CommandExecutionException("Process terminated.", e);
    } finally {
        if (mt != null) {
            mt.setFinished(true);
            try {
                mt.interrupt();
            } catch (final Exception e) {
                // ignore
            }
        }
        if (currentProcess != null) {
            try {
                currentProcess.destroy();
            } catch (final Exception e) {
                // ignore
            }
        }
    }
}
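A hedged usage sketch showing the placeholder substitution; the script path and credential values are hypothetical:

// Hypothetical invocation: $USERNAME/$PASSWORD tokens are swapped for real credentials.
final String[] commands = { "/opt/fess/bin/fetch.sh", "--user", "$USERNAME", "--pass", "$PASSWORD" };
final int exitCode = executeCommand(commands, "crawler", "secret");
if (exitCode != 0) {
    throw new CommandExecutionException("fetch.sh exited with " + exitCode);
}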