Search in sources :

Example 36 with CrawlerSystemException

use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.

the class XpathTransformerTest method test_getData_wrongName.

/**
 * Expects {@code xpathTransformer.getData} to throw a
 * {@link CrawlerSystemException} for access result data whose transformer
 * name is set to {@code "transformer"}.
 *
 * NOTE(review): presumably "transformer" is not the name the transformer
 * under test is registered with — confirm against the fixture setup.
 */
public void test_getData_wrongName() throws Exception {
    final String xml = "<?xml version=\"1.0\"?>\n" + "<doc>\n" + "<field name=\"title\">タイトル</field>\n"
            + "<field name=\"body\">第一章 第一節 ほげほげふがふが LINK 第2章 第2節</field>\n" + "</doc>";

    final AccessResultDataImpl wrongNameData = new AccessResultDataImpl();
    wrongNameData.setData(xml.getBytes(Constants.UTF_8));
    wrongNameData.setEncoding(Constants.UTF_8);
    wrongNameData.setTransformerName("transformer");

    try {
        xpathTransformer.getData(wrongNameData);
    } catch (final CrawlerSystemException expected) {
        // The exception is the expected outcome; nothing further to verify.
        return;
    }
    // Reaching this point means getData returned normally.
    fail();
}
Also used : CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) AccessResultDataImpl(org.codelibs.fess.crawler.entity.AccessResultDataImpl)

Example 37 with CrawlerSystemException

use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.

the class CrawlerWebServer method stop.

/**
 * Shuts the server down by calling {@code server.stop()} followed by
 * {@code server.join()}, then deletes the document root when it was marked
 * as temporary.
 *
 * The clean-up runs whether or not the shutdown succeeded.
 *
 * NOTE(review): if docRoot is a java.io.File pointing at a directory that
 * still contains files, delete() returns false and leaves it in place —
 * confirm whether that is intended.
 *
 * @throws CrawlerSystemException wrapping any exception raised while
 *         stopping or joining the server
 */
public void stop() {
    try {
        try {
            server.stop();
            server.join();
        } catch (final Exception cause) {
            throw new CrawlerSystemException(cause);
        }
    } finally {
        if (tempDocRoot) {
            docRoot.delete();
        }
    }
}
Also used : CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException)

Example 38 with CrawlerSystemException

use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.

the class CrawlerWebServer method createDocRoot.

/**
 * Creates a temporary directory to serve as a document root and fills it
 * with crawler fixtures.
 *
 * Three files are written directly into the directory, each encoded as
 * UTF-8 and registered for deletion when the JVM exits: {@code robots.txt},
 * {@code sitemaps.xml} and {@code sitemaps.txt}. Afterwards
 * {@code generateContents(tempDir, count)} is invoked for the remaining
 * content.
 *
 * @param count number embedded in the generated {@code file<count>-1.html}
 *        sitemap URLs and passed on to {@code generateContents}
 * @return the newly created directory
 * @throws CrawlerSystemException wrapping any exception raised while
 *         creating the directory or writing its files
 */
protected static File createDocRoot(final int count) {
    try {
        // Reserve a unique name via a temp file, then replace that file with a directory.
        final File tempDir = File.createTempFile("crawlerDocRoot", "");
        tempDir.delete();
        tempDir.mkdirs();

        // robots.txt
        final StringBuilder robotsTxt = new StringBuilder();
        robotsTxt.append("User-agent: *").append('\n');
        robotsTxt.append("Disallow: /admin/").append('\n');
        robotsTxt.append("Disallow: /websvn/").append('\n');
        final File robotTxtFile = new File(tempDir, "robots.txt");
        FileUtil.writeBytes(robotTxtFile.getAbsolutePath(), robotsTxt.toString().getBytes("UTF-8"));
        robotTxtFile.deleteOnExit();

        // sitemaps.xml
        final StringBuilder sitemapsXml = new StringBuilder();
        sitemapsXml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append('\n');
        sitemapsXml.append("<urlset ").append("xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append('\n');
        sitemapsXml.append("<url>").append('\n');
        sitemapsXml.append("<loc>http://localhost:7070/index.html</loc>").append('\n');
        sitemapsXml.append("<loc>http://localhost:7070/file").append(count).append("-1.html").append("</loc>").append('\n');
        sitemapsXml.append("</url>").append('\n');
        sitemapsXml.append("</urlset>").append('\n');
        final File sitemapsXmlFile = new File(tempDir, "sitemaps.xml");
        FileUtil.writeBytes(sitemapsXmlFile.getAbsolutePath(), sitemapsXml.toString().getBytes("UTF-8"));
        // Register the sitemap file itself; previously robots.txt was re-registered here by mistake.
        sitemapsXmlFile.deleteOnExit();

        // sitemaps.txt
        final StringBuilder sitemapsTxt = new StringBuilder();
        sitemapsTxt.append("http://localhost:7070/index.html").append('\n');
        sitemapsTxt.append("http://localhost:7070/file").append(count).append("-1.html").append('\n');
        final File sitemapsTxtFile = new File(tempDir, "sitemaps.txt");
        FileUtil.writeBytes(sitemapsTxtFile.getAbsolutePath(), sitemapsTxt.toString().getBytes("UTF-8"));
        // Same fix as above: schedule sitemaps.txt, not robots.txt, for deletion.
        sitemapsTxtFile.deleteOnExit();

        generateContents(tempDir, count);
        return tempDir;
    } catch (final Exception e) {
        throw new CrawlerSystemException(e);
    }
}
Also used : CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) File(java.io.File) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException)

Example 39 with CrawlerSystemException

use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess by codelibs.

the class DocumentHelper method processRequest.

/**
 * Fetches {@code url} with the crawler client selected for it, runs the
 * matching rule's transformer over the response and returns the
 * deserialized transformation result.
 *
 * Every exception raised once the request has started — including the
 * {@code ChildUrlsException} thrown for a redirect and the
 * {@code CrawlerSystemException} thrown when deserialization fails — is
 * caught by the outer handler and rethrown wrapped in a
 * {@code CrawlingAccessException} ("Failed to parse ...").
 *
 * @param crawlingConfig configuration used to initialize the client factory
 * @param crawlingInfoId session id stored on the response; must not be blank
 * @param url the URL to fetch
 * @return the map deserialized from the transformer's result bytes, or
 *         {@code null} when the transformer produced no data
 * @throws CrawlingAccessException if {@code crawlingInfoId} is blank, no
 *         client is available for {@code url}, or fetching/processing fails
 */
public Map<String, Object> processRequest(final CrawlingConfig crawlingConfig, final String crawlingInfoId, final String url) {
    if (StringUtil.isBlank(crawlingInfoId)) {
        throw new CrawlingAccessException("sessionId is null.");
    }

    final CrawlerClientFactory clientFactory = crawlingConfig.initializeClientFactory(ComponentUtil::getCrawlerClientFactory);
    final CrawlerClient client = clientFactory.getClient(url);
    if (client == null) {
        throw new CrawlingAccessException("CrawlerClient is null for " + url);
    }

    final long startTime = System.currentTimeMillis();
    // The request is built inside the resource header so that a failure there is wrapped by the catch below.
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            // Report the redirect target as a child URL instead of following it here.
            final Set<RequestData> redirectTargets = new HashSet<>();
            redirectTargets.add(RequestDataBuilder.newRequestData().get().url(responseData.getRedirectLocation()).build());
            throw new ChildUrlsException(redirectTargets, this.getClass().getName() + "#RedirectedFrom:" + url);
        }

        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        responseData.setSessionId(crawlingInfoId);

        final Rule rule = SingletonLaContainer.getComponent(RuleManager.class).getRule(responseData);
        if (rule == null) {
            throw new CrawlingAccessException("No url rule for " + url);
        }
        responseData.setRuleId(rule.getRuleId());

        final ResponseProcessor responseProcessor = rule.getResponseProcessor();
        if (!(responseProcessor instanceof DefaultResponseProcessor)) {
            throw new CrawlingAccessException(
                    "The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor + ", url: " + url);
        }

        final DefaultResponseProcessor defaultProcessor = (DefaultResponseProcessor) responseProcessor;
        final Transformer transformer = defaultProcessor.getTransformer();
        final ResultData resultData = transformer.transform(responseData);
        final byte[] serialized = resultData.getData();
        if (serialized == null) {
            return null;
        }

        try {
            return (Map<String, Object>) SerializeUtil.fromBinaryToObject(serialized);
        } catch (final Exception e) {
            throw new CrawlerSystemException("Could not create an instance from bytes.", e);
        }
    } catch (final Exception e) {
        throw new CrawlingAccessException("Failed to parse " + url, e);
    }
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ComponentNotFoundException(org.lastaflute.di.core.exception.ComponentNotFoundException) IOException(java.io.IOException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) ResultData(org.codelibs.fess.crawler.entity.ResultData) RequestData(org.codelibs.fess.crawler.entity.RequestData) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) Rule(org.codelibs.fess.crawler.rule.Rule) Map(java.util.Map) HashSet(java.util.HashSet)

Example 40 with CrawlerSystemException

use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess by codelibs.

the class CommandChain method executeCommand.

/**
 * Runs an external command and returns its exit code.
 *
 * Arguments that are exactly {@code "$USERNAME"} or {@code "$PASSWORD"} are
 * replaced with the supplied credentials before the process is started; the
 * info log shows the original, unsubstituted arguments. Standard error is
 * merged into standard output, which is consumed by an
 * {@code InputStreamThread} while a {@code MonitorThread} watches the
 * process against {@code executionTimeout}.
 *
 * @param commands the command and its arguments; must not be null or empty
 * @param username replacement for {@code "$USERNAME"} arguments
 * @param password replacement for {@code "$PASSWORD"} arguments
 * @return the exit value of the process
 * @throws CommandExecutionException if {@code commands} is empty, the
 *         monitor reports the process as terminated (timeout), the calling
 *         thread is interrupted, or the process cannot be run
 */
protected int executeCommand(final String[] commands, final String username, final String password) {
    if (commands == null || commands.length == 0) {
        throw new CommandExecutionException("command is empty.");
    }
    if (logger.isInfoEnabled()) {
        logger.info("Command: {}", String.join(" ", commands));
    }
    final String[] cmds = stream(commands).get(stream -> stream.map(s -> {
        if ("$USERNAME".equals(s)) {
            return username;
        }
        if ("$PASSWORD".equals(s)) {
            return password;
        }
        return s;
    }).toArray(n -> new String[n]));
    final ProcessBuilder pb = new ProcessBuilder(cmds);
    if (workingDirectory != null) {
        pb.directory(workingDirectory);
    }
    // Merge stderr into stdout so a single reader thread sees all output.
    pb.redirectErrorStream(true);
    Process currentProcess = null;
    MonitorThread mt = null;
    try {
        currentProcess = pb.start();
        // monitoring
        mt = new MonitorThread(currentProcess, executionTimeout);
        mt.start();
        final InputStreamThread it = new InputStreamThread(currentProcess.getInputStream(), commandOutputEncoding, maxOutputLine);
        it.start();
        currentProcess.waitFor();
        // Give the output reader up to 5 seconds to drain after the process exits.
        it.join(5000);
        if (mt.isTeminated()) {
            throw new CommandExecutionException("The command execution is timeout: " + String.join(" ", commands));
        }
        final int exitValue = currentProcess.exitValue();
        if (logger.isInfoEnabled()) {
            logger.info("Exit Code: {} - Process Output:\n{}", exitValue, it.getOutput());
        }
        // Re-check in case the monitor flagged termination after the first check.
        // NOTE(review): 143 is conventionally 128 + SIGTERM(15) — confirm this is the intent.
        if (exitValue == 143 && mt.isTeminated()) {
            throw new CommandExecutionException("The command execution is timeout: " + String.join(" ", commands));
        }
        return exitValue;
    } catch (final CrawlerSystemException e) {
        // NOTE(review): the timeout exceptions above pass through here unchanged only if
        // CommandExecutionException is a CrawlerSystemException subtype; otherwise they are
        // re-wrapped as "Process terminated." by the generic handler below — confirm.
        throw e;
    } catch (final InterruptedException e) {
        // Restore the interrupt status so callers further up the stack can still observe it.
        Thread.currentThread().interrupt();
        if (mt != null && mt.isTeminated()) {
            throw new CommandExecutionException("The command execution is timeout: " + String.join(" ", commands), e);
        }
        throw new CommandExecutionException("Process terminated.", e);
    } catch (final Exception e) {
        throw new CommandExecutionException("Process terminated.", e);
    } finally {
        if (mt != null) {
            mt.setFinished(true);
            try {
                mt.interrupt();
            } catch (final Exception ignored) {
                // best-effort clean-up; a failure here must not mask the primary outcome
            }
        }
        if (currentProcess != null) {
            try {
                currentProcess.destroy();
            } catch (final Exception ignored) {
                // best-effort clean-up; a failure here must not mask the primary outcome
            }
        }
    }
}
Also used : ThreadUtil(org.codelibs.core.lang.ThreadUtil) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) StringUtil(org.codelibs.core.lang.StringUtil) IOException(java.io.IOException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) User(org.codelibs.fess.es.user.exentity.User) InputStreamReader(java.io.InputStreamReader) File(java.io.File) Constants(org.codelibs.fess.crawler.Constants) List(java.util.List) Logger(org.apache.logging.log4j.Logger) Charset(java.nio.charset.Charset) CommandExecutionException(org.codelibs.fess.exception.CommandExecutionException) BufferedReader(java.io.BufferedReader) LinkedList(java.util.LinkedList) UnsupportedEncodingException(java.io.UnsupportedEncodingException) LogManager(org.apache.logging.log4j.LogManager) InputStream(java.io.InputStream) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) CommandExecutionException(org.codelibs.fess.exception.CommandExecutionException) IOException(java.io.IOException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) CommandExecutionException(org.codelibs.fess.exception.CommandExecutionException) UnsupportedEncodingException(java.io.UnsupportedEncodingException)

Aggregations

CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException): 41, IOException (java.io.IOException): 16, CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException): 13, File (java.io.File): 11, InputStream (java.io.InputStream): 11, UnsupportedEncodingException (java.io.UnsupportedEncodingException): 10, BufferedInputStream (java.io.BufferedInputStream): 9, ExtractException (org.codelibs.fess.crawler.exception.ExtractException): 9, ExtractData (org.codelibs.fess.crawler.entity.ExtractData): 8, ResponseData (org.codelibs.fess.crawler.entity.ResponseData): 8, Map (java.util.Map): 7, MaxLengthExceededException (org.codelibs.fess.crawler.exception.MaxLengthExceededException): 7, MalformedURLException (java.net.MalformedURLException): 6, HashMap (java.util.HashMap): 6, AccessResultDataImpl (org.codelibs.fess.crawler.entity.AccessResultDataImpl): 6, RequestData (org.codelibs.fess.crawler.entity.RequestData): 6, ResultData (org.codelibs.fess.crawler.entity.ResultData): 6, ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException): 6, HashSet (java.util.HashSet): 5, TransformerException (javax.xml.transform.TransformerException): 5