Search in sources :

Example 16 with ChildUrlsException

use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.

the class SmbClient method doHead.

/**
 * Performs a HEAD-style request for the given URL.
 *
 * <p>The request is delegated to {@code processRequest(url, false)} and the
 * resulting response is tagged with {@code Constants.HEAD_METHOD}.
 *
 * @param url the URL to request
 * @return the response data, or {@code null} when a {@link ChildUrlsException}
 *         is raised while handling the request
 * @see org.codelibs.fess.crawler.client.CrawlerClient#doHead(java.lang.String)
 */
@Override
public ResponseData doHead(final String url) {
    final ResponseData headResponse;
    try {
        headResponse = processRequest(url, false);
        headResponse.setMethod(Constants.HEAD_METHOD);
    } catch (final ChildUrlsException ignored) {
        // The child URL list is deliberately discarded here; callers only get null.
        return null;
    }
    return headResponse;
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) ResponseData(org.codelibs.fess.crawler.entity.ResponseData)

Example 17 with ChildUrlsException

use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.

the class SmbClient method getResponseData.

/**
 * Builds the {@link ResponseData} describing the SMB resource behind {@code uri}.
 *
 * <p>Outcome by resource kind:
 * <ul>
 * <li>URL that cannot be parsed, or a resource that is neither file nor directory:
 *     a response with {@code Constants.NOT_FOUND_STATUS_CODE} and content length 0.</li>
 * <li>Regular file: length, charset, timestamps, owner, ACL and header metadata are
 *     recorded; if the file is readable the MIME type is detected and, when
 *     {@code includeContent} is set, the body is loaded (in memory below
 *     {@code maxCachedContentSize}, otherwise via a temporary file). An unreadable
 *     file yields {@code Constants.FORBIDDEN_STATUS_CODE}.</li>
 * <li>Directory: a {@link ChildUrlsException} is thrown; it carries the directory
 *     entries when {@code includeContent} is set and an empty set otherwise.</li>
 * </ul>
 *
 * @param uri the URI of the SMB resource
 * @param includeContent whether the file body (or directory listing) should be fetched
 * @return the populated response data
 * @throws ChildUrlsException when the resource is a directory
 * @throws MaxLengthExceededException when the content length exceeds the limit
 *         configured in {@code contentLengthHelper} for the detected MIME type
 * @throws CrawlingAccessException when the SMB layer reports an {@link SmbException}
 */
protected ResponseData getResponseData(final String uri, final boolean includeContent) {
    final ResponseData responseData = new ResponseData();
    responseData.setMethod(Constants.GET_METHOD);
    final String filePath = preprocessUri(uri);
    responseData.setUrl(filePath);
    SmbFile file = null;
    final SmbAuthentication smbAuthentication = smbAuthenticationHolder.get(filePath);
    if (logger.isDebugEnabled()) {
        logger.debug("Creating SmbFile: " + filePath);
    }
    try {
        if (smbAuthentication == null) {
            file = new SmbFile(filePath);
        } else {
            file = new SmbFile(filePath, smbAuthentication.getAuthentication());
        }
    } catch (final MalformedURLException e) {
        // Best effort: leave file null so the request is answered as "not found" below.
        logger.warn("Could not parse url: " + filePath, e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Processing SmbFile: " + filePath);
    }
    try {
        if (file == null) {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        } else if (file.isFile()) {
            if (logger.isDebugEnabled()) {
                logger.debug("Checking SmbFile Size: " + filePath);
            }
            // Read the length once as a long; it is reused for the caching decision below.
            final long contentLength = file.length();
            responseData.setContentLength(contentLength);
            checkMaxContentLength(responseData);
            responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
            responseData.setCharSet(geCharSet(file));
            responseData.setLastModified(new Date(file.lastModified()));
            responseData.addMetaData(SMB_CREATE_TIME, new Date(file.createTime()));
            try {
                if (logger.isDebugEnabled()) {
                    logger.debug("Parsing SmbFile Owner: " + filePath);
                }
                final SID ownerUser = file.getOwnerUser();
                if (ownerUser != null) {
                    final String[] ownerAttributes = { ownerUser.getAccountName(), ownerUser.getDomainName() };
                    responseData.addMetaData(SMB_OWNER_ATTRIBUTES, ownerAttributes);
                }
            } catch (final IOException e) {
                // Owner information is optional; keep going but do not lose the cause.
                logger.warn("Cannot get owner of the file: " + filePath, e);
            }
            if (logger.isDebugEnabled()) {
                logger.debug("Parsing SmbFile ACL: " + filePath);
            }
            processAccessControlEntries(responseData, file);
            final Map<String, List<String>> headerFieldMap = file.getHeaderFields();
            if (headerFieldMap != null) {
                for (final Map.Entry<String, List<String>> entry : headerFieldMap.entrySet()) {
                    responseData.addMetaData(entry.getKey(), entry.getValue());
                }
            }
            if (file.canRead()) {
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                if (includeContent) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Parsing SmbFile Content: " + filePath);
                    }
                    // Compare the long length read above rather than the int-typed
                    // URLConnection-style getContentLength(), which cannot represent
                    // files of 2 GiB or more and could route them into memory.
                    if (contentLength < maxCachedContentSize) {
                        try (InputStream contentStream = new BufferedInputStream(new SmbFileInputStream(file))) {
                            responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                        } catch (final Exception e) {
                            logger.warn("I/O Exception.", e);
                            responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
                        }
                    } else {
                        File outputFile = null;
                        try {
                            outputFile = File.createTempFile("crawler-SmbClient-", ".out");
                            copy(file, outputFile);
                            responseData.setResponseBody(outputFile, true);
                        } catch (final Exception e) {
                            logger.warn("I/O Exception.", e);
                            responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
                            // The temp file was never handed to responseData, so remove it here.
                            if (outputFile != null && !outputFile.delete()) {
                                logger.warn("Could not delete " + outputFile.getAbsolutePath());
                            }
                        }
                    }
                    if (logger.isDebugEnabled()) {
                        logger.debug("Parsing SmbFile MIME Type: " + filePath);
                    }
                    try (final InputStream is = responseData.getResponseBody()) {
                        responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                    } catch (final Exception e) {
                        // Fall back to name-based detection when the body cannot be read.
                        responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                    }
                } else {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Parsing SmbFile MIME Type: " + filePath);
                    }
                    try (final InputStream is = new SmbFileInputStream(file)) {
                        responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                    } catch (final Exception e) {
                        // Fall back to name-based detection when the file cannot be opened.
                        responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                    }
                }
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength() + " byte) is over " + maxLength + " byte. The url is " + filePath);
                    }
                }
            } else {
                // Forbidden
                responseData.setHttpStatusCode(Constants.FORBIDDEN_STATUS_CODE);
                responseData.setMimeType(APPLICATION_OCTET_STREAM);
            }
        } else if (file.isDirectory()) {
            if (logger.isDebugEnabled()) {
                logger.debug("Parsing SmbFile Directory: " + filePath);
            }
            final Set<RequestData> requestDataSet = new HashSet<>(100);
            if (includeContent) {
                final SmbFile[] files = file.listFiles();
                if (files != null) {
                    for (final SmbFile f : files) {
                        final String childUri = f.toString();
                        requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                    }
                }
            }
            // Directories are reported to the caller as a set of child requests.
            throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
        } else {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        }
    } catch (final CrawlerSystemException e) {
        // Release anything responseData already holds before propagating.
        CloseableUtil.closeQuietly(responseData);
        throw e;
    } catch (final SmbException e) {
        CloseableUtil.closeQuietly(responseData);
        throw new CrawlingAccessException("Could not access " + uri, e);
    }
    return responseData;
}
Also used : MalformedURLException(java.net.MalformedURLException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper) SmbException(jcifs.smb.SmbException) BufferedInputStream(java.io.BufferedInputStream) RequestData(org.codelibs.fess.crawler.entity.RequestData) HashSet(java.util.HashSet) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) BufferedInputStream(java.io.BufferedInputStream) SmbFileInputStream(jcifs.smb.SmbFileInputStream) InputStream(java.io.InputStream) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) IOException(java.io.IOException) Date(java.util.Date) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) IORuntimeException(org.codelibs.core.exception.IORuntimeException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) SmbException(jcifs.smb.SmbException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) SmbFile(jcifs.smb.SmbFile) SID(jcifs.smb.SID) SmbFileInputStream(jcifs.smb.SmbFileInputStream) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) Map(java.util.Map) File(java.io.File) SmbFile(jcifs.smb.SmbFile)

Example 18 with ChildUrlsException

use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.

the class SitemapsResponseProcessor method process.

/**
 * Parses the response body as a sitemap and reports every listed location as a
 * child request.
 *
 * <p>This method does not return normally: after parsing it always throws a
 * {@link ChildUrlsException} carrying the collected requests (in sitemap order).
 *
 * @param responseData the response whose body contains the sitemap
 * @throws ChildUrlsException always, once the sitemap has been parsed
 * @throws IORuntimeException when an {@link IOException} occurs while handling the body
 */
@Override
public void process(final ResponseData responseData) {
    final SitemapsHelper helper = crawlerContainer.getComponent("sitemapsHelper");
    try (final InputStream body = responseData.getResponseBody()) {
        final SitemapSet parsedSitemaps = helper.parse(body);
        final Set<RequestData> childRequests = new LinkedHashSet<>();
        for (final Sitemap entry : parsedSitemaps.getSitemaps()) {
            if (entry == null) {
                continue;
            }
            final RequestData childRequest = RequestDataBuilder.newRequestData().get().url(entry.getLoc()).build();
            childRequests.add(childRequest);
        }
        // Thrown inside the try block so the body stream is closed before it propagates.
        final String origin = this.getClass().getName() + "#process";
        throw new ChildUrlsException(childRequests, origin);
    } catch (final IOException ioe) {
        throw new IORuntimeException(ioe);
    }
}
Also used : LinkedHashSet(java.util.LinkedHashSet) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Sitemap(org.codelibs.fess.crawler.entity.Sitemap) IORuntimeException(org.codelibs.core.exception.IORuntimeException) InputStream(java.io.InputStream) RequestData(org.codelibs.fess.crawler.entity.RequestData) SitemapSet(org.codelibs.fess.crawler.entity.SitemapSet) IOException(java.io.IOException) SitemapsHelper(org.codelibs.fess.crawler.helper.SitemapsHelper)

Example 19 with ChildUrlsException

use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.

the class FileSystemClientTest method test_doGet_dir.

/**
 * Fetching a directory through the file-system client must raise a
 * {@link ChildUrlsException}; every reported child URL (ignoring ".svn" entries)
 * has to point at one of the known entries of the "test" resource directory.
 */
public void test_doGet_dir() {
    final String[] expectedFragments =
            { "test/dir1", "test/dir2", "test/text1.txt", "test/text2.txt", "test/text%203.txt" };
    final File testDir = ResourceUtil.getResourceAsFile("test");
    final String absolutePath = testDir.getAbsolutePath();
    // Paths without a leading slash (e.g. Windows drive paths) are normalised to URL form.
    final String path = absolutePath.startsWith("/") ? absolutePath : "/" + absolutePath.replace('\\', '/');
    try {
        fsClient.doGet("file://" + path);
        fail();
    } catch (final ChildUrlsException e) {
        for (final RequestData child : e.getChildUrlList()) {
            final String childUrl = child.getUrl();
            if (childUrl.contains(".svn")) {
                continue;
            }
            boolean known = false;
            for (final String fragment : expectedFragments) {
                if (childUrl.contains(fragment)) {
                    known = true;
                    break;
                }
            }
            assertTrue(known);
        }
    }
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Set(java.util.Set) RequestData(org.codelibs.fess.crawler.entity.RequestData) File(java.io.File)

Example 20 with ChildUrlsException

use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.

the class FtpClientTest method test_doGet_root_dir.

/**
 * Fetching the FTP root directory must raise a {@link ChildUrlsException} that
 * lists exactly five children, each matching one of the expected entry names.
 * The embedded FTP server is stopped in all cases.
 */
public void test_doGet_root_dir() throws FtpException {
    final String username = "testuser";
    final String password = "testpass";
    final String[] expectedNames = { "dir1", "dir2", "text1.txt", "text2.txt", "text 3.txt" };
    FtpServer server = null;
    try {
        server = startFtpServer(FTP_PORT, username, password);

        final FtpAuthentication credentials = new FtpAuthentication();
        credentials.setUsername(username);
        credentials.setPassword(password);
        final Map<String, Object> initParams = new HashMap<>();
        initParams.put(FtpClient.FTP_AUTHENTICATIONS_PROPERTY, new FtpAuthentication[] { credentials });
        ftpClient.setInitParameterMap(initParams);

        ftpClient.doGet("ftp://localhost:" + FTP_PORT + "/");
        fail();
    } catch (final ChildUrlsException e) {
        final Set<RequestData> children = e.getChildUrlList();
        assertEquals(5, children.size());
        for (final RequestData child : children) {
            final String childUrl = child.getUrl();
            boolean known = false;
            for (final String name : expectedNames) {
                if (childUrl.contains(name)) {
                    known = true;
                    break;
                }
            }
            assertTrue(known);
        }
    } finally {
        if (server != null) {
            server.stop();
        }
    }
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Set(java.util.Set) HashMap(java.util.HashMap) RequestData(org.codelibs.fess.crawler.entity.RequestData) FtpServer(org.apache.ftpserver.FtpServer)

Aggregations

ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException)24 ResponseData (org.codelibs.fess.crawler.entity.ResponseData)17 RequestData (org.codelibs.fess.crawler.entity.RequestData)11 ResultData (org.codelibs.fess.crawler.entity.ResultData)9 FessConfig (org.codelibs.fess.mylasta.direction.FessConfig)9 Set (java.util.Set)8 CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException)8 HashSet (java.util.HashSet)7 CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException)7 ConfigName (org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName)7 ComponentNotFoundException (org.lastaflute.di.core.exception.ComponentNotFoundException)7 Map (java.util.Map)6 CrawlerClient (org.codelibs.fess.crawler.client.CrawlerClient)5 BufferedInputStream (java.io.BufferedInputStream)4 IOException (java.io.IOException)4 MalformedURLException (java.net.MalformedURLException)4 HashMap (java.util.HashMap)4 RequestDataBuilder (org.codelibs.fess.crawler.builder.RequestDataBuilder)4 MaxLengthExceededException (org.codelibs.fess.crawler.exception.MaxLengthExceededException)4 Document (org.w3c.dom.Document)4