Search in sources :

Example 1 with MaxLengthExceededException

use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.

the class AbstractCrawlerClientTest method test_checkMaxContentLength_1m.

/**
 * Exercises {@code checkMaxContentLength} with the limit set to 1,000,000.
 *
 * The check is expected to return normally when the content length is left
 * unset, is -1, or is at most the limit, and to throw
 * {@link MaxLengthExceededException} for every length above the limit.
 */
public void test_checkMaxContentLength_1m() {
    final AbstractCrawlerClient crawlerClient = new AbstractCrawlerClient() {
    };
    final ResponseData data = new ResponseData();
    data.setUrl("http://test.com/");
    crawlerClient.setMaxContentLength(1000000L);

    // content length not set explicitly yet
    crawlerClient.checkMaxContentLength(data);

    // negative length must be accepted
    data.setContentLength(-1);
    crawlerClient.checkMaxContentLength(data);

    // lengths up to and including the limit must be accepted
    for (final long accepted : new long[] { 1000L, 1000000L }) {
        data.setContentLength(accepted);
        crawlerClient.checkMaxContentLength(data);
    }

    // anything above the limit must be rejected
    for (final long rejected : new long[] { 1000001L, 1000000000L, 1000000000000L }) {
        data.setContentLength(rejected);
        try {
            crawlerClient.checkMaxContentLength(data);
            fail();
        } catch (final MaxLengthExceededException e) {
            // ok
        }
    }
}
Also used : MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) ResponseData(org.codelibs.fess.crawler.entity.ResponseData)

Example 2 with MaxLengthExceededException

use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.

the class TarExtractorTest method test_getText_maxSize.

/**
 * Verifies that extracting text from the sample tar archive throws
 * {@link MaxLengthExceededException} once the extractor's maximum content
 * size has been lowered to 100.
 *
 * The maximum content size is put back to -1 in a {@code finally} clause so
 * that the extractor is reset even when the expected exception is not thrown
 * ({@code fail()} aborts the test) or an unexpected exception escapes.
 *
 * @throws IOException if closing the test resource stream fails
 */
public void test_getText_maxSize() throws IOException {
    try (final InputStream in = ResourceUtil.getResourceAsStream("extractor/tar/test.tar")) {
        tarExtractor.setMaxContentSize(100);
        tarExtractor.getText(in, null);
        fail();
    } catch (final MaxLengthExceededException e) {
        // pass
    } finally {
        // always undo the lowered limit, whatever the outcome above
        tarExtractor.setMaxContentSize(-1);
    }
}
Also used : MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) InputStream(java.io.InputStream)

Example 3 with MaxLengthExceededException

use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.

the class ZipExtractorTest method test_getText_maxSize.

/**
 * Verifies that extracting text from the sample zip archive throws
 * {@link MaxLengthExceededException} once the extractor's maximum content
 * size has been lowered to 100.
 *
 * The maximum content size is put back to -1 in a {@code finally} clause so
 * that the extractor is reset even when the expected exception is not thrown
 * ({@code fail()} aborts the test) or an unexpected exception escapes.
 *
 * @throws IOException if closing the test resource stream fails
 */
public void test_getText_maxSize() throws IOException {
    try (final InputStream in = ResourceUtil.getResourceAsStream("extractor/zip/test.zip")) {
        zipExtractor.setMaxContentSize(100);
        zipExtractor.getText(in, null);
        fail();
    } catch (final MaxLengthExceededException e) {
        // pass
    } finally {
        // always undo the lowered limit, whatever the outcome above
        zipExtractor.setMaxContentSize(-1);
    }
}
Also used : MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) InputStream(java.io.InputStream)

Example 4 with MaxLengthExceededException

use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.

the class FileSystemClient method getResponseData.

/**
 * Builds a {@link ResponseData} describing the file-system resource behind {@code uri}.
 *
 * Outcome by resource kind:
 * <ul>
 * <li>URI cannot be parsed, or the path is neither a file nor a directory:
 * NOT_FOUND status, the client's {@code charset}, content length 0.</li>
 * <li>Regular file: content length, owner/group metadata (best effort), OK status,
 * charset, last-modified time; if readable also the MIME type and, when
 * {@code includeContent} is set, the body. An unreadable file gets FORBIDDEN status
 * and the {@code APPLICATION_OCTET_STREAM} MIME type.</li>
 * <li>Directory: no response is returned; a {@link ChildUrlsException} carrying the
 * child URLs is thrown instead.</li>
 * </ul>
 *
 * @param uri the location to access; it is passed through {@code preprocessUri} first
 * @param includeContent for a file, whether to attach the body; for a directory,
 *        whether to enumerate its children (otherwise the child set is empty)
 * @return the populated response data
 * @throws CrawlerSystemException rethrown unchanged after closing the response data
 * @throws CrawlingAccessException wraps any other exception raised while accessing the resource
 */
protected ResponseData getResponseData(final String uri, final boolean includeContent) {
    final ResponseData responseData = new ResponseData();
    try {
        responseData.setMethod(Constants.GET_METHOD);
        final String filePath = preprocessUri(uri);
        responseData.setUrl(filePath);
        File file = null;
        try {
            file = new File(new URI(filePath));
        } catch (final URISyntaxException e) {
            // file stays null and is reported as NOT_FOUND below
            logger.warn("Could not parse url: " + filePath, e);
        }
        if (file == null) {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        } else if (file.isFile()) {
            // check file size
            responseData.setContentLength(file.length());
            checkMaxContentLength(responseData);
            // The three attribute-view lookups below are best effort: a failure is
            // logged and the remaining metadata is still collected.
            try {
                final FileOwnerAttributeView ownerAttrView = Files.getFileAttributeView(file.toPath(), FileOwnerAttributeView.class);
                if (ownerAttrView != null) {
                    UserPrincipal owner = ownerAttrView.getOwner();
                    if (owner != null) {
                        responseData.addMetaData(FS_FILE_USER, owner.getName());
                    }
                }
            } catch (Exception e) {
                logger.warn("Failed to parse FileOwnerAttributeView.", e);
            }
            try {
                final AclFileAttributeView aclView = Files.getFileAttributeView(file.toPath(), AclFileAttributeView.class);
                if (aclView != null) {
                    responseData.addMetaData(FILE_ATTRIBUTE_VIEW, aclView);
                    // group names are taken from the principal of every ACL entry
                    responseData.addMetaData(FS_FILE_GROUPS, aclView.getAcl().stream().map(acl -> acl.principal().getName()).toArray(n -> new String[n]));
                }
            } catch (Exception e) {
                logger.warn("Failed to parse AclFileAttributeView.", e);
            }
            try {
                final PosixFileAttributeView posixView = Files.getFileAttributeView(file.toPath(), PosixFileAttributeView.class);
                if (posixView != null) {
                    // NOTE(review): uses the same metadata keys as the ACL branch above;
                    // presumably the later write wins when both views exist - confirm
                    // against ResponseData.addMetaData.
                    responseData.addMetaData(FILE_ATTRIBUTE_VIEW, posixView);
                    responseData.addMetaData(FS_FILE_GROUPS, new String[] { posixView.readAttributes().group().getName() });
                }
            } catch (Exception e) {
                logger.warn("Failed to parse PosixFileAttributeView.", e);
            }
            responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
            responseData.setCharSet(geCharSet(file));
            responseData.setLastModified(new Date(file.lastModified()));
            if (file.canRead()) {
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                try (final InputStream is = new BufferedInputStream(new FileInputStream(file))) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                } catch (final Exception e) {
                    // content could not be read: detect the MIME type from the file name only
                    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                }
                // second size check, this time against the limit for the detected MIME type
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength() + " byte) is over " + maxLength + " byte. The url is " + filePath);
                    }
                }
                if (includeContent) {
                    // files below maxCachedContentSize are loaded into memory;
                    // larger ones are handed over as a File reference
                    if (file.length() < maxCachedContentSize) {
                        try (InputStream contentStream = new BufferedInputStream(new FileInputStream(file))) {
                            responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                        } catch (final Exception e) {
                            // a read failure is reported via the status code rather than thrown
                            logger.warn("I/O Exception.", e);
                            responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
                        }
                    } else {
                        responseData.setResponseBody(file, false);
                    }
                }
            } else {
                // Forbidden
                responseData.setHttpStatusCode(Constants.FORBIDDEN_STATUS_CODE);
                responseData.setMimeType(APPLICATION_OCTET_STREAM);
            }
        } else if (file.isDirectory()) {
            // a directory has no content of its own: its children are handed back
            // to the caller through ChildUrlsException
            final Set<RequestData> requestDataSet = new HashSet<>();
            if (includeContent) {
                final File[] files = file.listFiles();
                if (files != null) {
                    for (final File f : files) {
                        final String chileUri = f.toURI().toASCIIString();
                        requestDataSet.add(RequestDataBuilder.newRequestData().get().url(chileUri).build());
                    }
                }
            }
            throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
        } else {
            // neither a regular file nor a directory
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        }
    } catch (final CrawlerSystemException e) {
        // NOTE(review): MaxLengthExceededException and ChildUrlsException thrown above
        // presumably extend CrawlerSystemException and therefore propagate unchanged
        // through this branch - confirm; otherwise they are wrapped by the branch below.
        CloseableUtil.closeQuietly(responseData);
        throw e;
    } catch (final Exception e) {
        CloseableUtil.closeQuietly(responseData);
        throw new CrawlingAccessException("Could not access " + uri, e);
    }
    return responseData;
}
Also used : FileOwnerAttributeView(java.nio.file.attribute.FileOwnerAttributeView) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) BufferedInputStream(java.io.BufferedInputStream) Date(java.util.Date) URISyntaxException(java.net.URISyntaxException) PosixFileAttributeView(java.nio.file.attribute.PosixFileAttributeView) LoggerFactory(org.slf4j.LoggerFactory) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) AbstractCrawlerClient(org.codelibs.fess.crawler.client.AbstractCrawlerClient) HashSet(java.util.HashSet) UserPrincipal(java.nio.file.attribute.UserPrincipal) URI(java.net.URI) ContentLengthHelper(org.codelibs.fess.crawler.helper.ContentLengthHelper) MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper) InputStreamUtil(org.codelibs.core.io.InputStreamUtil) AclFileAttributeView(java.nio.file.attribute.AclFileAttributeView) Logger(org.slf4j.Logger) Files(java.nio.file.Files) Resource(javax.annotation.Resource) StringUtil(org.codelibs.core.lang.StringUtil) Set(java.util.Set) FileInputStream(java.io.FileInputStream) FileOwnerAttributeView(java.nio.file.attribute.FileOwnerAttributeView) CrawlerContainer(org.codelibs.fess.crawler.container.CrawlerContainer) File(java.io.File) CloseableUtil(org.codelibs.core.io.CloseableUtil) Constants(org.codelibs.fess.crawler.Constants) URLEncoder(java.net.URLEncoder) RequestData(org.codelibs.fess.crawler.entity.RequestData) AccessTimeoutTarget(org.codelibs.fess.crawler.client.AccessTimeoutTarget) TimeoutManager(org.codelibs.core.timer.TimeoutManager) TimeoutTask(org.codelibs.core.timer.TimeoutTask) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) 
InputStream(java.io.InputStream) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) AclFileAttributeView(java.nio.file.attribute.AclFileAttributeView) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) URISyntaxException(java.net.URISyntaxException) URI(java.net.URI) UserPrincipal(java.nio.file.attribute.UserPrincipal) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) URISyntaxException(java.net.URISyntaxException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Date(java.util.Date) FileInputStream(java.io.FileInputStream) PosixFileAttributeView(java.nio.file.attribute.PosixFileAttributeView) BufferedInputStream(java.io.BufferedInputStream) RequestData(org.codelibs.fess.crawler.entity.RequestData) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) File(java.io.File) HashSet(java.util.HashSet)

Example 5 with MaxLengthExceededException

use of org.codelibs.fess.crawler.exception.MaxLengthExceededException in project fess-crawler by codelibs.

the class HcHttpClient method processRobotsTxt.

/**
 * Downloads the robots.txt of the host serving {@code url} and registers its rules
 * with the current crawler context.
 *
 * Nothing happens when robots.txt handling is unavailable or disabled, when no
 * crawler context is available, or when the host's robots.txt URL has already been
 * recorded in the context. Otherwise the URL is recorded, fetched with the
 * configured request headers and, on a 200 response, parsed: sitemaps are added to
 * the context and the disallow/allow patterns of the directive matching
 * {@code userAgent} are added as exclude/include URL filters (each subject to its
 * own enable flag).
 *
 * @param url a URL on the host whose robots.txt should be processed
 * @throws CrawlerSystemException if {@code url} is blank, or rethrown unchanged
 *         (after aborting the request) when raised while processing the response
 * @throws CrawlingAccessException wraps any other exception raised while fetching
 *         or parsing robots.txt
 */
protected void processRobotsTxt(final String url) {
    if (StringUtil.isBlank(url)) {
        throw new CrawlerSystemException("url is null or empty.");
    }
    // robots.txt handling is unavailable or switched off
    if (robotsTxtHelper == null || !robotsTxtHelper.isEnabled()) {
        return;
    }
    final CrawlerContext context = CrawlingParameterUtil.getCrawlerContext();
    if (context == null) {
        // no crawler context available: wrong state, nothing to do
        return;
    }

    // the host part ends at the first '/' found after the "://" separator
    final int pathStart = url.indexOf('/', url.indexOf("://") + 3);
    final String hostUrl = pathStart >= 0 ? url.substring(0, pathStart) : url;
    final String robotTxtUrl = hostUrl + "/robots.txt";

    // each robots.txt is processed at most once per context
    if (context.getRobotsTxtUrlSet().contains(robotTxtUrl)) {
        if (logger.isDebugEnabled()) {
            logger.debug(robotTxtUrl + " is already visited.");
        }
        return;
    }
    if (logger.isInfoEnabled()) {
        logger.info("Checking URL: " + robotTxtUrl);
    }
    context.getRobotsTxtUrlSet().add(robotTxtUrl);

    final HttpGet httpGet = new HttpGet(robotTxtUrl);
    for (final Header requestHeader : requestHeaderList) {
        httpGet.addHeader(requestHeader);
    }

    HttpEntity httpEntity = null;
    try {
        final HttpResponse response = executeHttpClient(httpGet);
        httpEntity = response.getEntity();
        if (response.getStatusLine().getStatusCode() != 200) {
            // only a 200 response is parsed; the entity is still consumed in finally
            return;
        }

        // reject an oversized robots.txt before reading its body
        final Header lengthHeader = response.getFirstHeader("Content-Length");
        if (lengthHeader != null) {
            final long contentLength = Long.parseLong(lengthHeader.getValue());
            if (contentLengthHelper != null) {
                final long maxLength = contentLengthHelper.getMaxLength("text/plain");
                if (contentLength > maxLength) {
                    throw new MaxLengthExceededException("The content length (" + contentLength + " byte) is over " + maxLength + " byte. The url is " + robotTxtUrl);
                }
            }
        }

        if (httpEntity == null) {
            return;
        }
        final RobotsTxt robotsTxt = robotsTxtHelper.parse(httpEntity.getContent());
        if (robotsTxt == null) {
            return;
        }

        final String[] sitemaps = robotsTxt.getSitemaps();
        if (sitemaps.length > 0) {
            context.addSitemaps(sitemaps);
        }

        final RobotsTxt.Directive directive = robotsTxt.getMatchedDirective(userAgent);
        if (directive == null) {
            return;
        }
        if (useRobotsTxtDisallows) {
            for (final String disallowed : directive.getDisallows()) {
                if (StringUtil.isNotBlank(disallowed)) {
                    final String pattern = convertRobotsTxtPathPattern(disallowed);
                    context.getUrlFilter().addExclude(hostUrl + pattern);
                }
            }
        }
        if (useRobotsTxtAllows) {
            for (final String allowed : directive.getAllows()) {
                if (StringUtil.isNotBlank(allowed)) {
                    final String pattern = convertRobotsTxtPathPattern(allowed);
                    context.getUrlFilter().addInclude(hostUrl + pattern);
                }
            }
        }
    } catch (final CrawlerSystemException e) {
        httpGet.abort();
        throw e;
    } catch (final Exception e) {
        httpGet.abort();
        throw new CrawlingAccessException("Could not process " + robotTxtUrl + ". ", e);
    } finally {
        EntityUtils.consumeQuietly(httpEntity);
    }
}
Also used : HttpEntity(org.apache.http.HttpEntity) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) HttpGet(org.apache.http.client.methods.HttpGet) HttpResponse(org.apache.http.HttpResponse) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ParseException(java.text.ParseException) NoRouteToHostException(java.net.NoRouteToHostException) SocketException(java.net.SocketException) ConnectException(java.net.ConnectException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException) CrawlerContext(org.codelibs.fess.crawler.CrawlerContext) Header(org.apache.http.Header) BasicHeader(org.apache.http.message.BasicHeader) RobotsTxt(org.codelibs.fess.crawler.entity.RobotsTxt) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException)

Aggregations

MaxLengthExceededException (org.codelibs.fess.crawler.exception.MaxLengthExceededException): 12, InputStream (java.io.InputStream): 8, CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException): 8, File (java.io.File): 5, IOException (java.io.IOException): 5, CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException): 5, MimeTypeHelper (org.codelibs.fess.crawler.helper.MimeTypeHelper): 5, BufferedInputStream (java.io.BufferedInputStream): 4, MalformedURLException (java.net.MalformedURLException): 4, ResponseData (org.codelibs.fess.crawler.entity.ResponseData): 4, FileInputStream (java.io.FileInputStream): 3, Date (java.util.Date): 3, HashMap (java.util.HashMap): 3, HashSet (java.util.HashSet): 3, ExtractException (org.codelibs.fess.crawler.exception.ExtractException): 3, Extractor (org.codelibs.fess.crawler.extractor.Extractor): 3, IgnoreCloseInputStream (org.codelibs.fess.crawler.util.IgnoreCloseInputStream): 3, FileOutputStream (java.io.FileOutputStream): 2, ConnectException (java.net.ConnectException): 2, NoRouteToHostException (java.net.NoRouteToHostException): 2