Example 11 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

The class FtpClient, method getResponseData.

protected ResponseData getResponseData(final String uri, final boolean includeContent) {
    final ResponseData responseData = new ResponseData();
    FTPClient client = null;
    try {
        responseData.setMethod(Constants.GET_METHOD);
        final FtpInfo ftpInfo = new FtpInfo(uri);
        responseData.setUrl(ftpInfo.toUrl());
        client = getClient(ftpInfo);
        FTPFile file = null;
        client.changeWorkingDirectory(ftpInfo.getParent());
        validateRequest(client);
        if (ftpInfo.getName() == null) {
            // root directory: list child URLs instead of fetching content
            final Set<RequestData> requestDataSet = new HashSet<>();
            if (includeContent) {
                try {
                    final FTPFile[] files = client.listFiles(ftpInfo.getParent(), FTPFileFilters.NON_NULL);
                    validateRequest(client);
                    for (final FTPFile f : files) {
                        final String childUri = ftpInfo.toChildUrl(f.getName());
                        requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                    }
                } catch (final IOException e) {
                    disconnectInternalClient(client);
                    throw new CrawlingAccessException("Could not access " + uri, e);
                }
            }
            ftpClientQueue.offer(client);
            throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
        }
        final FTPFile[] files = client.listFiles(null, FTPFileFilters.NON_NULL);
        validateRequest(client);
        for (final FTPFile f : files) {
            if (ftpInfo.getName().equals(f.getName())) {
                file = f;
                break;
            }
        }
        updateResponseData(uri, includeContent, responseData, client, ftpInfo, file);
    } catch (final CrawlerSystemException e) {
        CloseableUtil.closeQuietly(responseData);
        throw e;
    } catch (final Exception e) {
        CloseableUtil.closeQuietly(responseData);
        throw new CrawlingAccessException("Could not access " + uri, e);
    }
    return responseData;
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) FTPFile(org.apache.commons.net.ftp.FTPFile) IOException(java.io.IOException) FTPClient(org.apache.commons.net.ftp.FTPClient) CrawlerLoginFailureException(org.codelibs.fess.crawler.exception.CrawlerLoginFailureException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) MalformedURLException(java.net.MalformedURLException) RequestData(org.codelibs.fess.crawler.entity.RequestData) HashSet(java.util.HashSet)
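
The ChildUrlsException thrown for a directory is a control-flow signal rather than an error: it hands the collected child URLs back to the caller so they can be queued for crawling instead of indexed. A minimal caller sketch follows, assuming hypothetical processContent and urlQueue helpers that are not part of fess-crawler:

// Hypothetical caller: a directory listing surfaces as ChildUrlsException,
// whose RequestData entries are enqueued instead of being indexed.
try (final ResponseData responseData =
        ftpClient.execute(RequestDataBuilder.newRequestData().get().url(uri).build())) {
    processContent(responseData); // hypothetical content handler
} catch (final ChildUrlsException e) {
    for (final RequestData child : e.getChildUrlList()) {
        urlQueue.offer(child.getUrl()); // hypothetical crawl queue
    }
}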

Example 12 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

The class HcHttpClient, method processRobotsTxt.

protected void processRobotsTxt(final String url) {
    if (StringUtil.isBlank(url)) {
        throw new CrawlerSystemException("url is null or empty.");
    }
    if (robotsTxtHelper == null || !robotsTxtHelper.isEnabled()) {
        // robots.txt processing is disabled
        return;
    }
    // crawler context
    final CrawlerContext crawlerContext = CrawlingParameterUtil.getCrawlerContext();
    if (crawlerContext == null) {
        // wrong state: no crawler context is available
        return;
    }
    final int idx = url.indexOf('/', url.indexOf("://") + 3);
    String hostUrl;
    if (idx >= 0) {
        hostUrl = url.substring(0, idx);
    } else {
        hostUrl = url;
    }
    final String robotTxtUrl = hostUrl + "/robots.txt";
    // skip this robots.txt if it has already been processed
    if (crawlerContext.getRobotsTxtUrlSet().contains(robotTxtUrl)) {
        if (logger.isDebugEnabled()) {
            logger.debug(robotTxtUrl + " is already visited.");
        }
        return;
    }
    if (logger.isInfoEnabled()) {
        logger.info("Checking URL: " + robotTxtUrl);
    }
    // remember this robots.txt URL as visited
    crawlerContext.getRobotsTxtUrlSet().add(robotTxtUrl);
    final HttpGet httpGet = new HttpGet(robotTxtUrl);
    // request headers
    for (final Header header : requestHeaderList) {
        httpGet.addHeader(header);
    }
    HttpEntity httpEntity = null;
    try {
        // fetch the content
        final HttpResponse response = executeHttpClient(httpGet);
        httpEntity = response.getEntity();
        final int httpStatusCode = response.getStatusLine().getStatusCode();
        if (httpStatusCode == 200) {
            // check file size
            final Header contentLengthHeader = response.getFirstHeader("Content-Length");
            if (contentLengthHeader != null) {
                final String value = contentLengthHeader.getValue();
                final long contentLength = Long.parseLong(value);
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength("text/plain");
                    if (contentLength > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + contentLength + " bytes) exceeds the limit of " + maxLength + " bytes. The url is " + robotTxtUrl);
                    }
                }
            }
            if (httpEntity != null) {
                final RobotsTxt robotsTxt = robotsTxtHelper.parse(httpEntity.getContent());
                if (robotsTxt != null) {
                    final String[] sitemaps = robotsTxt.getSitemaps();
                    if (sitemaps.length > 0) {
                        crawlerContext.addSitemaps(sitemaps);
                    }
                    final RobotsTxt.Directive directive = robotsTxt.getMatchedDirective(userAgent);
                    if (directive != null) {
                        if (useRobotsTxtDisallows) {
                            for (String urlPattern : directive.getDisallows()) {
                                if (StringUtil.isNotBlank(urlPattern)) {
                                    urlPattern = convertRobotsTxtPathPattern(urlPattern);
                                    crawlerContext.getUrlFilter().addExclude(hostUrl + urlPattern);
                                }
                            }
                        }
                        if (useRobotsTxtAllows) {
                            for (String urlPattern : directive.getAllows()) {
                                if (StringUtil.isNotBlank(urlPattern)) {
                                    urlPattern = convertRobotsTxtPathPattern(urlPattern);
                                    crawlerContext.getUrlFilter().addInclude(hostUrl + urlPattern);
                                }
                            }
                        }
                    }
                }
            }
        }
    } catch (final CrawlerSystemException e) {
        httpGet.abort();
        throw e;
    } catch (final Exception e) {
        httpGet.abort();
        throw new CrawlingAccessException("Could not process " + robotTxtUrl + ". ", e);
    } finally {
        EntityUtils.consumeQuietly(httpEntity);
    }
}
Also used : HttpEntity(org.apache.http.HttpEntity) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) HttpGet(org.apache.http.client.methods.HttpGet) HttpResponse(org.apache.http.HttpResponse) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ParseException(java.text.ParseException) NoRouteToHostException(java.net.NoRouteToHostException) SocketException(java.net.SocketException) ConnectException(java.net.ConnectException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) UnknownHostException(java.net.UnknownHostException) CrawlerContext(org.codelibs.fess.crawler.CrawlerContext) Header(org.apache.http.Header) BasicHeader(org.apache.http.message.BasicHeader) RobotsTxt(org.codelibs.fess.crawler.entity.RobotsTxt)
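
The convertRobotsTxtPathPattern call above turns a Disallow/Allow path rule into a regular expression before it is registered with the URL filter. A minimal sketch of such a conversion, assuming only the standard robots.txt wildcards (* matches any sequence, a trailing $ anchors the rule); the details of the real helper may differ:

// A sketch: escape literal dots, map "*" to ".*", and treat an
// unanchored rule as a prefix match on the path.
protected String convertRobotsTxtPathPattern(final String path) {
    String pattern = path.replace(".", "\\.").replace("*", ".*");
    if (!pattern.startsWith("/")) {
        pattern = ".*" + pattern; // rule may match anywhere in the path
    }
    if (!pattern.endsWith("$") && !pattern.endsWith(".*")) {
        pattern = pattern + ".*"; // prefix match unless explicitly anchored
    }
    return pattern;
}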

Example 13 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

The class HcHttpClient, method processHttpMethod.

protected ResponseData processHttpMethod(final String url, final HttpUriRequest httpRequest) {
    try {
        processRobotsTxt(url);
    } catch (final CrawlingAccessException e) {
        if (logger.isInfoEnabled()) {
            final StringBuilder buf = new StringBuilder(100);
            buf.append(e.getMessage());
            if (e.getCause() != null) {
                buf.append(e.getCause().getMessage());
            }
            logger.info(buf.toString());
        } else if (logger.isDebugEnabled()) {
            logger.debug("Crawling Access Exception at " + url, e);
        }
    }
    // request headers
    for (final Header header : requestHeaderList) {
        httpRequest.addHeader(header);
    }
    ResponseData responseData = new ResponseData();
    HttpEntity httpEntity = null;
    try {
        // fetch the content
        final HttpResponse response = executeHttpClient(httpRequest);
        httpEntity = response.getEntity();
        final int httpStatusCode = response.getStatusLine().getStatusCode();
        // redirect
        if (isRedirectHttpStatus(httpStatusCode)) {
            final Header locationHeader = response.getFirstHeader("location");
            if (locationHeader == null) {
                logger.warn("Invalid redirect location at " + url);
            } else {
                final String redirectLocation;
                if (locationHeader.getValue().startsWith("/")) {
                    redirectLocation = buildRedirectLocation(url, locationHeader.getValue());
                } else {
                    redirectLocation = locationHeader.getValue();
                }
                responseData = new ResponseData();
                responseData.setRedirectLocation(redirectLocation);
                return responseData;
            }
        }
        String contentType = null;
        final Header contentTypeHeader = response.getFirstHeader("Content-Type");
        if (contentTypeHeader != null) {
            contentType = contentTypeHeader.getValue();
            final int idx = contentType.indexOf(';');
            if (idx > 0) {
                contentType = contentType.substring(0, idx);
                if (APPLICATION_OCTET_STREAM.equals(contentType)) {
                    contentType = null;
                }
            }
        }
        long contentLength = 0;
        String contentEncoding = Constants.UTF_8;
        if (httpEntity == null) {
            responseData.setResponseBody(new byte[0]);
            if (contentType == null) {
                contentType = defaultMimeType;
            }
        } else {
            final InputStream responseBodyStream = httpEntity.getContent();
            final File outputFile = File.createTempFile("crawler-HcHttpClient-", ".out");
            DeferredFileOutputStream dfos = null;
            try {
                try {
                    dfos = new DeferredFileOutputStream((int) maxCachedContentSize, outputFile);
                    CopyUtil.copy(responseBodyStream, dfos);
                    dfos.flush();
                } finally {
                    CloseableUtil.closeQuietly(dfos);
                }
            } catch (final Exception e) {
                if (!outputFile.delete()) {
                    logger.warn("Could not delete " + outputFile.getAbsolutePath());
                }
                throw e;
            }
            if (dfos.isInMemory()) {
                responseData.setResponseBody(dfos.getData());
                contentLength = dfos.getData().length;
                if (!outputFile.delete()) {
                    logger.warn("Could not delete " + outputFile.getAbsolutePath());
                }
                if (contentType == null) {
                    try (InputStream is = new ByteArrayInputStream(dfos.getData())) {
                        contentType = mimeTypeHelper.getContentType(is, url);
                    } catch (final Exception e) {
                        logger.debug("Failed to detect mime-type.", e);
                        contentType = defaultMimeType;
                    }
                }
            } else {
                responseData.setResponseBody(outputFile, true);
                contentLength = outputFile.length();
                if (contentType == null) {
                    try (InputStream is = new FileInputStream(outputFile)) {
                        contentType = mimeTypeHelper.getContentType(is, url);
                    } catch (final Exception e) {
                        logger.debug("Failed to detect mime-type.", e);
                        contentType = defaultMimeType;
                    }
                }
            }
            final Header contentEncodingHeader = httpEntity.getContentEncoding();
            if (contentEncodingHeader != null) {
                contentEncoding = contentEncodingHeader.getValue();
            }
        }
        // check file size
        if (contentLengthHelper != null) {
            final long maxLength = contentLengthHelper.getMaxLength(contentType);
            if (contentLength > maxLength) {
                throw new MaxLengthExceededException("The content length (" + contentLength + " bytes) exceeds the limit of " + maxLength + " bytes. The url is " + url);
            }
        }
        responseData.setUrl(url);
        responseData.setCharSet(contentEncoding);
        if (httpRequest instanceof HttpHead) {
            responseData.setMethod(Constants.HEAD_METHOD);
        } else {
            responseData.setMethod(Constants.GET_METHOD);
        }
        responseData.setHttpStatusCode(httpStatusCode);
        for (final Header header : response.getAllHeaders()) {
            responseData.addMetaData(header.getName(), header.getValue());
        }
        responseData.setMimeType(contentType);
        final Header contentLengthHeader = response.getFirstHeader("Content-Length");
        if (contentLengthHeader == null) {
            responseData.setContentLength(contentLength);
        } else {
            final String value = contentLengthHeader.getValue();
            try {
                responseData.setContentLength(Long.parseLong(value));
            } catch (final Exception e) {
                responseData.setContentLength(contentLength);
            }
        }
        checkMaxContentLength(responseData);
        final Header lastModifiedHeader = response.getFirstHeader("Last-Modified");
        if (lastModifiedHeader != null) {
            final String value = lastModifiedHeader.getValue();
            if (StringUtil.isNotBlank(value)) {
                final Date d = parseLastModified(value);
                if (d != null) {
                    responseData.setLastModified(d);
                }
            }
        }
        return responseData;
    } catch (final UnknownHostException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("Unknown host(" + e.getMessage() + "): " + url, e);
    } catch (final NoRouteToHostException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("No route to host(" + e.getMessage() + "): " + url, e);
    } catch (final ConnectException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("Connection timeout(" + e.getMessage() + "): " + url, e);
    } catch (final SocketException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("Socket exception(" + e.getMessage() + "): " + url, e);
    } catch (final IOException e) {
        closeResources(httpRequest, responseData);
        throw new CrawlingAccessException("I/O exception(" + e.getMessage() + "): " + url, e);
    } catch (final CrawlerSystemException e) {
        closeResources(httpRequest, responseData);
        throw e;
    } catch (final Exception e) {
        closeResources(httpRequest, responseData);
        throw new CrawlerSystemException("Failed to access " + url, e);
    } finally {
        EntityUtils.consumeQuietly(httpEntity);
    }
}
Also used : SocketException(java.net.SocketException) HttpEntity(org.apache.http.HttpEntity) UnknownHostException(java.net.UnknownHostException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) ByteArrayInputStream(java.io.ByteArrayInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) HttpResponse(org.apache.http.HttpResponse) IOException(java.io.IOException) NoRouteToHostException(java.net.NoRouteToHostException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) ParseException(java.text.ParseException) ConnectException(java.net.ConnectException) MalformedURLException(java.net.MalformedURLException) HttpHead(org.apache.http.client.methods.HttpHead) Date(java.util.Date) Header(org.apache.http.Header) BasicHeader(org.apache.http.message.BasicHeader) DeferredFileOutputStream(org.apache.commons.io.output.DeferredFileOutputStream) File(java.io.File)
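
buildRedirectLocation is only invoked when the Location header starts with "/", so it must resolve a root-relative path against the original request URL. A minimal sketch using java.net.URL resolution; this is an assumed shape for the helper, not the actual fess-crawler implementation:

// A sketch: resolve a root-relative redirect against the original URL,
// e.g. ("http://host/a/b", "/c") resolves to "http://host/c".
protected String buildRedirectLocation(final String url, final String location)
        throws MalformedURLException {
    return new URL(new URL(url), location).toExternalForm();
}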

Example 14 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

The class SitemapsHelper, method parse.

protected SitemapSet parse(final InputStream in, final boolean recursive) {
    final BufferedInputStream bis = new BufferedInputStream(in);
    bis.mark(preloadSize);
    String preloadData = StringUtil.EMPTY;
    final byte[] bytes = new byte[preloadSize];
    try {
        if (bis.read(bytes) == -1) {
            throw new CrawlingAccessException("No sitemaps data.");
        }
        preloadData = new String(bytes, Constants.UTF_8);
        if (preloadData.indexOf("<urlset") >= 0) {
            // XML Sitemaps
            bis.reset();
            return parseXmlSitemaps(bis);
        } else if (preloadData.indexOf("<sitemapindex") >= 0) {
            // XML Sitemaps Index
            bis.reset();
            return parseXmlSitemapsIndex(bis);
        } else if (preloadData.startsWith("http://") || preloadData.startsWith("https://")) {
            // Text Sitemaps
            bis.reset();
            return parseTextSitemaps(bis);
        } else {
            // gz
            bis.reset();
            return parse(new GZIPInputStream(bis), false);
        }
    } catch (final CrawlingAccessException e) {
        throw e;
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not parse Sitemaps: " + preloadData, e);
    }
}
Also used : GZIPInputStream(java.util.zip.GZIPInputStream) BufferedInputStream(java.io.BufferedInputStream) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) SitemapsException(org.codelibs.fess.crawler.exception.SitemapsException)
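
The final else branch assumes that anything which is neither recognizable XML nor a URL list must be gzip-compressed, and retries once through GZIPInputStream. A hedged refinement would be to test the gzip magic bytes (0x1f 0x8b) in the preloaded buffer before committing to that path; the sketch below is illustrative, not the fess-crawler implementation:

// A sketch: gzip streams start with the two magic bytes 0x1f 0x8b.
// "length" would be the count returned by bis.read(bytes) above.
private static boolean isGzip(final byte[] preloaded, final int length) {
    return length >= 2
            && (preloaded[0] & 0xff) == 0x1f
            && (preloaded[1] & 0xff) == 0x8b;
}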

Example 15 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess by codelibs.

The class BaseThumbnailGenerator, method process.

protected boolean process(final String id, final Predicate<ResponseData> consumer) {
    return process(id, (configId, url) -> {
        final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
        final CrawlingConfig config = crawlingConfigHelper.getCrawlingConfig(configId);
        if (config == null) {
            throw new ThumbnailGenerationException("No CrawlingConfig: " + configId);
        }
        if (logger.isInfoEnabled()) {
            logger.info("Generating Thumbnail: {}", url);
        }
        final CrawlerClientFactory crawlerClientFactory = config.initializeClientFactory(() -> ComponentUtil.getComponent(CrawlerClientFactory.class));
        final CrawlerClient client = crawlerClientFactory.getClient(url);
        if (client == null) {
            throw new ThumbnailGenerationException("No CrawlerClient: " + configId + ", url: " + url);
        }
        String u = url;
        for (int i = 0; i < maxRedirectCount; i++) {
            try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(u).build())) {
                if (StringUtil.isNotBlank(responseData.getRedirectLocation())) {
                    u = responseData.getRedirectLocation();
                    continue;
                }
                if (StringUtil.isBlank(responseData.getUrl())) {
                    throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url + " (Response URL is empty)");
                }
                return consumer.test(responseData);
            } catch (final CrawlingAccessException e) {
                if (logger.isDebugEnabled()) {
                    throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url, e);
                }
                throw new ThumbnailGenerationException(e.getMessage());
            } catch (final Exception e) {
                throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url, e);
            }
        }
        throw new ThumbnailGenerationException("Failed to process a thumbnail content: " + url + " (Redirect Loop)");
    });
}
Also used : CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper) CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig) ThumbnailGenerationException(org.codelibs.fess.exception.ThumbnailGenerationException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient)
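
The Predicate<ResponseData> passed to process receives the final (post-redirect) response and reports whether a thumbnail was actually produced. A hypothetical usage sketch; thumbnailId, writeThumbnail, and outputFile are illustrative names, not fess API:

// Hypothetical usage: render the crawled response body into an image file.
final boolean generated = process(thumbnailId, responseData -> {
    try (final InputStream in = responseData.getResponseBody()) {
        return writeThumbnail(in, outputFile); // hypothetical image writer
    } catch (final IOException e) {
        logger.warn("Failed to generate a thumbnail for " + responseData.getUrl(), e);
        return false;
    }
});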

Aggregations

CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException): 36 usages
CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException): 14 usages
InputStream (java.io.InputStream): 13 usages
Map (java.util.Map): 9 usages
IOException (java.io.IOException): 8 usages
ResponseData (org.codelibs.fess.crawler.entity.ResponseData): 8 usages
BufferedInputStream (java.io.BufferedInputStream): 7 usages
HashMap (java.util.HashMap): 7 usages
HashSet (java.util.HashSet): 7 usages
ResultData (org.codelibs.fess.crawler.entity.ResultData): 7 usages
ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException): 7 usages
MalformedURLException (java.net.MalformedURLException): 6 usages
AccessResultData (org.codelibs.fess.crawler.entity.AccessResultData): 6 usages
MaxLengthExceededException (org.codelibs.fess.crawler.exception.MaxLengthExceededException): 6 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 5 usages
File (java.io.File): 5 usages
LinkedHashMap (java.util.LinkedHashMap): 5 usages
FileInputStream (java.io.FileInputStream): 4 usages
UnsupportedEncodingException (java.io.UnsupportedEncodingException): 4 usages
Date (java.util.Date): 4 usages