Search in sources:

Example 21 with CrawlingAccessException

use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

the class CrawlerThread method run.

/*
     * (non-Javadoc)
     *
     * @see java.lang.Runnable#run()
     */
@Override
public void run() {
    log(logHelper, LogType.START_THREAD, crawlerContext);
    int threadCheckCount = 0;
    // set urlQueue to thread
    CrawlingParameterUtil.setCrawlerContext(crawlerContext);
    CrawlingParameterUtil.setUrlQueueService(urlQueueService);
    CrawlingParameterUtil.setDataService(dataService);
    try {
        while (crawlerContext.getStatus() != CrawlerStatus.DONE && isContinue(threadCheckCount)) {
            final UrlQueue<?> urlQueue = urlQueueService.poll(crawlerContext.sessionId);
            if (isValid(urlQueue)) {
                ResponseData responseData = null;
                log(logHelper, LogType.START_CRAWLING, crawlerContext, urlQueue);
                try {
                    final CrawlerClient client = getClient(urlQueue.getUrl());
                    if (client == null) {
                        log(logHelper, LogType.UNSUPPORTED_URL_AT_CRAWLING_STARTED, crawlerContext, urlQueue);
                        continue;
                    }
                    startCrawling();
                    // set urlQueue to thread
                    CrawlingParameterUtil.setUrlQueue(urlQueue);
                    if (crawlerContext.intervalController != null) {
                        crawlerContext.intervalController.delay(IntervalController.PRE_PROCESSING);
                    }
                    final boolean contentUpdated = isContentUpdated(client, urlQueue);
                    if (contentUpdated) {
                        log(logHelper, LogType.GET_CONTENT, crawlerContext, urlQueue);
                        // access a url
                        final long startTime = SystemUtil.currentTimeMillis();
                        responseData = client.execute(RequestDataBuilder.newRequestData()
                                .method(urlQueue.getMethod()).url(urlQueue.getUrl()).build());
                        responseData.setExecutionTime(SystemUtil.currentTimeMillis() - startTime);
                        responseData.setParentUrl(urlQueue.getParentUrl());
                        responseData.setSessionId(crawlerContext.sessionId);
                        if (responseData.getRedirectLocation() == null) {
                            log(logHelper, LogType.PROCESS_RESPONSE, crawlerContext, urlQueue, responseData);
                            processResponse(urlQueue, responseData);
                        } else {
                            log(logHelper, LogType.REDIRECT_LOCATION, crawlerContext, urlQueue, responseData);
                            // redirect
                            storeChildUrl(responseData.getRedirectLocation(), urlQueue.getUrl(), null, urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
                        }
                    }
                    log(logHelper, LogType.FINISHED_CRAWLING, crawlerContext, urlQueue);
                } catch (final ChildUrlsException e) {
                    try {
                        final Set<RequestData> childUrlSet = e.getChildUrlList();
                        log(logHelper, LogType.PROCESS_CHILD_URLS_BY_EXCEPTION, crawlerContext, urlQueue, childUrlSet);
                        // add a url
                        storeChildUrls(childUrlSet, urlQueue.getUrl(), urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
                    } catch (final Exception e1) {
                        // note: CRAWLING_EXCETPION (sic) is the constant's spelling in the LogType enum
                        log(logHelper, LogType.CRAWLING_EXCETPION, crawlerContext, urlQueue, e1);
                    }
                    if (noWaitOnFolder) {
                        continue;
                    }
                } catch (final CrawlingAccessException e) {
                    log(logHelper, LogType.CRAWLING_ACCESS_EXCEPTION, crawlerContext, urlQueue, e);
                } catch (final Throwable e) {
                    log(logHelper, LogType.CRAWLING_EXCETPION, crawlerContext, urlQueue, e);
                } finally {
                    addSitemapsFromRobotsTxt(urlQueue);
                    if (responseData != null) {
                        CloseableUtil.closeQuietly(responseData);
                    }
                    if (crawlerContext.intervalController != null) {
                        crawlerContext.intervalController.delay(IntervalController.POST_PROCESSING);
                    }
                    // clear
                    threadCheckCount = 0;
                    // remove urlQueue from thread
                    CrawlingParameterUtil.setUrlQueue(null);
                    finishCrawling();
                }
            } else {
                log(logHelper, LogType.NO_URL_IN_QUEUE, crawlerContext, urlQueue, Integer.valueOf(threadCheckCount));
                if (crawlerContext.intervalController != null) {
                    crawlerContext.intervalController.delay(IntervalController.NO_URL_IN_QUEUE);
                }
                threadCheckCount++;
            }
            // interval
            if (crawlerContext.intervalController != null) {
                crawlerContext.intervalController.delay(IntervalController.WAIT_NEW_URL);
            }
        }
    } catch (final Throwable t) {
        log(logHelper, LogType.SYSTEM_ERROR, t);
    } finally {
        // remove crawlerContext from thread
        CrawlingParameterUtil.setCrawlerContext(null);
        CrawlingParameterUtil.setUrlQueueService(null);
        CrawlingParameterUtil.setDataService(null);
    }
    log(logHelper, LogType.FINISHED_THREAD, crawlerContext);
}
Also used: ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) HashSet(java.util.HashSet) Set(java.util.Set) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient)
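
The catch ordering above matters: CrawlingAccessException is unchecked (it extends CrawlerSystemException, a RuntimeException), so it must be caught before the broad Throwable handler to get access-failure logging instead of generic error logging. A minimal standalone sketch of that pattern; fetch() and the URL are illustrative, not part of fess-crawler:

import org.codelibs.fess.crawler.exception.CrawlingAccessException;

public class CatchOrderSketch {
    public static void main(final String[] args) {
        try {
            fetch("smb://example/share/missing.txt");
        } catch (final CrawlingAccessException e) {
            // a recoverable access failure: log it and move on to the next URL
            System.err.println("access failed: " + e.getMessage());
        } catch (final Throwable t) {
            // anything else is treated as an unexpected crawling error
            System.err.println("unexpected: " + t);
        }
    }

    // illustrative stand-in for a client call that fails while accessing a resource
    private static void fetch(final String url) {
        throw new CrawlingAccessException("Could not access " + url);
    }
}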

Example 22 with CrawlingAccessException

use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

the class SmbClient method getResponseData.

protected ResponseData getResponseData(final String uri, final boolean includeContent) {
    final ResponseData responseData = new ResponseData();
    responseData.setMethod(Constants.GET_METHOD);
    final String filePath = preprocessUri(uri);
    responseData.setUrl(filePath);
    SmbFile file = null;
    final SmbAuthentication smbAuthentication = smbAuthenticationHolder.get(filePath);
    if (logger.isDebugEnabled()) {
        logger.debug("Creating SmbFile: " + filePath);
    }
    try {
        if (smbAuthentication == null) {
            file = new SmbFile(filePath);
        } else {
            file = new SmbFile(filePath, smbAuthentication.getAuthentication());
        }
    } catch (final MalformedURLException e) {
        logger.warn("Could not parse url: " + filePath, e);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("Processing SmbFile: " + filePath);
    }
    try {
        if (file == null) {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        } else if (file.isFile()) {
            if (logger.isDebugEnabled()) {
                logger.debug("Checking SmbFile Size: " + filePath);
            }
            responseData.setContentLength(file.length());
            checkMaxContentLength(responseData);
            responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
            // geCharSet (sic) is the method name as it appears in SmbClient
            responseData.setCharSet(geCharSet(file));
            responseData.setLastModified(new Date(file.lastModified()));
            responseData.addMetaData(SMB_CREATE_TIME, new Date(file.createTime()));
            try {
                if (logger.isDebugEnabled()) {
                    logger.debug("Parsing SmbFile Owner: " + filePath);
                }
                final SID ownerUser = file.getOwnerUser();
                if (ownerUser != null) {
                    final String[] ownerAttributes = { ownerUser.getAccountName(), ownerUser.getDomainName() };
                    responseData.addMetaData(SMB_OWNER_ATTRIBUTES, ownerAttributes);
                }
            } catch (final IOException e) {
                logger.warn("Cannot get owner of the file: " + filePath);
            }
            if (logger.isDebugEnabled()) {
                logger.debug("Parsing SmbFile ACL: " + filePath);
            }
            processAccessControlEntries(responseData, file);
            final Map<String, List<String>> headerFieldMap = file.getHeaderFields();
            if (headerFieldMap != null) {
                for (final Map.Entry<String, List<String>> entry : headerFieldMap.entrySet()) {
                    responseData.addMetaData(entry.getKey(), entry.getValue());
                }
            }
            if (file.canRead()) {
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                if (includeContent) {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Parsing SmbFile Content: " + filePath);
                    }
                    if (file.getContentLength() < maxCachedContentSize) {
                        try (InputStream contentStream = new BufferedInputStream(new SmbFileInputStream(file))) {
                            responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                        } catch (final Exception e) {
                            logger.warn("I/O Exception.", e);
                            responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
                        }
                    } else {
                        File outputFile = null;
                        try {
                            outputFile = File.createTempFile("crawler-SmbClient-", ".out");
                            copy(file, outputFile);
                            responseData.setResponseBody(outputFile, true);
                        } catch (final Exception e) {
                            logger.warn("I/O Exception.", e);
                            responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
                            if (outputFile != null && !outputFile.delete()) {
                                logger.warn("Could not delete " + outputFile.getAbsolutePath());
                            }
                        }
                    }
                    if (logger.isDebugEnabled()) {
                        logger.debug("Parsing SmbFile MIME Type: " + filePath);
                    }
                    try (final InputStream is = responseData.getResponseBody()) {
                        responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                    } catch (final Exception e) {
                        responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                    }
                } else {
                    if (logger.isDebugEnabled()) {
                        logger.debug("Parsing SmbFile MIME Type: " + filePath);
                    }
                    try (final InputStream is = new SmbFileInputStream(file)) {
                        responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                    } catch (final Exception e) {
                        responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                    }
                }
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength() + " byte) is over " + maxLength + " byte. The url is " + filePath);
                    }
                }
            } else {
                // Forbidden
                responseData.setHttpStatusCode(Constants.FORBIDDEN_STATUS_CODE);
                responseData.setMimeType(APPLICATION_OCTET_STREAM);
            }
        } else if (file.isDirectory()) {
            if (logger.isDebugEnabled()) {
                logger.debug("Parsing SmbFile Directory: " + filePath);
            }
            final Set<RequestData> requestDataSet = new HashSet<>(100);
            if (includeContent) {
                final SmbFile[] files = file.listFiles();
                if (files != null) {
                    for (final SmbFile f : files) {
                        final String childUri = f.toString();
                        requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                    }
                }
            }
            throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
        } else {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        }
    } catch (final CrawlerSystemException e) {
        CloseableUtil.closeQuietly(responseData);
        throw e;
    } catch (final SmbException e) {
        CloseableUtil.closeQuietly(responseData);
        throw new CrawlingAccessException("Could not access " + uri, e);
    }
    return responseData;
}
Also used: MalformedURLException(java.net.MalformedURLException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper) SmbException(jcifs.smb.SmbException) BufferedInputStream(java.io.BufferedInputStream) RequestData(org.codelibs.fess.crawler.entity.RequestData) HashSet(java.util.HashSet) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) SmbFileInputStream(jcifs.smb.SmbFileInputStream) InputStream(java.io.InputStream) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) IOException(java.io.IOException) Date(java.util.Date) IORuntimeException(org.codelibs.core.exception.IORuntimeException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) SmbFile(jcifs.smb.SmbFile) SID(jcifs.smb.SID) Map(java.util.Map) File(java.io.File)
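
Note the directory branch above: rather than returning content, getResponseData throws a ChildUrlsException carrying one RequestData per directory entry. A hedged sketch of how a caller might unpack it; since getResponseData is protected, this assumes a subclass, and the class and method names here are illustrative:

import org.codelibs.core.io.CloseableUtil;
import org.codelibs.fess.crawler.client.smb.SmbClient;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;

public class DirectoryAwareSmbClient extends SmbClient {
    public void crawl(final String uri) {
        try {
            final ResponseData data = getResponseData(uri, true);
            // a regular file: process the body here, then release it
            CloseableUtil.closeQuietly(data);
        } catch (final ChildUrlsException e) {
            for (final RequestData child : e.getChildUrlList()) {
                // each directory entry becomes a new request to enqueue
                System.out.println("child url: " + child.getUrl());
            }
        }
    }
}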

Example 23 with CrawlingAccessException

use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

the class ResponseDataUtil method createResponseBodyFile.

public static File createResponseBodyFile(final ResponseData responseData) {
    File tempFile = null;
    FileOutputStream fos = null;
    try (final InputStream is = responseData.getResponseBody()) {
        tempFile = File.createTempFile("crawler-", ".tmp");
        fos = new FileOutputStream(tempFile);
        CopyUtil.copy(is, fos);
    } catch (final Exception e) {
        // close the stream first so the temp file can be deleted
        CloseableUtil.closeQuietly(fos);
        // clean up
        if (tempFile != null && !tempFile.delete()) {
            logger.warn("Could not delete a temp file: " + tempFile);
        }
        throw new CrawlingAccessException("Could not read a response body: " + responseData.getUrl(), e);
    } finally {
        CloseableUtil.closeQuietly(fos);
    }
    return tempFile;
}
Also used: CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) InputStream(java.io.InputStream) FileOutputStream(java.io.FileOutputStream) File(java.io.File)
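
The helper hands ownership of the temporary file to the caller, so the caller must delete it when done. A hedged usage sketch, assuming ResponseDataUtil lives in org.codelibs.fess.crawler.util and responseData already holds a readable body; the processing step is illustrative:

import java.io.File;

import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.util.ResponseDataUtil;

public final class BodyFileSketch {
    public static void handle(final ResponseData responseData) {
        final File bodyFile = ResponseDataUtil.createResponseBodyFile(responseData);
        try {
            // ... parse or index bodyFile here ...
        } finally {
            // the temp file is now the caller's responsibility
            if (!bodyFile.delete()) {
                bodyFile.deleteOnExit();
            }
        }
    }
}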

Example 24 with CrawlingAccessException

use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

the class BinaryTransformer method transform.

/*
     * (non-Javadoc)
     *
     * @see
     * org.codelibs.fess.crawler.transformer.Transformer#transform(org.codelibs.fess.crawler.entity
     * .ResponseData)
     */
@Override
public ResultData transform(final ResponseData responseData) {
    if (responseData == null || !responseData.hasResponseBody()) {
        throw new CrawlingAccessException("No response body.");
    }
    final ResultData resultData = new ResultData();
    resultData.setTransformerName(getName());
    try (BufferedInputStream bis = new BufferedInputStream(responseData.getResponseBody())) {
        resultData.setData(IOUtils.toByteArray(bis));
        resultData.setEncoding(responseData.getCharSet());
        return resultData;
    } catch (final IOException e) {
        throw new CrawlerSystemException("Could not convert the input stream.", e);
    }
}
Also used: ResultData(org.codelibs.fess.crawler.entity.ResultData) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) BufferedInputStream(java.io.BufferedInputStream) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) IOException(java.io.IOException)
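
A hedged sketch of driving the transformer directly. The ResponseData setters match the ones used in the examples above, while the setName() call and the BinaryTransformer package (org.codelibs.fess.crawler.transformer.impl) are assumptions about the surrounding API:

import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.entity.ResultData;
import org.codelibs.fess.crawler.transformer.impl.BinaryTransformer;

public class BinaryTransformSketch {
    public static void main(final String[] args) {
        final BinaryTransformer transformer = new BinaryTransformer();
        transformer.setName("binaryTransformer");

        final ResponseData responseData = new ResponseData();
        responseData.setResponseBody("raw bytes".getBytes());
        responseData.setCharSet("UTF-8");

        final ResultData resultData = transformer.transform(responseData);
        // the body passes through unchanged as a byte array
        System.out.println(resultData.getData().length + " bytes");
    }
}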

Example 25 with CrawlingAccessException

use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

the class HtmlTransformer method loadCharset.

protected String loadCharset(final InputStream inputStream) {
    BufferedInputStream bis = null;
    String encoding = null;
    try {
        bis = new BufferedInputStream(inputStream);
        // read at most preloadSizeForCharset bytes to sniff a charset declaration
        final byte[] buffer = new byte[preloadSizeForCharset];
        final int size = bis.read(buffer);
        if (size != -1) {
            final String content = new String(buffer, 0, size);
            encoding = parseCharset(content);
        }
    } catch (final IOException e) {
        throw new CrawlingAccessException("Could not load a content.", e);
    }
    return normalizeEncoding(encoding);
}
Also used: BufferedInputStream(java.io.BufferedInputStream) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) IOException(java.io.IOException)
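
The same prefix-sniffing idea in standalone form: read a bounded prefix, scan it for a charset declaration, and fall back when none is found. This is a minimal sketch with a simplified pattern; fess-crawler's actual parseCharset may match differently:

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CharsetSniffSketch {
    private static final Pattern CHARSET = Pattern.compile("charset\\s*=\\s*[\"']?([\\w.:-]+)", Pattern.CASE_INSENSITIVE);

    public static String sniff(final InputStream in, final int preloadSize) throws IOException {
        final BufferedInputStream bis = new BufferedInputStream(in);
        final byte[] buffer = new byte[preloadSize];
        final int size = bis.read(buffer);
        if (size > 0) {
            // an ASCII-compatible decode of the prefix is enough to find the declaration
            final Matcher m = CHARSET.matcher(new String(buffer, 0, size, "US-ASCII"));
            if (m.find()) {
                return m.group(1);
            }
        }
        return "UTF-8"; // fallback when no declaration is present
    }

    public static void main(final String[] args) throws IOException {
        final byte[] html = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=Shift_JIS\">".getBytes("US-ASCII");
        System.out.println(sniff(new ByteArrayInputStream(html), 2048)); // prints Shift_JIS
    }
}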

Aggregations

CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException): 36 usages
CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException): 14 usages
InputStream (java.io.InputStream): 13 usages
Map (java.util.Map): 9 usages
IOException (java.io.IOException): 8 usages
ResponseData (org.codelibs.fess.crawler.entity.ResponseData): 8 usages
BufferedInputStream (java.io.BufferedInputStream): 7 usages
HashMap (java.util.HashMap): 7 usages
HashSet (java.util.HashSet): 7 usages
ResultData (org.codelibs.fess.crawler.entity.ResultData): 7 usages
ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException): 7 usages
MalformedURLException (java.net.MalformedURLException): 6 usages
AccessResultData (org.codelibs.fess.crawler.entity.AccessResultData): 6 usages
MaxLengthExceededException (org.codelibs.fess.crawler.exception.MaxLengthExceededException): 6 usages
ByteArrayInputStream (java.io.ByteArrayInputStream): 5 usages
File (java.io.File): 5 usages
LinkedHashMap (java.util.LinkedHashMap): 5 usages
FileInputStream (java.io.FileInputStream): 4 usages
UnsupportedEncodingException (java.io.UnsupportedEncodingException): 4 usages
Date (java.util.Date): 4 usages