Example 11 with ChildUrlsException

use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess by codelibs.

the class FileListIndexUpdateCallbackImpl method addDocument.

protected void addDocument(final Map<String, String> paramMap, final Map<String, Object> dataMap) {
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    synchronized (indexUpdateCallback) {
        // required check: skip data that has no URL field
        if (!dataMap.containsKey(fessConfig.getIndexFieldUrl()) || dataMap.get(fessConfig.getIndexFieldUrl()) == null) {
            logger.warn("Could not add a doc. Invalid data: {}", dataMap);
            return;
        }
        final String url = dataMap.get(fessConfig.getIndexFieldUrl()).toString();
        final CrawlerClient client = crawlerClientFactory.getClient(url);
        if (client == null) {
            logger.warn("CrawlerClient is null. Data: {}", dataMap);
            return;
        }
        final long maxAccessCount = getMaxAccessCount(paramMap, dataMap);
        long counter = 0;
        final Deque<String> urlQueue = new LinkedList<>();
        urlQueue.offer(url);
        while (!urlQueue.isEmpty() && (maxAccessCount < 0 || counter < maxAccessCount)) {
            final Map<String, Object> localDataMap = dataMap.entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
            String processingUrl = urlQueue.poll();
            if (deleteUrlList.contains(processingUrl)) {
                // delete before indexing
                deleteDocuments();
            }
            try {
                for (int i = 0; i < maxRedirectCount; i++) {
                    processingUrl = processRequest(paramMap, localDataMap, processingUrl, client);
                    if (processingUrl == null) {
                        break;
                    }
                    counter++;
                    localDataMap.put(fessConfig.getIndexFieldUrl(), processingUrl);
                }
            } catch (final ChildUrlsException e) {
                e.getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
            } catch (final DataStoreCrawlingException e) {
                final Throwable cause = e.getCause();
                if (cause instanceof ChildUrlsException) {
                    ((ChildUrlsException) cause).getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
                } else if (maxAccessCount != 1L) {
                    throw e;
                } else {
                    logger.warn("Failed to access {}.", processingUrl, e);
                }
            }
        }
    }
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) LinkedList(java.util.LinkedList) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) RequestData(org.codelibs.fess.crawler.entity.RequestData) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) Map(java.util.Map)
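
The loop above treats ChildUrlsException as control flow: a container URL yields more URLs rather than a document, and those URLs go back onto the same work queue. Below is a minimal, self-contained sketch of that round trip; the processUrl helper and the example URLs are hypothetical, not part of the Fess API.

import java.util.Deque;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;

import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;

public class ChildUrlQueueSketch {

    public void crawl(final String rootUrl) {
        final Deque<String> urlQueue = new LinkedList<>();
        urlQueue.offer(rootUrl);
        while (!urlQueue.isEmpty()) {
            final String url = urlQueue.poll();
            try {
                processUrl(url);
            } catch (final ChildUrlsException e) {
                // same idiom as addDocument above: drain the child URLs back
                // onto the work queue instead of failing the crawl
                e.getChildUrlList().stream().map(RequestData::getUrl).forEach(urlQueue::offer);
            }
        }
    }

    // hypothetical stand-in for a CrawlerClient call: directory-like URLs are
    // reported through ChildUrlsException rather than a normal response
    protected void processUrl(final String url) {
        if (url.endsWith("/")) {
            final Set<RequestData> children = new HashSet<>();
            children.add(RequestDataBuilder.newRequestData().get().url(url + "child.txt").build());
            throw new ChildUrlsException(children, getClass().getName() + "#processUrl");
        }
        // otherwise: index the document at this URL
    }
}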

Example 12 with ChildUrlsException

use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess by codelibs.

the class FileListIndexUpdateCallbackImpl method processRequest.

protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url, final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            return responseData.getRedirectLocation();
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        if (dataMap.containsKey(Constants.SESSION_ID)) {
            responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
        } else {
            responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
        }
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            logger.warn("No url rule. Data: {}", dataMap);
        } else {
            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (responseProcessor instanceof DefaultResponseProcessor) {
                final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                final ResultData resultData = transformer.transform(responseData);
                final byte[] data = resultData.getData();
                if (data != null) {
                    try {
                        @SuppressWarnings("unchecked") final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                        dataMap.putAll(responseDataMap);
                    } catch (final Exception e) {
                        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                    }
                }
                // remove fields listed in ignore.field.names (or the defaults) from the document
                String[] ignoreFields;
                if (paramMap.containsKey("ignore.field.names")) {
                    ignoreFields = paramMap.get("ignore.field.names").split(",");
                } else {
                    ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
                }
                stream(ignoreFields).of(stream -> stream.map(String::trim).forEach(s -> dataMap.remove(s)));
                indexUpdateCallback.store(paramMap, dataMap);
            } else {
                logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}", responseProcessor, dataMap);
            }
        }
        return null;
    } catch (final ChildUrlsException e) {
        throw new DataStoreCrawlingException(url, "Redirected to " + e.getChildUrlList().stream().map(RequestData::getUrl).collect(Collectors.joining(", ")), e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
Also used : Constants(org.codelibs.fess.Constants) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) SearchEngineClient(org.codelibs.fess.es.client.SearchEngineClient) SerializeUtil(org.codelibs.core.io.SerializeUtil) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) Deque(java.util.Deque) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) ArrayList(java.util.ArrayList) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Map(java.util.Map) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) LinkedList(java.util.LinkedList) ExecutorService(java.util.concurrent.ExecutorService) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) QueryBuilders(org.opensearch.index.query.QueryBuilders) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) ResultData(org.codelibs.fess.crawler.entity.ResultData) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) Rule(org.codelibs.fess.crawler.rule.Rule) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Collectors(java.util.stream.Collectors) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) Logger(org.apache.logging.log4j.Logger) RequestData(org.codelibs.fess.crawler.entity.RequestData) ComponentUtil(org.codelibs.fess.util.ComponentUtil) SingletonLaContainer(org.lastaflute.di.core.SingletonLaContainer) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) LogManager(org.apache.logging.log4j.LogManager) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData)
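
The stream(ignoreFields).of(...) call above is codelibs' StreamUtil idiom. As a rough equivalent with plain java.util.stream, the field cleanup step might look like the sketch below; the field names are illustrative only.

import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;

public class IgnoreFieldsSketch {

    public static void main(final String[] args) {
        final Map<String, Object> dataMap = new HashMap<>();
        dataMap.put("url", "ftp://example.com/pub/a.txt");
        dataMap.put("indexingTarget", "true");
        dataMap.put("sessionId", "20240101000000");

        // equivalent of the ignore.field.names handling above: split the
        // comma-separated list, trim each entry, then drop the matching keys
        // before the document would be handed to indexUpdateCallback.store(...)
        final String ignoreFieldNames = " indexingTarget , sessionId ";
        Arrays.stream(ignoreFieldNames.split(","))
                .map(String::trim)
                .forEach(dataMap::remove);

        System.out.println(dataMap); // prints {url=ftp://example.com/pub/a.txt}
    }
}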

Example 13 with ChildUrlsException

use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.

the class FtpClient method doHead.

/*
     * (non-Javadoc)
     *
     * @see org.codelibs.robot.client.S2RobotClient#doHead(java.lang.String)
     */
@Override
public ResponseData doHead(final String url) {
    try {
        final ResponseData responseData = processRequest(url, false);
        responseData.setMethod(Constants.HEAD_METHOD);
        return responseData;
    } catch (final ChildUrlsException e) {
        return null;
    }
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) ResponseData(org.codelibs.fess.crawler.entity.ResponseData)
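
doHead maps ChildUrlsException to null, so a HEAD request against a directory-like URL produces no response rather than an error. A hedged usage sketch follows; it assumes an FtpClient that has already been configured by the crawler container, since building a working one by hand needs more setup than shown.

import org.codelibs.core.io.CloseableUtil;
import org.codelibs.fess.crawler.client.ftp.FtpClient;
import org.codelibs.fess.crawler.entity.ResponseData;

public class DoHeadUsageSketch {

    public void check(final FtpClient ftpClient) {
        final ResponseData head = ftpClient.doHead("ftp://example.com/pub/");
        if (head == null) {
            // a container URL: fall back to doGet, which reports the
            // directory entries through ChildUrlsException
        } else {
            System.out.println(head.getHttpStatusCode());
            CloseableUtil.closeQuietly(head);
        }
    }
}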

Example 14 with ChildUrlsException

use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.

the class FtpClient method updateResponseData.

protected void updateResponseData(final String uri, final boolean includeContent, final ResponseData responseData, FTPClient client, final FtpInfo ftpInfo, FTPFile file) {
    if (file == null) {
        responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
        responseData.setCharSet(charset);
        responseData.setContentLength(0);
        ftpClientQueue.offer(client);
        return;
    }
    if (file.isSymbolicLink()) {
        final String link = file.getLink();
        String redirect = null;
        if (link == null) {
            responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
            ftpClientQueue.offer(client);
            return;
        } else if (link.startsWith("/")) {
            redirect = ftpInfo.toUrl(file.getLink());
        } else if (link.startsWith("../")) {
            redirect = ftpInfo.toChildUrl(file.getLink());
        } else {
            redirect = ftpInfo.toChildUrl("../" + file.getLink());
        }
        if (!uri.equals(redirect)) {
            responseData.setHttpStatusCode(Constants.OK_STATUS);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
            responseData.setRedirectLocation(redirect);
            ftpClientQueue.offer(client);
            return;
        }
    }
    if (file.isFile()) {
        responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
        responseData.setCharSet(Constants.UTF_8);
        responseData.setLastModified(file.getTimestamp().getTime());
        // check file size
        responseData.setContentLength(file.getSize());
        checkMaxContentLength(responseData);
        if (file.getUser() != null) {
            responseData.addMetaData(FTP_FILE_USER, file.getUser());
        }
        if (file.getGroup() != null) {
            responseData.addMetaData(FTP_FILE_GROUP, file.getGroup());
        }
        if (includeContent) {
            File tempFile = null;
            File outputFile = null;
            try {
                tempFile = File.createTempFile("ftp-", ".tmp");
                try (OutputStream out = new BufferedOutputStream(new FileOutputStream(tempFile))) {
                    if (!client.retrieveFile(ftpInfo.getName(), out)) {
                        throw new CrawlingAccessException("Failed to retrieve: " + ftpInfo.toUrl());
                    }
                }
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                try (InputStream is = new FileInputStream(tempFile)) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                } catch (final Exception e) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                }
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength() + " byte) is over " + maxLength + " byte. The url is " + uri);
                    }
                }
                responseData.setCharSet(geCharSet(tempFile));
                if (tempFile.length() < maxCachedContentSize) {
                    try (InputStream contentStream = new BufferedInputStream(new FileInputStream(tempFile))) {
                        responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                    }
                } else {
                    outputFile = File.createTempFile("crawler-FtpClient-", ".out");
                    CopyUtil.copy(tempFile, outputFile);
                    responseData.setResponseBody(outputFile, true);
                }
                ftpClientQueue.offer(client);
            } catch (final CrawlingAccessException e) {
                ftpClientQueue.offer(client);
                throw e;
            } catch (final Exception e) {
                logger.warn("I/O Exception.", e);
                disconnectInternalClient(client);
                responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
            } finally {
                if (tempFile != null && !tempFile.delete()) {
                    logger.warn("Could not delete " + tempFile.getAbsolutePath());
                }
            }
        }
    } else if (file.isDirectory() || file.isSymbolicLink()) {
        final Set<RequestData> requestDataSet = new HashSet<>();
        if (includeContent) {
            try {
                final FTPFile[] ftpFiles = client.listFiles(ftpInfo.getName(), FTPFileFilters.NON_NULL);
                validateRequest(client);
                for (final FTPFile f : ftpFiles) {
                    final String childUri = ftpInfo.toChildUrl(f.getName());
                    requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                }
            } catch (final IOException e) {
                disconnectInternalClient(client);
                throw new CrawlingAccessException("Could not access " + uri, e);
            }
        }
        ftpClientQueue.offer(client);
        throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
    } else {
        responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE);
        responseData.setCharSet(charset);
        responseData.setContentLength(0);
        ftpClientQueue.offer(client);
    }
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) HashSet(java.util.HashSet) Set(java.util.Set) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper) BufferedInputStream(java.io.BufferedInputStream) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) BufferedOutputStream(java.io.BufferedOutputStream) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) FTPFile(org.apache.commons.net.ftp.FTPFile) IOException(java.io.IOException) CrawlerLoginFailureException(org.codelibs.fess.crawler.exception.CrawlerLoginFailureException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) MalformedURLException(java.net.MalformedURLException) File(java.io.File)
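
updateResponseData is the producer side of the pattern: for a directory (or a symbolic link that resolves to the same URI) it builds one RequestData per entry and throws ChildUrlsException instead of returning content. Below is a condensed sketch of that branch, with a hypothetical listing source standing in for FTPClient.listFiles.

import java.util.HashSet;
import java.util.Set;

import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;

public class DirectoryListingSketch {

    // hypothetical stand-in for client.listFiles(...) above
    protected String[] listEntries(final String baseUrl) {
        return new String[] { "a.txt", "b.txt" };
    }

    public void process(final String baseUrl) {
        final Set<RequestData> requestDataSet = new HashSet<>();
        for (final String name : listEntries(baseUrl)) {
            requestDataSet.add(RequestDataBuilder.newRequestData().get()
                    .url(baseUrl + "/" + name).build());
        }
        // same contract as the directory branch above: signal "this URL is a
        // container" by throwing, so the caller enqueues the children
        throw new ChildUrlsException(requestDataSet, getClass().getName() + "#process");
    }
}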

Example 15 with ChildUrlsException

use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.

the class CrawlerThread method run.

/*
     * (non-Javadoc)
     *
     * @see java.lang.Runnable#run()
     */
@Override
public void run() {
    log(logHelper, LogType.START_THREAD, crawlerContext);
    int threadCheckCount = 0;
    // bind the crawler context and its services to this thread
    CrawlingParameterUtil.setCrawlerContext(crawlerContext);
    CrawlingParameterUtil.setUrlQueueService(urlQueueService);
    CrawlingParameterUtil.setDataService(dataService);
    try {
        while (crawlerContext.getStatus() != CrawlerStatus.DONE && isContinue(threadCheckCount)) {
            final UrlQueue<?> urlQueue = urlQueueService.poll(crawlerContext.sessionId);
            if (isValid(urlQueue)) {
                ResponseData responseData = null;
                log(logHelper, LogType.START_CRAWLING, crawlerContext, urlQueue);
                try {
                    final CrawlerClient client = getClient(urlQueue.getUrl());
                    if (client == null) {
                        log(logHelper, LogType.UNSUPPORTED_URL_AT_CRAWLING_STARTED, crawlerContext, urlQueue);
                        continue;
                    }
                    startCrawling();
                    // set urlQueue to thread
                    CrawlingParameterUtil.setUrlQueue(urlQueue);
                    if (crawlerContext.intervalController != null) {
                        crawlerContext.intervalController.delay(IntervalController.PRE_PROCESSING);
                    }
                    final boolean contentUpdated = isContentUpdated(client, urlQueue);
                    if (contentUpdated) {
                        log(logHelper, LogType.GET_CONTENT, crawlerContext, urlQueue);
                        // access a url
                        final long startTime = SystemUtil.currentTimeMillis();
                        responseData = client.execute(RequestDataBuilder.newRequestData().method(urlQueue.getMethod()).url(urlQueue.getUrl()).build());
                        responseData.setExecutionTime(SystemUtil.currentTimeMillis() - startTime);
                        responseData.setParentUrl(urlQueue.getParentUrl());
                        responseData.setSessionId(crawlerContext.sessionId);
                        if (responseData.getRedirectLocation() == null) {
                            log(logHelper, LogType.PROCESS_RESPONSE, crawlerContext, urlQueue, responseData);
                            processResponse(urlQueue, responseData);
                        } else {
                            log(logHelper, LogType.REDIRECT_LOCATION, crawlerContext, urlQueue, responseData);
                            // redirect
                            storeChildUrl(responseData.getRedirectLocation(), urlQueue.getUrl(), null, urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
                        }
                    }
                    log(logHelper, LogType.FINISHED_CRAWLING, crawlerContext, urlQueue);
                } catch (final ChildUrlsException e) {
                    try {
                        final Set<RequestData> childUrlSet = e.getChildUrlList();
                        log(logHelper, LogType.PROCESS_CHILD_URLS_BY_EXCEPTION, crawlerContext, urlQueue, childUrlSet);
                        // add the child urls to the queue
                        storeChildUrls(childUrlSet, urlQueue.getUrl(), urlQueue.getDepth() == null ? 1 : urlQueue.getDepth() + 1);
                    } catch (final Exception e1) {
                        log(logHelper, LogType.CRAWLING_EXCETPION, crawlerContext, urlQueue, e1);
                    }
                    if (noWaitOnFolder) {
                        continue;
                    }
                } catch (final CrawlingAccessException e) {
                    log(logHelper, LogType.CRAWLING_ACCESS_EXCEPTION, crawlerContext, urlQueue, e);
                } catch (final Throwable e) {
                    log(logHelper, LogType.CRAWLING_EXCETPION, crawlerContext, urlQueue, e);
                } finally {
                    addSitemapsFromRobotsTxt(urlQueue);
                    if (responseData != null) {
                        CloseableUtil.closeQuietly(responseData);
                    }
                    if (crawlerContext.intervalController != null) {
                        crawlerContext.intervalController.delay(IntervalController.POST_PROCESSING);
                    }
                    // clear
                    threadCheckCount = 0;
                    // remove urlQueue from thread
                    CrawlingParameterUtil.setUrlQueue(null);
                    finishCrawling();
                }
            } else {
                log(logHelper, LogType.NO_URL_IN_QUEUE, crawlerContext, urlQueue, Integer.valueOf(threadCheckCount));
                if (crawlerContext.intervalController != null) {
                    crawlerContext.intervalController.delay(IntervalController.NO_URL_IN_QUEUE);
                }
                threadCheckCount++;
            }
            // interval
            if (crawlerContext.intervalController != null) {
                crawlerContext.intervalController.delay(IntervalController.WAIT_NEW_URL);
            }
        }
    } catch (final Throwable t) {
        log(logHelper, LogType.SYSTEM_ERROR, t);
    } finally {
        // remove crawlerContext from thread
        CrawlingParameterUtil.setCrawlerContext(null);
        CrawlingParameterUtil.setUrlQueueService(null);
        CrawlingParameterUtil.setDataService(null);
    }
    log(logHelper, LogType.FINISHED_THREAD, crawlerContext);
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) HashSet(java.util.HashSet) Set(java.util.Set) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient)
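
In the catch block above, child URLs inherit the parent's depth plus one, with a missing depth treated as the root; the same null-safe increment appears in the redirect branch. Isolated as a small sketch:

public final class DepthSketch {

    private DepthSketch() {
    }

    // null-safe depth increment from run() above: a parent without a recorded
    // depth is treated as the root, so its children start at depth 1
    public static int childDepth(final Integer parentDepth) {
        return parentDepth == null ? 1 : parentDepth + 1;
    }

    public static void main(final String[] args) {
        System.out.println(childDepth(null)); // 1
        System.out.println(childDepth(3)); // 4
    }
}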

Aggregations

ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException): 24 usages
ResponseData (org.codelibs.fess.crawler.entity.ResponseData): 17 usages
RequestData (org.codelibs.fess.crawler.entity.RequestData): 11 usages
ResultData (org.codelibs.fess.crawler.entity.ResultData): 9 usages
FessConfig (org.codelibs.fess.mylasta.direction.FessConfig): 9 usages
Set (java.util.Set): 8 usages
CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException): 8 usages
HashSet (java.util.HashSet): 7 usages
CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException): 7 usages
ConfigName (org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName): 7 usages
ComponentNotFoundException (org.lastaflute.di.core.exception.ComponentNotFoundException): 7 usages
Map (java.util.Map): 6 usages
CrawlerClient (org.codelibs.fess.crawler.client.CrawlerClient): 5 usages
BufferedInputStream (java.io.BufferedInputStream): 4 usages
IOException (java.io.IOException): 4 usages
MalformedURLException (java.net.MalformedURLException): 4 usages
HashMap (java.util.HashMap): 4 usages
RequestDataBuilder (org.codelibs.fess.crawler.builder.RequestDataBuilder): 4 usages
MaxLengthExceededException (org.codelibs.fess.crawler.exception.MaxLengthExceededException): 4 usages
Document (org.w3c.dom.Document): 4 usages