
Example 1 with ChildUrlsException

Use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess by codelibs.

In the class FileListIndexUpdateCallbackImpl, the method processRequest:

protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url, final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            return responseData.getRedirectLocation();
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        if (dataMap.containsKey(Constants.SESSION_ID)) {
            responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
        } else {
            responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
        }
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            logger.warn("No url rule. Data: " + dataMap);
        } else {
            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (responseProcessor instanceof DefaultResponseProcessor) {
                final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                final ResultData resultData = transformer.transform(responseData);
                final byte[] data = resultData.getData();
                if (data != null) {
                    try {
                        @SuppressWarnings("unchecked") final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                        dataMap.putAll(responseDataMap);
                    } catch (final Exception e) {
                        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                    }
                }
                // remove ignored fields from the data map before storing
                String[] ignoreFields;
                if (paramMap.containsKey("ignore.field.names")) {
                    ignoreFields = paramMap.get("ignore.field.names").split(",");
                } else {
                    ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
                }
                stream(ignoreFields).of(stream -> stream.map(s -> s.trim()).forEach(s -> dataMap.remove(s)));
                indexUpdateCallback.store(paramMap, dataMap);
            } else {
                logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: " + responseProcessor + ", Data: " + dataMap);
            }
        }
        return null;
    } catch (final ChildUrlsException e) {
        throw new DataStoreCrawlingException(url, "Redirected to " + e.getChildUrlList().stream().map(r -> r.getUrl()).collect(Collectors.joining(", ")), e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
Also used : Constants(org.codelibs.fess.Constants) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) LoggerFactory(org.slf4j.LoggerFactory) SerializeUtil(org.codelibs.core.io.SerializeUtil) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) IndexUpdateCallback(org.codelibs.fess.ds.IndexUpdateCallback) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) ArrayList(java.util.ArrayList) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Map(java.util.Map) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) ExecutorService(java.util.concurrent.ExecutorService) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) Logger(org.slf4j.Logger) ResultData(org.codelibs.fess.crawler.entity.ResultData) FessEsClient(org.codelibs.fess.es.client.FessEsClient) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) Rule(org.codelibs.fess.crawler.rule.Rule) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Collectors(java.util.stream.Collectors) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) ComponentUtil(org.codelibs.fess.util.ComponentUtil) SingletonLaContainer(org.lastaflute.di.core.SingletonLaContainer) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData)
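
Example 1 treats a ChildUrlsException from client.execute() as an unexpected outcome for a file-list crawl and rewraps it as a DataStoreCrawlingException whose message joins the child URLs. When the child URLs themselves are what you want, the same getChildUrlList() call can be flattened into plain strings. A minimal sketch, using only the APIs visible above; the helper class and method names are ours, not part of Fess:

import java.util.List;
import java.util.stream.Collectors;

import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;

public final class ChildUrlUtil {

    private ChildUrlUtil() {
        // utility class
    }

    // Flattens the RequestData entries carried by a ChildUrlsException into
    // plain URL strings, mirroring the joining logic in the catch block above.
    public static List<String> toUrls(final ChildUrlsException e) {
        return e.getChildUrlList().stream()
                .map(RequestData::getUrl)
                .collect(Collectors.toList());
    }
}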

Example 2 with ChildUrlsException

Use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.

In the class FileSystemClient, the method getResponseData:

protected ResponseData getResponseData(final String uri, final boolean includeContent) {
    final ResponseData responseData = new ResponseData();
    try {
        responseData.setMethod(Constants.GET_METHOD);
        final String filePath = preprocessUri(uri);
        responseData.setUrl(filePath);
        File file = null;
        try {
            file = new File(new URI(filePath));
        } catch (final URISyntaxException e) {
            logger.warn("Could not parse url: " + filePath, e);
        }
        if (file == null) {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        } else if (file.isFile()) {
            // check file size
            responseData.setContentLength(file.length());
            checkMaxContentLength(responseData);
            try {
                final FileOwnerAttributeView ownerAttrView = Files.getFileAttributeView(file.toPath(), FileOwnerAttributeView.class);
                if (ownerAttrView != null) {
                    UserPrincipal owner = ownerAttrView.getOwner();
                    if (owner != null) {
                        responseData.addMetaData(FS_FILE_USER, owner.getName());
                    }
                }
            } catch (Exception e) {
                logger.warn("Failed to parse FileOwnerAttributeView.", e);
            }
            try {
                final AclFileAttributeView aclView = Files.getFileAttributeView(file.toPath(), AclFileAttributeView.class);
                if (aclView != null) {
                    responseData.addMetaData(FILE_ATTRIBUTE_VIEW, aclView);
                    responseData.addMetaData(FS_FILE_GROUPS, aclView.getAcl().stream().map(acl -> acl.principal().getName()).toArray(n -> new String[n]));
                }
            } catch (Exception e) {
                logger.warn("Failed to parse AclFileAttributeView.", e);
            }
            try {
                final PosixFileAttributeView posixView = Files.getFileAttributeView(file.toPath(), PosixFileAttributeView.class);
                if (posixView != null) {
                    responseData.addMetaData(FILE_ATTRIBUTE_VIEW, posixView);
                    responseData.addMetaData(FS_FILE_GROUPS, new String[] { posixView.readAttributes().group().getName() });
                }
            } catch (Exception e) {
                logger.warn("Failed to parse PosixFileAttributeView.", e);
            }
            responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
            responseData.setCharSet(geCharSet(file));
            responseData.setLastModified(new Date(file.lastModified()));
            if (file.canRead()) {
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                try (final InputStream is = new BufferedInputStream(new FileInputStream(file))) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                } catch (final Exception e) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                }
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength() + " byte) is over " + maxLength + " byte. The url is " + filePath);
                    }
                }
                if (includeContent) {
                    if (file.length() < maxCachedContentSize) {
                        try (InputStream contentStream = new BufferedInputStream(new FileInputStream(file))) {
                            responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                        } catch (final Exception e) {
                            logger.warn("I/O Exception.", e);
                            responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
                        }
                    } else {
                        responseData.setResponseBody(file, false);
                    }
                }
            } else {
                // Forbidden
                responseData.setHttpStatusCode(Constants.FORBIDDEN_STATUS_CODE);
                responseData.setMimeType(APPLICATION_OCTET_STREAM);
            }
        } else if (file.isDirectory()) {
            final Set<RequestData> requestDataSet = new HashSet<>();
            if (includeContent) {
                final File[] files = file.listFiles();
                if (files != null) {
                    for (final File f : files) {
                        final String childUri = f.toURI().toASCIIString();
                        requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                    }
                }
            }
            throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
        } else {
            responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
        }
    } catch (final CrawlerSystemException e) {
        CloseableUtil.closeQuietly(responseData);
        throw e;
    } catch (final Exception e) {
        CloseableUtil.closeQuietly(responseData);
        throw new CrawlingAccessException("Could not access " + uri, e);
    }
    return responseData;
}
Also used : FileOwnerAttributeView(java.nio.file.attribute.FileOwnerAttributeView) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) BufferedInputStream(java.io.BufferedInputStream) Date(java.util.Date) URISyntaxException(java.net.URISyntaxException) PosixFileAttributeView(java.nio.file.attribute.PosixFileAttributeView) LoggerFactory(org.slf4j.LoggerFactory) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) AbstractCrawlerClient(org.codelibs.fess.crawler.client.AbstractCrawlerClient) HashSet(java.util.HashSet) UserPrincipal(java.nio.file.attribute.UserPrincipal) URI(java.net.URI) ContentLengthHelper(org.codelibs.fess.crawler.helper.ContentLengthHelper) MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper) InputStreamUtil(org.codelibs.core.io.InputStreamUtil) AclFileAttributeView(java.nio.file.attribute.AclFileAttributeView) Logger(org.slf4j.Logger) Files(java.nio.file.Files) Resource(javax.annotation.Resource) StringUtil(org.codelibs.core.lang.StringUtil) Set(java.util.Set) FileInputStream(java.io.FileInputStream) CrawlerContainer(org.codelibs.fess.crawler.container.CrawlerContainer) File(java.io.File) CloseableUtil(org.codelibs.core.io.CloseableUtil) Constants(org.codelibs.fess.crawler.Constants) URLEncoder(java.net.URLEncoder) RequestData(org.codelibs.fess.crawler.entity.RequestData) AccessTimeoutTarget(org.codelibs.fess.crawler.client.AccessTimeoutTarget) TimeoutManager(org.codelibs.core.timer.TimeoutManager) TimeoutTask(org.codelibs.core.timer.TimeoutTask) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) InputStream(java.io.InputStream) ResponseData(org.codelibs.fess.crawler.entity.ResponseData)
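
The directory branch above is the producer side of the contract: instead of returning a ResponseData for a container, the client enumerates the children and aborts with a ChildUrlsException so the crawler can schedule them. Stripped of the attribute and MIME handling, the pattern reduces to the following standalone sketch; the class and method names are ours, not part of FileSystemClient:

import java.io.File;
import java.util.HashSet;
import java.util.Set;

import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.entity.RequestData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;

public class DirectoryListingSketch {

    // Enumerates a local directory and hands its children back to the caller
    // as a ChildUrlsException, the same signal FileSystemClient uses above.
    protected void throwChildUrls(final File directory) {
        final Set<RequestData> requestDataSet = new HashSet<>();
        final File[] files = directory.listFiles();
        if (files != null) {
            for (final File f : files) {
                final String childUri = f.toURI().toASCIIString();
                requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
            }
        }
        throw new ChildUrlsException(requestDataSet, getClass().getName() + "#throwChildUrls");
    }
}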

Example 3 with ChildUrlsException

Use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.

In the class FileSystemClient, the method doHead:

/*
 * (non-Javadoc)
 *
 * @see org.codelibs.fess.crawler.client.CrawlerClient#doHead(java.lang.String)
 */
@Override
public ResponseData doHead(final String url) {
    try {
        final ResponseData responseData = processRequest(url, false);
        responseData.setMethod(Constants.HEAD_METHOD);
        return responseData;
    } catch (final ChildUrlsException e) {
        return null;
    }
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) ResponseData(org.codelibs.fess.crawler.entity.ResponseData)
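
Because doHead() swallows the exception, callers see null rather than a ChildUrlsException when the URL resolves to a directory. Below is a caller-side sketch under that assumption; the helper class is hypothetical and relies only on the CrawlerClient#doHead(String) signature referenced in the javadoc above:

import java.io.IOException;

import org.codelibs.fess.crawler.client.CrawlerClient;
import org.codelibs.fess.crawler.entity.ResponseData;

public class HeadCheckSketch {

    // Returns true when the URL points at a single resource; a null result from
    // doHead() means the target expanded to child URLs (e.g. a directory).
    public boolean isSingleResource(final CrawlerClient client, final String url) throws IOException {
        try (final ResponseData responseData = client.doHead(url)) {
            return responseData != null;
        }
    }
}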

Example 4 with ChildUrlsException

Use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess-crawler by codelibs.

In the class FtpClient, the method getResponseData:

protected ResponseData getResponseData(final String uri, final boolean includeContent) {
    final ResponseData responseData = new ResponseData();
    FTPClient client = null;
    try {
        responseData.setMethod(Constants.GET_METHOD);
        final FtpInfo ftpInfo = new FtpInfo(uri);
        responseData.setUrl(ftpInfo.toUrl());
        client = getClient(ftpInfo);
        FTPFile file = null;
        client.changeWorkingDirectory(ftpInfo.getParent());
        validateRequest(client);
        if (ftpInfo.getName() == null) {
            // root directory
            final Set<RequestData> requestDataSet = new HashSet<>();
            if (includeContent) {
                try {
                    final FTPFile[] files = client.listFiles(ftpInfo.getParent(), FTPFileFilters.NON_NULL);
                    validateRequest(client);
                    for (final FTPFile f : files) {
                        final String childUri = ftpInfo.toChildUrl(f.getName());
                        requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                    }
                } catch (final IOException e) {
                    disconnectInternalClient(client);
                    throw new CrawlingAccessException("Could not access " + uri, e);
                }
            }
            ftpClientQueue.offer(client);
            throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
        }
        final FTPFile[] files = client.listFiles(null, FTPFileFilters.NON_NULL);
        validateRequest(client);
        for (final FTPFile f : files) {
            if (ftpInfo.getName().equals(f.getName())) {
                file = f;
                break;
            }
        }
        updateResponseData(uri, includeContent, responseData, client, ftpInfo, file);
    } catch (final CrawlerSystemException e) {
        CloseableUtil.closeQuietly(responseData);
        throw e;
    } catch (final Exception e) {
        CloseableUtil.closeQuietly(responseData);
        throw new CrawlingAccessException("Could not access " + uri, e);
    }
    return responseData;
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) FTPFile(org.apache.commons.net.ftp.FTPFile) IOException(java.io.IOException) FTPClient(org.apache.commons.net.ftp.FTPClient) CrawlerLoginFailureException(org.codelibs.fess.crawler.exception.CrawlerLoginFailureException) MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) MalformedURLException(java.net.MalformedURLException) RequestData(org.codelibs.fess.crawler.entity.RequestData) HashSet(java.util.HashSet)
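
Note that the FTP connection is offered back to ftpClientQueue before the exception is thrown, so the pooled client is not lost to the control-flow signal. Combining the producer-side throws with a consumer-side catch yields a simple traversal loop; the sketch below is hypothetical, built only from the APIs shown in these examples, and is not the actual fess-crawler scheduling logic:

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;

import org.codelibs.fess.crawler.builder.RequestDataBuilder;
import org.codelibs.fess.crawler.client.CrawlerClient;
import org.codelibs.fess.crawler.entity.ResponseData;
import org.codelibs.fess.crawler.exception.ChildUrlsException;

public class TraversalSketch {

    // Breadth-first walk driven by ChildUrlsException: URLs that resolve to a
    // single resource are collected, while container URLs (directories, FTP
    // parents) contribute their children to the queue instead.
    public Set<String> collectDocumentUrls(final CrawlerClient client, final String rootUrl) throws Exception {
        final Set<String> documents = new HashSet<>();
        final Set<String> visited = new HashSet<>();
        final Deque<String> queue = new ArrayDeque<>();
        queue.add(rootUrl);
        while (!queue.isEmpty()) {
            final String url = queue.poll();
            if (!visited.add(url)) {
                continue; // already processed; guards against cycles
            }
            try (final ResponseData responseData =
                    client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
                documents.add(url);
            } catch (final ChildUrlsException e) {
                e.getChildUrlList().forEach(r -> queue.add(r.getUrl()));
            }
        }
        return documents;
    }
}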

Example 5 with ChildUrlsException

Use of org.codelibs.fess.crawler.exception.ChildUrlsException in project fess by codelibs.

In the class FessXpathTransformerTest, the method test_processMetaRobots_noindexnofollow, which verifies that a NOINDEX,NOFOLLOW robots meta tag makes the transformer abort with a ChildUrlsException carrying an empty child URL list:

public void test_processMetaRobots_noindexnofollow() throws Exception {
    final String data = "<meta name=\"ROBOTS\" content=\"NOINDEX,NOFOLLOW\" />";
    final Document document = getDocument(data);
    final FessXpathTransformer transformer = new FessXpathTransformer() {

        protected Map<String, String> getConfigPrameterMap(final ResponseData responseData, final ConfigName config) {
            return Collections.emptyMap();
        }
    };
    transformer.fessConfig = new FessConfig.SimpleImpl() {

        private static final long serialVersionUID = 1L;

        @Override
        public boolean isCrawlerIgnoreRobotsTags() {
            return false;
        }
    };
    final ResponseData responseData = new ResponseData();
    responseData.setUrl("http://example.com/");
    try {
        transformer.processMetaRobots(responseData, new ResultData(), document);
        fail();
    } catch (ChildUrlsException e) {
        assertTrue(e.getChildUrlList().isEmpty());
    } catch (Exception e) {
        fail();
    }
}
Also used : ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) ConfigName(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName) ResultData(org.codelibs.fess.crawler.entity.ResultData) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) Document(org.w3c.dom.Document) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) ComponentNotFoundException(org.lastaflute.di.core.exception.ComponentNotFoundException)
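
The empty child URL list asserted here is the point of the NOINDEX,NOFOLLOW case: the transformer aborts via ChildUrlsException, so the page is not indexed, and it supplies no child URLs, so nothing is followed. Assuming JUnit 4.13+ Assert is available on the test classpath (an assumption on our part, not how the Fess test is written), the same expectation could be phrased inside the test method as:

// Reuses transformer, responseData and document from the test body above.
final ChildUrlsException e = org.junit.Assert.assertThrows(ChildUrlsException.class,
        () -> transformer.processMetaRobots(responseData, new ResultData(), document));
org.junit.Assert.assertTrue(e.getChildUrlList().isEmpty());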

Aggregations

ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException): 24 usages
ResponseData (org.codelibs.fess.crawler.entity.ResponseData): 17 usages
RequestData (org.codelibs.fess.crawler.entity.RequestData): 11 usages
ResultData (org.codelibs.fess.crawler.entity.ResultData): 9 usages
FessConfig (org.codelibs.fess.mylasta.direction.FessConfig): 9 usages
Set (java.util.Set): 8 usages
CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException): 8 usages
HashSet (java.util.HashSet): 7 usages
CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException): 7 usages
ConfigName (org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName): 7 usages
ComponentNotFoundException (org.lastaflute.di.core.exception.ComponentNotFoundException): 7 usages
Map (java.util.Map): 6 usages
CrawlerClient (org.codelibs.fess.crawler.client.CrawlerClient): 5 usages
BufferedInputStream (java.io.BufferedInputStream): 4 usages
IOException (java.io.IOException): 4 usages
MalformedURLException (java.net.MalformedURLException): 4 usages
HashMap (java.util.HashMap): 4 usages
RequestDataBuilder (org.codelibs.fess.crawler.builder.RequestDataBuilder): 4 usages
MaxLengthExceededException (org.codelibs.fess.crawler.exception.MaxLengthExceededException): 4 usages
Document (org.w3c.dom.Document): 4 usages