Search in sources:

Example 1 with CrawlerSystemException

use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess by codelibs.

The class FileListIndexUpdateCallbackImpl defines the method processRequest.

/**
 * Crawls the given URL with the supplied client, merges the transformed
 * response fields into {@code dataMap}, and stores the document through the
 * index update callback.
 *
 * @param paramMap crawling parameters (may supply the crawling info id and
 *                 a comma-separated {@code ignore.field.names} list)
 * @param dataMap  document data map; mutated with the transformed response fields
 * @param url      the URL to fetch
 * @param client   the crawler client used to execute the request
 * @return the redirect location when the response is a redirect, otherwise {@code null}
 * @throws DataStoreCrawlingException if crawling or storing fails
 */
protected String processRequest(final Map<String, String> paramMap, final Map<String, Object> dataMap, final String url, final CrawlerClient client) {
    final long startTime = System.currentTimeMillis();
    try (final ResponseData responseData = client.execute(RequestDataBuilder.newRequestData().get().url(url).build())) {
        if (responseData.getRedirectLocation() != null) {
            // Let the caller follow the redirect instead of indexing this response.
            return responseData.getRedirectLocation();
        }
        responseData.setExecutionTime(System.currentTimeMillis() - startTime);
        if (dataMap.containsKey(Constants.SESSION_ID)) {
            responseData.setSessionId((String) dataMap.get(Constants.SESSION_ID));
        } else {
            responseData.setSessionId(paramMap.get(Constants.CRAWLING_INFO_ID));
        }
        final RuleManager ruleManager = SingletonLaContainer.getComponent(RuleManager.class);
        final Rule rule = ruleManager.getRule(responseData);
        if (rule == null) {
            // Parameterized logging: dataMap is only rendered when WARN is enabled.
            logger.warn("No url rule. Data: {}", dataMap);
        } else {
            responseData.setRuleId(rule.getRuleId());
            final ResponseProcessor responseProcessor = rule.getResponseProcessor();
            if (responseProcessor instanceof DefaultResponseProcessor) {
                final Transformer transformer = ((DefaultResponseProcessor) responseProcessor).getTransformer();
                final ResultData resultData = transformer.transform(responseData);
                final byte[] data = resultData.getData();
                if (data != null) {
                    try {
                        // The transformer serializes its field map into the result bytes.
                        @SuppressWarnings("unchecked") final Map<String, Object> responseDataMap = (Map<String, Object>) SerializeUtil.fromBinaryToObject(data);
                        dataMap.putAll(responseDataMap);
                    } catch (final Exception e) {
                        throw new CrawlerSystemException("Could not create an instance from bytes.", e);
                    }
                }
                // Remove fields that must not be indexed (configurable via paramMap).
                String[] ignoreFields;
                if (paramMap.containsKey("ignore.field.names")) {
                    ignoreFields = paramMap.get("ignore.field.names").split(",");
                } else {
                    ignoreFields = new String[] { Constants.INDEXING_TARGET, Constants.SESSION_ID };
                }
                stream(ignoreFields).of(stream -> stream.map(String::trim).forEach(dataMap::remove));
                indexUpdateCallback.store(paramMap, dataMap);
            } else {
                logger.warn("The response processor is not DefaultResponseProcessor. responseProcessor: {}, Data: {}", responseProcessor, dataMap);
            }
        }
        return null;
    } catch (final ChildUrlsException e) {
        throw new DataStoreCrawlingException(url, "Redirected to " + e.getChildUrlList().stream().map(r -> r.getUrl()).collect(Collectors.joining(", ")), e);
    } catch (final Exception e) {
        throw new DataStoreCrawlingException(url, "Failed to add: " + dataMap, e);
    }
}
Also used : Constants(org.codelibs.fess.Constants) IndexingHelper(org.codelibs.fess.helper.IndexingHelper) ThreadPoolExecutor(java.util.concurrent.ThreadPoolExecutor) LoggerFactory(org.slf4j.LoggerFactory) SerializeUtil(org.codelibs.core.io.SerializeUtil) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) IndexUpdateCallback(org.codelibs.fess.ds.IndexUpdateCallback) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) ArrayList(java.util.ArrayList) CrawlerClient(org.codelibs.fess.crawler.client.CrawlerClient) FessConfig(org.codelibs.fess.mylasta.direction.FessConfig) Map(java.util.Map) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) ExecutorService(java.util.concurrent.ExecutorService) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) Logger(org.slf4j.Logger) ResultData(org.codelibs.fess.crawler.entity.ResultData) FessEsClient(org.codelibs.fess.es.client.FessEsClient) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) Rule(org.codelibs.fess.crawler.rule.Rule) LinkedBlockingQueue(java.util.concurrent.LinkedBlockingQueue) Collectors(java.util.stream.Collectors) TimeUnit(java.util.concurrent.TimeUnit) List(java.util.List) ComponentUtil(org.codelibs.fess.util.ComponentUtil) SingletonLaContainer(org.lastaflute.di.core.SingletonLaContainer) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) RequestDataBuilder(org.codelibs.fess.crawler.builder.RequestDataBuilder) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) Transformer(org.codelibs.fess.crawler.transformer.Transformer) ResponseData(org.codelibs.fess.crawler.entity.ResponseData) 
DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) ResponseProcessor(org.codelibs.fess.crawler.processor.ResponseProcessor) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException) ResultData(org.codelibs.fess.crawler.entity.ResultData) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) DefaultResponseProcessor(org.codelibs.fess.crawler.processor.impl.DefaultResponseProcessor) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) RuleManager(org.codelibs.fess.crawler.rule.RuleManager) Rule(org.codelibs.fess.crawler.rule.Rule) Map(java.util.Map)

Example 2 with CrawlerSystemException

use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.

The class FileTransformer defines the method storeData.

/**
 * Writes the response body to a file whose path is derived from the response
 * URL, then records that path (as bytes in the configured charset, falling
 * back to UTF-8) in the result data.
 */
@Override
public void storeData(final ResponseData responseData, final ResultData resultData) {
    resultData.setTransformerName(getName());
    initBaseDir();
    final String targetPath = getFilePath(responseData.getUrl());
    // Serialize file creation/writing: createFile may probe for collision-free names.
    synchronized (this) {
        final File outputFile = createFile(targetPath);
        try (final InputStream bodyStream = responseData.getResponseBody();
            final OutputStream fileStream = new FileOutputStream(outputFile)) {
            CopyUtil.copy(bodyStream, fileStream);
        } catch (final IOException e) {
            throw new CrawlerSystemException("Could not store " + outputFile.getAbsolutePath(), e);
        }
    }
    try {
        resultData.setData(targetPath.getBytes(charsetName));
    } catch (final UnsupportedEncodingException e) {
        if (logger.isInfoEnabled()) {
            logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
        }
        // Remember the UTF-8 fallback so later calls skip the failing charset.
        charsetName = Constants.UTF_8_CHARSET.name();
        resultData.setData(targetPath.getBytes(Constants.UTF_8_CHARSET));
    }
    resultData.setEncoding(charsetName);
}
Also used : InputStream(java.io.InputStream) OutputStream(java.io.OutputStream) FileOutputStream(java.io.FileOutputStream) FileOutputStream(java.io.FileOutputStream) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IOException(java.io.IOException) File(java.io.File)

Example 3 with CrawlerSystemException

use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.

The class FileTransformer defines the method createFile.

/**
 * Resolves {@code path} (a '/'-separated relative path) to a File under
 * {@code baseDir}, creating intermediate directories as needed. When a path
 * segment collides with an existing regular file, alternative names of the
 * form {@code segment_N} (N &lt; {@code maxDuplicatedPath}) are tried.
 *
 * @param path the relative path derived from a URL
 * @return the target file to write to (parent directories are created; the
 *         file itself is not)
 */
protected File createFile(final String path) {
    final String[] paths = path.split("/");
    File targetFile = baseDir;
    // Walk every segment except the last, creating/disambiguating directories.
    for (int i = 0; i < paths.length - 1; i++) {
        File file = new File(targetFile, paths[i]);
        if (file.exists()) {
            if (!file.isDirectory()) {
                // A regular file occupies this name; probe "name_0", "name_1", ...
                for (int j = 0; j < maxDuplicatedPath; j++) {
                    file = new File(targetFile, paths[i] + "_" + j);
                    if (file.exists()) {
                        if (file.isDirectory()) {
                            // Reuse a previously disambiguated directory.
                            break;
                        }
                    } else {
                        if (!file.mkdirs()) {
                            throw new CrawlerSystemException("Could not create " + file.getAbsolutePath());
                        }
                        break;
                    }
                }
                // NOTE(review): if every candidate up to maxDuplicatedPath is a
                // regular file, the loop exits with the last candidate and it is
                // used as the "directory" below — confirm this is intended.
            }
        } else {
            if (!file.mkdirs()) {
                throw new CrawlerSystemException("Could not create " + file.getAbsolutePath());
            }
        }
        targetFile = file;
    }
    // Final segment: pick the first unused name, appending "_N" on collision.
    File file = new File(targetFile, paths[paths.length - 1]);
    if (file.exists()) {
        for (int i = 0; i < maxDuplicatedPath; i++) {
            file = new File(targetFile, paths[paths.length - 1] + "_" + i);
            if (!file.exists()) {
                targetFile = file;
                break;
            }
        }
        // NOTE(review): if all candidates exist, targetFile is never reassigned
        // here and the parent directory is returned — confirm this fallback.
    } else {
        targetFile = file;
    }
    return targetFile;
}
Also used : CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) File(java.io.File)

Example 4 with CrawlerSystemException

use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.

The class HtmlTransformer defines the method storeChildUrls.

/**
 * Parses the HTML response body, extracts child URLs according to the
 * configured tag/attribute rules, and registers them on the result data,
 * excluding the request's own URL and its duplicate form.
 */
protected void storeChildUrls(final ResponseData responseData, final ResultData resultData) {
    List<RequestData> childRequestList = new ArrayList<>();
    try (final InputStream is = responseData.getResponseBody()) {
        final DOMParser parser = getDomParser();
        parser.parse(new InputSource(is));
        final Document document = parser.getDocument();
        // Resolve relative links against <base href> when present, otherwise the response URL.
        final String baseHref = getBaseHref(document);
        URL baseUrl;
        try {
            baseUrl = new URL(baseHref != null ? baseHref : responseData.getUrl());
        } catch (final MalformedURLException e) {
            baseUrl = new URL(responseData.getUrl());
        }
        for (final Map.Entry<String, String> entry : childUrlRuleMap.entrySet()) {
            for (final String childUrl : getUrlFromTagAttribute(baseUrl, document, entry.getKey(), entry.getValue(), responseData.getCharSet())) {
                childRequestList.add(RequestDataBuilder.newRequestData().get().url(childUrl).build());
            }
        }
        childRequestList = convertChildUrlList(childRequestList);
        resultData.addAllUrl(childRequestList);
        resultData.addAllUrl(responseData.getChildUrlSet());
        // Never re-crawl the page itself (nor its duplicate-url form).
        final RequestData requestData = responseData.getRequestData();
        resultData.removeUrl(requestData);
        resultData.removeUrl(getDuplicateUrl(requestData));
    } catch (final CrawlerSystemException e) {
        // Already the canonical exception type; rethrow untouched.
        throw e;
    } catch (final Exception e) {
        throw new CrawlerSystemException("Could not store data.", e);
    }
}
Also used : InputSource(org.xml.sax.InputSource) MalformedURLException(java.net.MalformedURLException) BufferedInputStream(java.io.BufferedInputStream) InputStream(java.io.InputStream) ArrayList(java.util.ArrayList) Document(org.w3c.dom.Document) URL(java.net.URL) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) TransformerException(javax.xml.transform.TransformerException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) MalformedURLException(java.net.MalformedURLException) IOException(java.io.IOException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) RequestData(org.codelibs.fess.crawler.entity.RequestData) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) DOMParser(org.cyberneko.html.parsers.DOMParser) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 5 with CrawlerSystemException

use of org.codelibs.fess.crawler.exception.CrawlerSystemException in project fess-crawler by codelibs.

The class TextTransformer defines the method transform.

/**
 * Extracts plain text from the response body with the extractor matching the
 * response MIME type, and returns it as result data encoded with the
 * configured charset (falling back to UTF-8 on an invalid charset name).
 *
 * @see org.codelibs.fess.crawler.transformer.impl.AbstractTransformer#transform(org.codelibs.fess.crawler.entity.ResponseData)
 */
@Override
public ResultData transform(final ResponseData responseData) {
    if (responseData == null || !responseData.hasResponseBody()) {
        throw new CrawlingAccessException("No response body.");
    }
    final ExtractorFactory extractorFactory = crawlerContainer.getComponent("extractorFactory");
    if (extractorFactory == null) {
        throw new CrawlerSystemException("Could not find extractorFactory.");
    }
    final Extractor extractor = extractorFactory.getExtractor(responseData.getMimeType());
    // Hint the extractor with the resource name and content type.
    final Map<String, String> extractParams = new HashMap<>();
    extractParams.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    extractParams.put(HttpHeaders.CONTENT_TYPE, responseData.getMimeType());
    String extractedText = null;
    try (final InputStream in = responseData.getResponseBody()) {
        extractedText = extractor.getText(in, extractParams).getContent();
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not extract data.", e);
    }
    final ResultData resultData = new ResultData();
    resultData.setTransformerName(getName());
    try {
        resultData.setData(extractedText.getBytes(charsetName));
    } catch (final UnsupportedEncodingException e) {
        if (logger.isInfoEnabled()) {
            logger.info("Invalid charsetName: " + charsetName + ". Changed to " + Constants.UTF_8, e);
        }
        // Remember the UTF-8 fallback so later calls skip the failing charset.
        charsetName = Constants.UTF_8_CHARSET.name();
        resultData.setData(extractedText.getBytes(Constants.UTF_8_CHARSET));
    }
    resultData.setEncoding(charsetName);
    return resultData;
}
Also used : ResultData(org.codelibs.fess.crawler.entity.ResultData) AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) HashMap(java.util.HashMap) ExtractorFactory(org.codelibs.fess.crawler.extractor.ExtractorFactory) InputStream(java.io.InputStream) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) Extractor(org.codelibs.fess.crawler.extractor.Extractor) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException) UnsupportedEncodingException(java.io.UnsupportedEncodingException)

Aggregations

CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException)41 IOException (java.io.IOException)16 CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException)13 File (java.io.File)11 InputStream (java.io.InputStream)11 UnsupportedEncodingException (java.io.UnsupportedEncodingException)10 BufferedInputStream (java.io.BufferedInputStream)9 ExtractException (org.codelibs.fess.crawler.exception.ExtractException)9 ExtractData (org.codelibs.fess.crawler.entity.ExtractData)8 ResponseData (org.codelibs.fess.crawler.entity.ResponseData)8 Map (java.util.Map)7 MaxLengthExceededException (org.codelibs.fess.crawler.exception.MaxLengthExceededException)7 MalformedURLException (java.net.MalformedURLException)6 HashMap (java.util.HashMap)6 AccessResultDataImpl (org.codelibs.fess.crawler.entity.AccessResultDataImpl)6 RequestData (org.codelibs.fess.crawler.entity.RequestData)6 ResultData (org.codelibs.fess.crawler.entity.ResultData)6 ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException)6 HashSet (java.util.HashSet)5 TransformerException (javax.xml.transform.TransformerException)5