Search in sources:

Example 16 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess by codelibs.

The class FessXpathTransformer, method storeData.

@Override
protected void storeData(final ResponseData responseData, final ResultData resultData) {
    final DOMParser parser = getDomParser();
    try (final BufferedInputStream bis = new BufferedInputStream(responseData.getResponseBody())) {
        final byte[] bomBytes = new byte[UTF8_BOM_SIZE];
        bis.mark(UTF8_BOM_SIZE);
        final int size = bis.read(bomBytes);
        if (size < 3 || !isUtf8BomBytes(bomBytes)) {
            bis.reset();
        }
        final InputSource is = new InputSource(bis);
        if (responseData.getCharSet() != null) {
            is.setEncoding(responseData.getCharSet());
        }
        parser.parse(is);
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not parse " + responseData.getUrl(), e);
    }
    final Document document = parser.getDocument();
    processMetaRobots(responseData, resultData, document);
    processXRobotsTag(responseData, resultData);
    final Map<String, Object> dataMap = new LinkedHashMap<>();
    for (final Map.Entry<String, String> entry : fieldRuleMap.entrySet()) {
        final String path = entry.getValue();
        try {
            final XObject xObj = getXPathAPI().eval(document, path);
            final int type = xObj.getType();
            switch(type) {
                case XObject.CLASS_BOOLEAN:
                    final boolean b = xObj.bool();
                    putResultDataBody(dataMap, entry.getKey(), Boolean.toString(b));
                    break;
                case XObject.CLASS_NUMBER:
                    final double d = xObj.num();
                    putResultDataBody(dataMap, entry.getKey(), Double.toString(d));
                    break;
                case XObject.CLASS_STRING:
                    final String str = xObj.str();
                    putResultDataBody(dataMap, entry.getKey(), str);
                    break;
                case XObject.CLASS_NULL:
                case XObject.CLASS_UNKNOWN:
                case XObject.CLASS_NODESET:
                case XObject.CLASS_RTREEFRAG:
                case XObject.CLASS_UNRESOLVEDVARIABLE:
                default:
                    final Boolean isPruned = fieldPrunedRuleMap.get(entry.getKey());
                    Node value = getXPathAPI().selectSingleNode(document, entry.getValue());
                    if (value != null && isPruned != null && isPruned.booleanValue()) {
                        value = pruneNode(value);
                    }
                    putResultDataBody(dataMap, entry.getKey(), value != null ? value.getTextContent() : null);
                    break;
            }
        } catch (final TransformerException e) {
            logger.warn("Could not parse a value of {}:{}", entry.getKey(), entry.getValue(), e);
        }
    }
    putAdditionalData(dataMap, responseData, document);
    normalizeData(responseData, dataMap);
    try {
        resultData.setData(SerializeUtil.fromObjectToBinary(dataMap));
    } catch (final Exception e) {
        throw new CrawlingAccessException("Could not serialize object: " + responseData.getUrl(), e);
    }
    resultData.setEncoding(charsetName);
}
Also used: InputSource(org.xml.sax.InputSource), CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException), Node(org.w3c.dom.Node), Document(org.w3c.dom.Document), CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException), TransformerException(javax.xml.transform.TransformerException), MalformedURLException(java.net.MalformedURLException), ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException), LinkedHashMap(java.util.LinkedHashMap), BufferedInputStream(java.io.BufferedInputStream), XObject(org.apache.xpath.objects.XObject), DOMParser(org.codelibs.nekohtml.parsers.DOMParser), Map(java.util.Map), HashMap(java.util.HashMap), NamedNodeMap(org.w3c.dom.NamedNodeMap)
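
The mark/read/reset dance at the top of storeData is the standard way to strip a UTF-8 byte order mark before handing a stream to a parser. A minimal sketch of that idiom in isolation; the class and method names here are illustrative, not part of the fess codebase:

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;

public final class BomSkipper {

    private static final int UTF8_BOM_SIZE = 3;

    // Consumes a leading UTF-8 BOM (0xEF 0xBB 0xBF) if present; otherwise
    // rewinds the stream so the peeked bytes are replayed to the caller.
    public static InputStream skipUtf8Bom(final BufferedInputStream bis) throws IOException {
        final byte[] bom = new byte[UTF8_BOM_SIZE];
        bis.mark(UTF8_BOM_SIZE);
        final int size = bis.read(bom);
        if (size < UTF8_BOM_SIZE || !isUtf8Bom(bom)) {
            bis.reset(); // not a BOM: keep all bytes for the parser
        }
        return bis;
    }

    private static boolean isUtf8Bom(final byte[] b) {
        return b[0] == (byte) 0xEF && b[1] == (byte) 0xBB && b[2] == (byte) 0xBF;
    }
}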

Example 17 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess by codelibs.

The class DatabaseDataStoreImpl, method storeData.

@Override
protected void storeData(final DataConfig config, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
    final long readInterval = getReadInterval(paramMap);
    Connection con = null;
    Statement stmt = null;
    ResultSet rs = null;
    try {
        Class.forName(getDriverClass(paramMap));
        final String jdbcUrl = getUrl(paramMap);
        final String username = getUsername(paramMap);
        final String password = getPassword(paramMap);
        if (StringUtil.isNotEmpty(username)) {
            con = DriverManager.getConnection(jdbcUrl, username, password);
        } else {
            con = DriverManager.getConnection(jdbcUrl);
        }
        final String sql = getSql(paramMap);
        stmt = con.createStatement();
        // SQL generated by an administrator
        rs = stmt.executeQuery(sql);
        boolean loop = true;
        while (rs.next() && loop && alive) {
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.putAll(defaultDataMap);
            final Map<String, Object> crawlingContext = new HashMap<>();
            crawlingContext.put("doc", dataMap);
            for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                final Object convertValue = convertValue(entry.getValue(), new ResultSetParamMap(config, crawlingContext, rs, paramMap));
                if (convertValue != null) {
                    dataMap.put(entry.getKey(), convertValue);
                }
            }
            try {
                callback.store(paramMap, dataMap);
            } catch (final CrawlingAccessException e) {
                logger.warn("Crawling Access Exception at : " + dataMap, e);
                Throwable target = e;
                if (target instanceof MultipleCrawlingAccessException) {
                    final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                    if (causes.length > 0) {
                        target = causes[causes.length - 1];
                    }
                }
                String errorName;
                final Throwable cause = target.getCause();
                if (cause != null) {
                    errorName = cause.getClass().getCanonicalName();
                } else {
                    errorName = target.getClass().getCanonicalName();
                }
                String url;
                if (target instanceof DataStoreCrawlingException) {
                    final DataStoreCrawlingException dce = (DataStoreCrawlingException) target;
                    url = dce.getUrl();
                    if (dce.aborted()) {
                        loop = false;
                    }
                } else {
                    url = sql + ":" + rs.getRow();
                }
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(config, errorName, url, target);
            } catch (final Throwable t) {
                logger.warn("Crawling Access Exception at : " + dataMap, t);
                final String url = sql + ":" + rs.getRow();
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(config, t.getClass().getCanonicalName(), url, t);
            }
            if (readInterval > 0) {
                sleep(readInterval);
            }
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data in DB.", e);
    } finally {
        try {
            if (rs != null) {
                rs.close();
            }
        } catch (final SQLException e) {
            logger.warn("Failed to close a result set.", e);
        } finally {
            try {
                if (stmt != null) {
                    stmt.close();
                }
            } catch (final SQLException e) {
                logger.warn("Failed to close a statement.", e);
            } finally {
                try {
                    if (con != null) {
                        con.close();
                    }
                } catch (final SQLException e) {
                    logger.warn("Failed to close a db connection.", e);
                }
            }
        }
    }
}
Also used: MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException), DataStoreException(org.codelibs.fess.exception.DataStoreException), HashMap(java.util.HashMap), CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException), SQLException(java.sql.SQLException), Statement(java.sql.Statement), FailureUrlService(org.codelibs.fess.app.service.FailureUrlService), Connection(java.sql.Connection), DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException), FessSystemException(org.codelibs.fess.exception.FessSystemException), ResultSet(java.sql.ResultSet), Map(java.util.Map)
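
The triple-nested finally blocks above guarantee that the ResultSet, Statement, and Connection are each closed even if an earlier close throws. On Java 7+ the same guarantee comes from try-with-resources, and JDBC 4 drivers self-register, so the Class.forName call is only needed for very old drivers. A hedged sketch of the equivalent flow; the SQL and column index are placeholders:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;

public final class JdbcScanSketch {

    // Connects, runs one query, and iterates the rows; the three resources
    // are closed in reverse declaration order, even when an exception is thrown.
    public static void scan(final String jdbcUrl, final String sql) throws SQLException {
        try (Connection con = DriverManager.getConnection(jdbcUrl);
                Statement stmt = con.createStatement();
                ResultSet rs = stmt.executeQuery(sql)) {
            while (rs.next()) {
                // stand-in for building a dataMap and invoking the callback
                System.out.println(rs.getString(1));
            }
        }
    }
}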

Example 18 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess by codelibs.

The class EsDataStoreImpl, method processData.

protected void processData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap, final long readInterval, final Client client) {
    final boolean deleteProcessedDoc = paramMap.getOrDefault("delete.processed.doc", Constants.FALSE).equalsIgnoreCase(Constants.TRUE);
    final String[] indices;
    if (paramMap.containsKey(INDEX)) {
        indices = paramMap.get(INDEX).trim().split(",");
    } else {
        indices = new String[] { "_all" };
    }
    final String scroll = paramMap.containsKey(SCROLL) ? paramMap.get(SCROLL).trim() : "1m";
    final String timeout = paramMap.containsKey(TIMEOUT) ? paramMap.get(TIMEOUT).trim() : "1m";
    final SearchRequestBuilder builder = client.prepareSearch(indices);
    if (paramMap.containsKey(TYPE)) {
        builder.setTypes(paramMap.get(TYPE).trim().split(","));
    }
    if (paramMap.containsKey(SIZE)) {
        builder.setSize(Integer.parseInt(paramMap.get(SIZE)));
    }
    if (paramMap.containsKey(FIELDS)) {
        builder.setFetchSource(paramMap.get(FIELDS).trim().split(","), null);
    }
    builder.setQuery(QueryBuilders.wrapperQuery(paramMap.containsKey(QUERY) ? paramMap.get(QUERY).trim() : "{\"match_all\":{}}"));
    builder.setScroll(scroll);
    builder.setPreference(paramMap.containsKey(PREFERENCE) ? paramMap.get(PREFERENCE).trim() : Constants.SEARCH_PREFERENCE_PRIMARY);
    try {
        SearchResponse response = builder.execute().actionGet(timeout);
        String scrollId = response.getScrollId();
        while (scrollId != null) {
            final SearchHits searchHits = response.getHits();
            final SearchHit[] hits = searchHits.getHits();
            if (hits.length == 0) {
                scrollId = null;
                break;
            }
            boolean loop = true;
            final BulkRequestBuilder bulkRequest = deleteProcessedDoc ? client.prepareBulk() : null;
            for (final SearchHit hit : hits) {
                if (!alive || !loop) {
                    break;
                }
                final Map<String, Object> dataMap = new HashMap<>();
                dataMap.putAll(defaultDataMap);
                final Map<String, Object> resultMap = new LinkedHashMap<>();
                resultMap.putAll(paramMap);
                resultMap.put("index", hit.getIndex());
                resultMap.put("type", hit.getType());
                resultMap.put("id", hit.getId());
                resultMap.put("version", Long.valueOf(hit.getVersion()));
                resultMap.put("hit", hit);
                resultMap.put("source", hit.getSource());
                resultMap.put("crawlingConfig", dataConfig);
                if (logger.isDebugEnabled()) {
                    for (final Map.Entry<String, Object> entry : resultMap.entrySet()) {
                        logger.debug(entry.getKey() + "=" + entry.getValue());
                    }
                }
                final Map<String, Object> crawlingContext = new HashMap<>();
                crawlingContext.put("doc", dataMap);
                resultMap.put("crawlingContext", crawlingContext);
                for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                    final Object convertValue = convertValue(entry.getValue(), resultMap);
                    if (convertValue != null) {
                        dataMap.put(entry.getKey(), convertValue);
                    }
                }
                if (logger.isDebugEnabled()) {
                    for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
                        logger.debug(entry.getKey() + "=" + entry.getValue());
                    }
                }
                try {
                    callback.store(paramMap, dataMap);
                } catch (final CrawlingAccessException e) {
                    logger.warn("Crawling Access Exception at : " + dataMap, e);
                    Throwable target = e;
                    if (target instanceof MultipleCrawlingAccessException) {
                        final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                        if (causes.length > 0) {
                            target = causes[causes.length - 1];
                        }
                    }
                    String errorName;
                    final Throwable cause = target.getCause();
                    if (cause != null) {
                        errorName = cause.getClass().getCanonicalName();
                    } else {
                        errorName = target.getClass().getCanonicalName();
                    }
                    String url;
                    if (target instanceof DataStoreCrawlingException) {
                        final DataStoreCrawlingException dce = (DataStoreCrawlingException) target;
                        url = dce.getUrl();
                        if (dce.aborted()) {
                            loop = false;
                        }
                    } else {
                        url = hit.getIndex() + "/" + hit.getType() + "/" + hit.getId();
                    }
                    final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                    failureUrlService.store(dataConfig, errorName, url, target);
                } catch (final Throwable t) {
                    logger.warn("Crawling Access Exception at : " + dataMap, t);
                    final String url = hit.getIndex() + "/" + hit.getType() + "/" + hit.getId();
                    final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                    failureUrlService.store(dataConfig, t.getClass().getCanonicalName(), url, t);
                }
                if (bulkRequest != null) {
                    bulkRequest.add(client.prepareDelete(hit.getIndex(), hit.getType(), hit.getId()));
                }
                if (readInterval > 0) {
                    sleep(readInterval);
                }
            }
            if (bulkRequest != null && bulkRequest.numberOfActions() > 0) {
                final BulkResponse bulkResponse = bulkRequest.execute().actionGet(timeout);
                if (bulkResponse.hasFailures()) {
                    logger.warn(bulkResponse.buildFailureMessage());
                }
            }
            if (!alive) {
                break;
            }
            response = client.prepareSearchScroll(scrollId).setScroll(scroll).execute().actionGet(timeout);
            scrollId = response.getScrollId();
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data when acessing elasticsearch.", e);
    }
}
Also used: MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException), DataStoreException(org.codelibs.fess.exception.DataStoreException), SearchRequestBuilder(org.elasticsearch.action.search.SearchRequestBuilder), SearchHit(org.elasticsearch.search.SearchHit), HashMap(java.util.HashMap), LinkedHashMap(java.util.LinkedHashMap), CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException), FailureUrlService(org.codelibs.fess.app.service.FailureUrlService), BulkResponse(org.elasticsearch.action.bulk.BulkResponse), DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException), SearchResponse(org.elasticsearch.action.search.SearchResponse), SearchHits(org.elasticsearch.search.SearchHits), BulkRequestBuilder(org.elasticsearch.action.bulk.BulkRequestBuilder), Map(java.util.Map)
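
Stripped of the per-document bookkeeping, the control flow above is the classic Elasticsearch scroll loop: open a scroll with the first search, then keep fetching pages by scroll ID until a page comes back empty. A sketch against the same pre-7.x transport Client API used in this example; the query, index, and keep-alive values are placeholders:

import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;

public final class ScrollLoopSketch {

    // Drains every document of an index via the scroll API, e.g. scroll = "1m".
    public static void drain(final Client client, final String index, final String scroll) {
        SearchResponse response = client.prepareSearch(index)
                .setScroll(scroll)
                .setQuery(QueryBuilders.matchAllQuery())
                .execute().actionGet();
        String scrollId = response.getScrollId();
        while (scrollId != null) {
            final SearchHit[] hits = response.getHits().getHits();
            if (hits.length == 0) {
                break; // an empty page means the scroll is exhausted
            }
            for (final SearchHit hit : hits) {
                System.out.println(hit.getId()); // stand-in for per-document processing
            }
            response = client.prepareSearchScroll(scrollId).setScroll(scroll).execute().actionGet();
            scrollId = response.getScrollId();
        }
    }
}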

Example 19 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess by codelibs.

The class AbstractFessFileTransformer, method generateData.

protected Map<String, Object> generateData(final ResponseData responseData) {
    final Extractor extractor = getExtractor(responseData);
    final Map<String, String> params = new HashMap<>();
    params.put(TikaMetadataKeys.RESOURCE_NAME_KEY, getResourceName(responseData));
    final String mimeType = responseData.getMimeType();
    params.put(HttpHeaders.CONTENT_TYPE, mimeType);
    params.put(HttpHeaders.CONTENT_ENCODING, responseData.getCharSet());
    final StringBuilder contentMetaBuf = new StringBuilder(1000);
    final Map<String, Object> dataMap = new HashMap<>();
    final Map<String, Object> metaDataMap = new HashMap<>();
    String content;
    try (final InputStream in = responseData.getResponseBody()) {
        final ExtractData extractData = getExtractData(extractor, in, params);
        content = extractData.getContent();
        if (fessConfig.isCrawlerDocumentFileIgnoreEmptyContent() && StringUtil.isBlank(content)) {
            return null;
        }
        if (getLogger().isDebugEnabled()) {
            getLogger().debug("ExtractData: " + extractData);
        }
        // metadata
        extractData.getKeySet().stream()
                .filter(k -> extractData.getValues(k) != null)
                .forEach(key -> {
            final String[] values = extractData.getValues(key);
            metaDataMap.put(key, values);
            if (fessConfig.isCrawlerMetadataContentIncluded(key)) {
                final String joinedValue = StringUtils.join(values, ' ');
                if (StringUtil.isNotBlank(joinedValue)) {
                    if (contentMetaBuf.length() > 0) {
                        contentMetaBuf.append(' ');
                    }
                    contentMetaBuf.append(joinedValue.trim());
                }
            }
            final Pair<String, String> mapping = fessConfig.getCrawlerMetadataNameMapping(key);
            if (mapping != null) {
                if (Constants.MAPPING_TYPE_ARRAY.equalsIgnoreCase(mapping.getSecond())) {
                    dataMap.put(mapping.getFirst(), values);
                } else if (Constants.MAPPING_TYPE_STRING.equalsIgnoreCase(mapping.getSecond())) {
                    final String joinedValue = StringUtils.join(values, ' ');
                    dataMap.put(mapping.getFirst(), joinedValue.trim());
                } else if (values.length == 1) {
                    try {
                        if (Constants.MAPPING_TYPE_LONG.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), Long.parseLong(values[0]));
                        } else if (Constants.MAPPING_TYPE_DOUBLE.equalsIgnoreCase(mapping.getSecond())) {
                            dataMap.put(mapping.getFirst(), Double.parseDouble(values[0]));
                        } else {
                            logger.warn("Unknown mapping type: {}={}", key, mapping);
                        }
                    } catch (final NumberFormatException e) {
                        logger.warn("Failed to parse " + values[0], e);
                    }
                }
            }
        });
    } catch (final Exception e) {
        final CrawlingAccessException rcae = new CrawlingAccessException("Could not get a text from " + responseData.getUrl(), e);
        rcae.setLogLevel(CrawlingAccessException.WARN);
        throw rcae;
    }
    if (content == null) {
        content = StringUtil.EMPTY;
    }
    final String contentMeta = contentMetaBuf.toString().trim();
    final FessConfig fessConfig = ComponentUtil.getFessConfig();
    final CrawlingInfoHelper crawlingInfoHelper = ComponentUtil.getCrawlingInfoHelper();
    final String sessionId = crawlingInfoHelper.getCanonicalSessionId(responseData.getSessionId());
    final PathMappingHelper pathMappingHelper = ComponentUtil.getPathMappingHelper();
    final CrawlingConfigHelper crawlingConfigHelper = ComponentUtil.getCrawlingConfigHelper();
    final CrawlingConfig crawlingConfig = crawlingConfigHelper.get(responseData.getSessionId());
    final Date documentExpires = crawlingInfoHelper.getDocumentExpires(crawlingConfig);
    final SystemHelper systemHelper = ComponentUtil.getSystemHelper();
    final FileTypeHelper fileTypeHelper = ComponentUtil.getFileTypeHelper();
    final DocumentHelper documentHelper = ComponentUtil.getDocumentHelper();
    String url = responseData.getUrl();
    final String indexingTarget = crawlingConfig.getIndexingTarget(url);
    url = pathMappingHelper.replaceUrl(sessionId, url);
    final Map<String, String> fieldConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.FIELD);
    String urlEncoding;
    final UrlQueue<?> urlQueue = CrawlingParameterUtil.getUrlQueue();
    if (urlQueue != null && urlQueue.getEncoding() != null) {
        urlEncoding = urlQueue.getEncoding();
    } else {
        urlEncoding = responseData.getCharSet();
    }
    // cid
    final String configId = crawlingConfig.getConfigId();
    if (configId != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldConfigId(), configId);
    }
    //  expires
    if (documentExpires != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldExpires(), documentExpires);
    }
    // segment
    putResultDataBody(dataMap, fessConfig.getIndexFieldSegment(), sessionId);
    // content
    final StringBuilder buf = new StringBuilder(content.length() + 1000);
    if (fessConfig.isCrawlerDocumentFileAppendBodyContent()) {
        buf.append(content);
    }
    if (fessConfig.isCrawlerDocumentFileAppendMetaContent()) {
        if (buf.length() > 0) {
            buf.append(' ');
        }
        buf.append(contentMeta);
    }
    final String bodyBase = buf.toString().trim();
    final String body = documentHelper.getContent(responseData, bodyBase, dataMap);
    putResultDataBody(dataMap, fessConfig.getIndexFieldContent(), body);
    if ((Constants.TRUE.equalsIgnoreCase(fieldConfigMap.get(fessConfig.getIndexFieldCache())) || fessConfig.isCrawlerDocumentCacheEnabled()) && fessConfig.isSupportedDocumentCacheMimetypes(mimeType)) {
        if (responseData.getContentLength() > 0 && responseData.getContentLength() <= fessConfig.getCrawlerDocumentCacheMaxSizeAsInteger().longValue()) {
            final String cache = content.trim().replaceAll("[ \\t\\x0B\\f]+", " ");
            // text cache
            putResultDataBody(dataMap, fessConfig.getIndexFieldCache(), cache);
            putResultDataBody(dataMap, fessConfig.getIndexFieldHasCache(), Constants.TRUE);
        }
    }
    // digest
    putResultDataBody(dataMap, fessConfig.getIndexFieldDigest(), documentHelper.getDigest(responseData, bodyBase, dataMap, fessConfig.getCrawlerDocumentFileMaxDigestLengthAsInteger()));
    // title
    final String fileName = getFileName(url, urlEncoding);
    if (!dataMap.containsKey(fessConfig.getIndexFieldTitle())) {
        if (url.endsWith("/")) {
            if (StringUtil.isNotBlank(content)) {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), documentHelper.getDigest(responseData, body, dataMap, fessConfig.getCrawlerDocumentFileMaxTitleLengthAsInteger()));
            } else {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fessConfig.getCrawlerDocumentFileNoTitleLabel());
            }
        } else {
            if (StringUtil.isBlank(fileName)) {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), decodeUrlAsName(url, url.startsWith("file:")));
            } else {
                putResultDataBody(dataMap, fessConfig.getIndexFieldTitle(), fileName);
            }
        }
    }
    // host
    putResultDataBody(dataMap, fessConfig.getIndexFieldHost(), getHostOnFile(url));
    // site
    putResultDataBody(dataMap, fessConfig.getIndexFieldSite(), getSiteOnFile(url, urlEncoding));
    // filename
    if (StringUtil.isNotBlank(fileName)) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldFilename(), fileName);
    }
    // url
    putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    // created
    final Date now = systemHelper.getCurrentTime();
    putResultDataBody(dataMap, fessConfig.getIndexFieldCreated(), now);
    // TODO anchor
    putResultDataBody(dataMap, fessConfig.getIndexFieldAnchor(), StringUtil.EMPTY);
    // mimetype
    putResultDataBody(dataMap, fessConfig.getIndexFieldMimetype(), mimeType);
    if (fileTypeHelper != null) {
        // filetype
        putResultDataBody(dataMap, fessConfig.getIndexFieldFiletype(), fileTypeHelper.get(mimeType));
    }
    // content_length
    putResultDataBody(dataMap, fessConfig.getIndexFieldContentLength(), Long.toString(responseData.getContentLength()));
    // last_modified
    final Date lastModified = responseData.getLastModified();
    if (lastModified != null) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLastModified(), lastModified);
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), lastModified);
    } else {
        // timestamp
        putResultDataBody(dataMap, fessConfig.getIndexFieldTimestamp(), now);
    }
    // indexingTarget
    putResultDataBody(dataMap, Constants.INDEXING_TARGET, indexingTarget);
    //  boost
    putResultDataBody(dataMap, fessConfig.getIndexFieldBoost(), crawlingConfig.getDocumentBoost());
    // label: labelType
    final Set<String> labelTypeSet = new HashSet<>();
    for (final String labelType : crawlingConfig.getLabelTypeValues()) {
        labelTypeSet.add(labelType);
    }
    final LabelTypeHelper labelTypeHelper = ComponentUtil.getLabelTypeHelper();
    labelTypeSet.addAll(labelTypeHelper.getMatchedLabelValueSet(url));
    putResultDataBody(dataMap, fessConfig.getIndexFieldLabel(), labelTypeSet);
    // role: roleType
    final List<String> roleTypeList = getRoleTypes(responseData);
    stream(crawlingConfig.getPermissions()).of(stream -> stream.forEach(p -> roleTypeList.add(p)));
    putResultDataBody(dataMap, fessConfig.getIndexFieldRole(), roleTypeList);
    // lang
    if (StringUtil.isNotBlank(fessConfig.getCrawlerDocumentFileDefaultLang())) {
        putResultDataBody(dataMap, fessConfig.getIndexFieldLang(), fessConfig.getCrawlerDocumentFileDefaultLang());
    }
    // id
    putResultDataBody(dataMap, fessConfig.getIndexFieldId(), crawlingInfoHelper.generateId(dataMap));
    // parentId
    String parentUrl = responseData.getParentUrl();
    if (StringUtil.isNotBlank(parentUrl)) {
        parentUrl = pathMappingHelper.replaceUrl(sessionId, parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), parentUrl);
        putResultDataBody(dataMap, fessConfig.getIndexFieldParentId(), crawlingInfoHelper.generateId(dataMap));
        // set again
        putResultDataBody(dataMap, fessConfig.getIndexFieldUrl(), url);
    }
    // from config
    final Map<String, String> scriptConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.SCRIPT);
    final Map<String, String> metaConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.META);
    for (final Map.Entry<String, String> entry : metaConfigMap.entrySet()) {
        final String key = entry.getKey();
        final String[] values = entry.getValue().split(",");
        for (final String value : values) {
            putResultDataWithTemplate(dataMap, key, metaDataMap.get(value), scriptConfigMap.get(key));
        }
    }
    final Map<String, String> valueConfigMap = crawlingConfig.getConfigParameterMap(ConfigName.VALUE);
    for (final Map.Entry<String, String> entry : valueConfigMap.entrySet()) {
        final String key = entry.getKey();
        putResultDataWithTemplate(dataMap, key, entry.getValue(), scriptConfigMap.get(key));
    }
    return dataMap;
}
Also used: CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException), Constants(org.codelibs.fess.Constants), CrawlingConfigHelper(org.codelibs.fess.helper.CrawlingConfigHelper), URLDecoder(java.net.URLDecoder), Date(java.util.Date), LoggerFactory(org.slf4j.LoggerFactory), Pair(org.codelibs.core.misc.Pair), HashMap(java.util.HashMap), TikaMetadataKeys(org.apache.tika.metadata.TikaMetadataKeys), SerializeUtil(org.codelibs.core.io.SerializeUtil), CrawlingParameterUtil(org.codelibs.fess.crawler.util.CrawlingParameterUtil), CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException), StringUtils(org.apache.commons.lang3.StringUtils), CrawlingConfig(org.codelibs.fess.es.config.exentity.CrawlingConfig), ConfigName(org.codelibs.fess.es.config.exentity.CrawlingConfig.ConfigName), ArrayList(java.util.ArrayList), HashSet(java.util.HashSet), FessConfig(org.codelibs.fess.mylasta.direction.FessConfig), ExtractData(org.codelibs.fess.crawler.entity.ExtractData), Map(java.util.Map), AccessResultData(org.codelibs.fess.crawler.entity.AccessResultData), AbstractTransformer(org.codelibs.fess.crawler.transformer.impl.AbstractTransformer), SambaHelper(org.codelibs.fess.helper.SambaHelper), StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream), Logger(org.slf4j.Logger), ResultData(org.codelibs.fess.crawler.entity.ResultData), Extractor(org.codelibs.fess.crawler.extractor.Extractor), StringUtil(org.codelibs.core.lang.StringUtil), SID(jcifs.smb.SID), Set(java.util.Set), List(java.util.List), ACE(jcifs.smb.ACE), FileTypeHelper(org.codelibs.fess.helper.FileTypeHelper), ComponentUtil(org.codelibs.fess.util.ComponentUtil), CrawlingInfoHelper(org.codelibs.fess.helper.CrawlingInfoHelper), SystemHelper(org.codelibs.fess.helper.SystemHelper), LabelTypeHelper(org.codelibs.fess.helper.LabelTypeHelper), PathMappingHelper(org.codelibs.fess.helper.PathMappingHelper), DocumentHelper(org.codelibs.fess.helper.DocumentHelper), HttpHeaders(org.apache.tika.metadata.HttpHeaders), SmbClient(org.codelibs.fess.crawler.client.smb.SmbClient), UrlQueue(org.codelibs.fess.crawler.entity.UrlQueue), InputStream(java.io.InputStream), ResponseData(org.codelibs.fess.crawler.entity.ResponseData)
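
The metadata-mapping branch in the middle of generateData is worth isolating: each extracted metadata key can be mapped to an index field with a declared type, and a multi-valued entry is kept as an array, joined into one string, or parsed as a number. A simplified sketch of that rule, with literal type strings standing in for the Constants.MAPPING_TYPE_* values and illustrative names throughout:

import java.util.HashMap;
import java.util.Map;

public final class MetadataMappingSketch {

    // Applies one field mapping: "array" keeps all values, "string" joins
    // them, and "long"/"double" parse a single value into a number.
    public static void applyMapping(final Map<String, Object> dataMap,
            final String field, final String type, final String[] values) {
        if ("array".equalsIgnoreCase(type)) {
            dataMap.put(field, values);
        } else if ("string".equalsIgnoreCase(type)) {
            dataMap.put(field, String.join(" ", values).trim());
        } else if (values.length == 1 && "long".equalsIgnoreCase(type)) {
            dataMap.put(field, Long.parseLong(values[0]));
        } else if (values.length == 1 && "double".equalsIgnoreCase(type)) {
            dataMap.put(field, Double.parseDouble(values[0]));
        }
    }

    public static void main(final String[] args) {
        final Map<String, Object> dataMap = new HashMap<>();
        applyMapping(dataMap, "page_count", "long", new String[] { "42" });
        System.out.println(dataMap); // {page_count=42}
    }
}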

Example 20 with CrawlingAccessException

Use of org.codelibs.fess.crawler.exception.CrawlingAccessException in project fess-crawler by codelibs.

The class FtpClient, method updateResponseData.

protected void updateResponseData(final String uri, final boolean includeContent, final ResponseData responseData, FTPClient client, final FtpInfo ftpInfo, FTPFile file) {
    if (file == null) {
        responseData.setHttpStatusCode(Constants.NOT_FOUND_STATUS_CODE);
        responseData.setCharSet(charset);
        responseData.setContentLength(0);
        ftpClientQueue.offer(client);
        return;
    }
    if (file.isSymbolicLink()) {
        final String link = file.getLink();
        String redirect = null;
        if (link == null) {
            responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
            ftpClientQueue.offer(client);
            return;
        } else if (link.startsWith("/")) {
            redirect = ftpInfo.toUrl(file.getLink());
        } else if (link.startsWith("../")) {
            redirect = ftpInfo.toChildUrl(file.getLink());
        } else {
            redirect = ftpInfo.toChildUrl("../" + file.getLink());
        }
        if (!uri.equals(redirect)) {
            responseData.setHttpStatusCode(Constants.OK_STATUS);
            responseData.setCharSet(charset);
            responseData.setContentLength(0);
            responseData.setRedirectLocation(redirect);
            ftpClientQueue.offer(client);
            return;
        }
    }
    if (file.isFile()) {
        responseData.setHttpStatusCode(Constants.OK_STATUS_CODE);
        responseData.setCharSet(Constants.UTF_8);
        responseData.setLastModified(file.getTimestamp().getTime());
        // check file size
        responseData.setContentLength(file.getSize());
        checkMaxContentLength(responseData);
        if (file.getUser() != null) {
            responseData.addMetaData(FTP_FILE_USER, file.getUser());
        }
        if (file.getGroup() != null) {
            responseData.addMetaData(FTP_FILE_GROUP, file.getGroup());
        }
        if (includeContent) {
            File tempFile = null;
            File outputFile = null;
            try {
                tempFile = File.createTempFile("ftp-", ".tmp");
                try (OutputStream out = new BufferedOutputStream(new FileOutputStream(tempFile))) {
                    if (!client.retrieveFile(ftpInfo.getName(), out)) {
                        throw new CrawlingAccessException("Failed to retrieve: " + ftpInfo.toUrl());
                    }
                }
                final MimeTypeHelper mimeTypeHelper = crawlerContainer.getComponent("mimeTypeHelper");
                try (InputStream is = new FileInputStream(tempFile)) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(is, file.getName()));
                } catch (final Exception e) {
                    responseData.setMimeType(mimeTypeHelper.getContentType(null, file.getName()));
                }
                if (contentLengthHelper != null) {
                    final long maxLength = contentLengthHelper.getMaxLength(responseData.getMimeType());
                    if (responseData.getContentLength() > maxLength) {
                        throw new MaxLengthExceededException("The content length (" + responseData.getContentLength() + " byte) is over " + maxLength + " byte. The url is " + uri);
                    }
                }
                responseData.setCharSet(geCharSet(tempFile));
                if (tempFile.length() < maxCachedContentSize) {
                    try (InputStream contentStream = new BufferedInputStream(new FileInputStream(tempFile))) {
                        responseData.setResponseBody(InputStreamUtil.getBytes(contentStream));
                    }
                } else {
                    outputFile = File.createTempFile("crawler-FtpClient-", ".out");
                    CopyUtil.copy(tempFile, outputFile);
                    responseData.setResponseBody(outputFile, true);
                }
                ftpClientQueue.offer(client);
            } catch (final CrawlingAccessException e) {
                ftpClientQueue.offer(client);
                throw e;
            } catch (final Exception e) {
                logger.warn("I/O Exception.", e);
                disconnectInternalClient(client);
                responseData.setHttpStatusCode(Constants.SERVER_ERROR_STATUS_CODE);
            } finally {
                if (tempFile != null && !tempFile.delete()) {
                    logger.warn("Could not delete " + tempFile.getAbsolutePath());
                }
            }
        }
    } else if (file.isDirectory() || file.isSymbolicLink()) {
        final Set<RequestData> requestDataSet = new HashSet<>();
        if (includeContent) {
            try {
                final FTPFile[] ftpFiles = client.listFiles(ftpInfo.getName(), FTPFileFilters.NON_NULL);
                validateRequest(client);
                for (final FTPFile f : ftpFiles) {
                    final String childUri = ftpInfo.toChildUrl(f.getName());
                    requestDataSet.add(RequestDataBuilder.newRequestData().get().url(childUri).build());
                }
            } catch (final IOException e) {
                disconnectInternalClient(client);
                throw new CrawlingAccessException("Could not access " + uri, e);
            }
        }
        ftpClientQueue.offer(client);
        throw new ChildUrlsException(requestDataSet, this.getClass().getName() + "#getResponseData");
    } else {
        responseData.setHttpStatusCode(Constants.BAD_REQUEST_STATUS_CODE);
        responseData.setCharSet(charset);
        responseData.setContentLength(0);
        ftpClientQueue.offer(client);
    }
}
Also used: ChildUrlsException(org.codelibs.fess.crawler.exception.ChildUrlsException), HashSet(java.util.HashSet), Set(java.util.Set), CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException), MaxLengthExceededException(org.codelibs.fess.crawler.exception.MaxLengthExceededException), MimeTypeHelper(org.codelibs.fess.crawler.helper.MimeTypeHelper), BufferedInputStream(java.io.BufferedInputStream), FileInputStream(java.io.FileInputStream), InputStream(java.io.InputStream), BufferedOutputStream(java.io.BufferedOutputStream), OutputStream(java.io.OutputStream), FileOutputStream(java.io.FileOutputStream), FTPFile(org.apache.commons.net.ftp.FTPFile), IOException(java.io.IOException), CrawlerLoginFailureException(org.codelibs.fess.crawler.exception.CrawlerLoginFailureException), CrawlerSystemException(org.codelibs.fess.crawler.exception.CrawlerSystemException), MalformedURLException(java.net.MalformedURLException), File(java.io.File)
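
Two patterns in updateResponseData deserve a note. Symbolic links are resolved to a redirect location rather than followed in place, and directories produce no document at all: the client throws a ChildUrlsException carrying one RequestData per directory entry, which the crawler catches and enqueues. A minimal sketch of that second pattern with simplified stand-in types; the real ChildUrlsException lives in org.codelibs.fess.crawler.exception:

import java.util.LinkedHashSet;
import java.util.Set;

public final class ChildUrlsSketch {

    // Simplified stand-in for the crawler's ChildUrlsException: it is a
    // control-flow signal, not an error, carrying the URLs to crawl next.
    static class ChildUrlsException extends RuntimeException {
        final Set<String> childUrls;

        ChildUrlsException(final Set<String> childUrls, final String where) {
            super(where);
            this.childUrls = childUrls;
        }
    }

    // Listing a "directory" yields no document body; instead we throw the
    // child URLs so the caller can enqueue them.
    public static void listDirectory(final String baseUrl, final String[] names) {
        final Set<String> children = new LinkedHashSet<>();
        for (final String name : names) {
            children.add(baseUrl + "/" + name); // one crawl request per entry
        }
        throw new ChildUrlsException(children, ChildUrlsSketch.class.getName() + "#listDirectory");
    }
}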

Aggregations

CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException): 36
CrawlerSystemException (org.codelibs.fess.crawler.exception.CrawlerSystemException): 14
InputStream (java.io.InputStream): 13
Map (java.util.Map): 9
IOException (java.io.IOException): 8
ResponseData (org.codelibs.fess.crawler.entity.ResponseData): 8
BufferedInputStream (java.io.BufferedInputStream): 7
HashMap (java.util.HashMap): 7
HashSet (java.util.HashSet): 7
ResultData (org.codelibs.fess.crawler.entity.ResultData): 7
ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException): 7
MalformedURLException (java.net.MalformedURLException): 6
AccessResultData (org.codelibs.fess.crawler.entity.AccessResultData): 6
MaxLengthExceededException (org.codelibs.fess.crawler.exception.MaxLengthExceededException): 6
ByteArrayInputStream (java.io.ByteArrayInputStream): 5
File (java.io.File): 5
LinkedHashMap (java.util.LinkedHashMap): 5
FileInputStream (java.io.FileInputStream): 4
UnsupportedEncodingException (java.io.UnsupportedEncodingException): 4
Date (java.util.Date): 4