Search in sources :

Example 6 with DataStoreCrawlingException

use of org.codelibs.fess.exception.DataStoreCrawlingException in project fess by codelibs.

the class EsDataStoreImpl method processData.

/**
 * Reads documents from Elasticsearch through a scroll search and passes each hit to
 * {@code callback}.
 *
 * <p>Pages are fetched until a page has no hits, no scroll id is returned, the enclosing
 * instance's {@code alive} flag becomes false, or a {@link DataStoreCrawlingException}
 * reports {@code aborted()}. When the {@code delete.processed.doc} parameter equals
 * {@code Constants.TRUE} (case-insensitive), every hit handed to the callback is deleted
 * from its index with one bulk request per page. The scroll context is cleared before
 * the method returns, whether it ends normally or with an exception.</p>
 *
 * @param dataConfig     crawling configuration, exposed to scripts and used for failure records
 * @param callback       receiver of the converted documents
 * @param paramMap       data store parameters (index, type, size, fields, query, scroll, timeout, preference)
 * @param scriptMap      target field name to script expression, evaluated through {@code convertValue}
 * @param defaultDataMap values copied into every document before the scripts run
 * @param readInterval   value passed to {@code sleep} after each hit when greater than zero
 * @param client         Elasticsearch client used for search, scroll, bulk delete and clear-scroll requests
 * @throws DataStoreException if searching, scrolling or the bulk delete fails
 */
protected void processData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap, final long readInterval, final Client client) {
    final boolean deleteProcessedDoc = paramMap.getOrDefault("delete.processed.doc", Constants.FALSE).equalsIgnoreCase(Constants.TRUE);
    final String scroll = paramMap.containsKey(SCROLL) ? paramMap.get(SCROLL).trim() : "1m";
    final String timeout = paramMap.containsKey(TIMEOUT) ? paramMap.get(TIMEOUT).trim() : "1m";
    final SearchRequestBuilder builder = buildSearchRequest(paramMap, scroll, client);
    // Declared outside the try block so that the finally clause can release the scroll context.
    String scrollId = null;
    try {
        SearchResponse response = builder.execute().actionGet(timeout);
        scrollId = response.getScrollId();
        // Shared by all pages: once a store call reports an abort, no further page is fetched.
        boolean loop = true;
        while (scrollId != null) {
            final SearchHit[] hits = response.getHits().getHits();
            if (hits.length == 0) {
                // Keep scrollId so the finally clause can still clear the context.
                break;
            }
            final BulkRequestBuilder bulkRequest = deleteProcessedDoc ? client.prepareBulk() : null;
            for (final SearchHit hit : hits) {
                if (!alive || !loop) {
                    break;
                }
                loop = storeHit(dataConfig, callback, paramMap, scriptMap, defaultDataMap, hit);
                if (bulkRequest != null) {
                    bulkRequest.add(client.prepareDelete(hit.getIndex(), hit.getType(), hit.getId()));
                }
                if (readInterval > 0) {
                    sleep(readInterval);
                }
            }
            if (bulkRequest != null && bulkRequest.numberOfActions() > 0) {
                final BulkResponse bulkResponse = bulkRequest.execute().actionGet(timeout);
                if (bulkResponse.hasFailures()) {
                    logger.warn(bulkResponse.buildFailureMessage());
                }
            }
            if (!alive || !loop) {
                break;
            }
            response = client.prepareSearchScroll(scrollId).setScroll(scroll).execute().actionGet(timeout);
            scrollId = response.getScrollId();
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data when acessing elasticsearch.", e);
    } finally {
        clearScroll(client, scrollId, timeout);
    }
}

/** Builds the initial scroll search request from the data store parameters. */
private SearchRequestBuilder buildSearchRequest(final Map<String, String> paramMap, final String scroll, final Client client) {
    final String[] indices;
    if (paramMap.containsKey(INDEX)) {
        indices = paramMap.get(INDEX).trim().split(",");
    } else {
        indices = new String[] { "_all" };
    }
    final SearchRequestBuilder builder = client.prepareSearch(indices);
    if (paramMap.containsKey(TYPE)) {
        builder.setTypes(paramMap.get(TYPE).trim().split(","));
    }
    if (paramMap.containsKey(SIZE)) {
        // Trimmed like every other parameter so surrounding whitespace does not break parsing.
        builder.setSize(Integer.parseInt(paramMap.get(SIZE).trim()));
    }
    if (paramMap.containsKey(FIELDS)) {
        builder.setFetchSource(paramMap.get(FIELDS).trim().split(","), null);
    }
    builder.setQuery(QueryBuilders.wrapperQuery(paramMap.containsKey(QUERY) ? paramMap.get(QUERY).trim() : "{\"match_all\":{}}"));
    builder.setScroll(scroll);
    builder.setPreference(paramMap.containsKey(PREFERENCE) ? paramMap.get(PREFERENCE).trim() : Constants.SEARCH_PREFERENCE_PRIMARY);
    return builder;
}

/**
 * Converts one hit into a document and stores it through the callback; failures are logged and
 * recorded via {@code FailureUrlService}.
 *
 * @return {@code false} when the store call failed with an aborted
 *         {@link DataStoreCrawlingException}, {@code true} otherwise
 */
private boolean storeHit(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap, final SearchHit hit) {
    final Map<String, Object> dataMap = new HashMap<>(defaultDataMap);
    // Variables made available to the script expressions.
    final Map<String, Object> resultMap = new LinkedHashMap<>(paramMap);
    resultMap.put("index", hit.getIndex());
    resultMap.put("type", hit.getType());
    resultMap.put("id", hit.getId());
    resultMap.put("version", Long.valueOf(hit.getVersion()));
    resultMap.put("hit", hit);
    resultMap.put("source", hit.getSource());
    resultMap.put("crawlingConfig", dataConfig);
    if (logger.isDebugEnabled()) {
        for (final Map.Entry<String, Object> entry : resultMap.entrySet()) {
            logger.debug(entry.getKey() + "=" + entry.getValue());
        }
    }
    final Map<String, Object> crawlingContext = new HashMap<>();
    crawlingContext.put("doc", dataMap);
    resultMap.put("crawlingContext", crawlingContext);
    for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
        final Object convertValue = convertValue(entry.getValue(), resultMap);
        if (convertValue != null) {
            dataMap.put(entry.getKey(), convertValue);
        }
    }
    if (logger.isDebugEnabled()) {
        for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
            logger.debug(entry.getKey() + "=" + entry.getValue());
        }
    }
    try {
        callback.store(paramMap, dataMap);
    } catch (final CrawlingAccessException e) {
        logger.warn("Crawling Access Exception at : " + dataMap, e);
        return !recordAccessFailure(dataConfig, hit, e);
    } catch (final Throwable t) {
        logger.warn("Crawling Access Exception at : " + dataMap, t);
        final String url = hit.getIndex() + "/" + hit.getType() + "/" + hit.getId();
        final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
        failureUrlService.store(dataConfig, t.getClass().getCanonicalName(), url, t);
    }
    return true;
}

/**
 * Records a crawling access failure for the given hit.
 *
 * @return {@code true} when the failure is a {@link DataStoreCrawlingException} whose
 *         {@code aborted()} is true
 */
private boolean recordAccessFailure(final DataConfig dataConfig, final SearchHit hit, final CrawlingAccessException e) {
    Throwable target = e;
    if (target instanceof MultipleCrawlingAccessException) {
        // Report the last of the collected causes, if there is any.
        final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
        if (causes.length > 0) {
            target = causes[causes.length - 1];
        }
    }
    final Throwable cause = target.getCause();
    final String errorName = cause != null ? cause.getClass().getCanonicalName() : target.getClass().getCanonicalName();
    boolean aborted = false;
    final String url;
    if (target instanceof DataStoreCrawlingException) {
        final DataStoreCrawlingException dce = (DataStoreCrawlingException) target;
        url = dce.getUrl();
        aborted = dce.aborted();
    } else {
        url = hit.getIndex() + "/" + hit.getType() + "/" + hit.getId();
    }
    final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
    failureUrlService.store(dataConfig, errorName, url, target);
    return aborted;
}

/** Releases the scroll context; best effort, so a failure here is only logged. */
private void clearScroll(final Client client, final String scrollId, final String timeout) {
    if (scrollId == null) {
        return;
    }
    try {
        client.prepareClearScroll().addScrollId(scrollId).execute().actionGet(timeout);
    } catch (final Exception e) {
        logger.warn("Failed to clear scroll: " + scrollId, e);
    }
}
Also used : MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) DataStoreException(org.codelibs.fess.exception.DataStoreException) SearchRequestBuilder(org.elasticsearch.action.search.SearchRequestBuilder) SearchHit(org.elasticsearch.search.SearchHit) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) BulkResponse(org.elasticsearch.action.bulk.BulkResponse) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) DataStoreException(org.codelibs.fess.exception.DataStoreException) SearchResponse(org.elasticsearch.action.search.SearchResponse) LinkedHashMap(java.util.LinkedHashMap) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) SearchHits(org.elasticsearch.search.SearchHits) BulkRequestBuilder(org.elasticsearch.action.bulk.BulkRequestBuilder) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Aggregations

Map (java.util.Map)6 DataStoreCrawlingException (org.codelibs.fess.exception.DataStoreCrawlingException)6 HashMap (java.util.HashMap)3 FailureUrlService (org.codelibs.fess.app.service.FailureUrlService)3 CrawlerClient (org.codelibs.fess.crawler.client.CrawlerClient)3 ChildUrlsException (org.codelibs.fess.crawler.exception.ChildUrlsException)3 FessConfig (org.codelibs.fess.mylasta.direction.FessConfig)3 ArrayList (java.util.ArrayList)2 LinkedHashMap (java.util.LinkedHashMap)2 LinkedList (java.util.LinkedList)2 List (java.util.List)2 ExecutorService (java.util.concurrent.ExecutorService)2 LinkedBlockingQueue (java.util.concurrent.LinkedBlockingQueue)2 ThreadPoolExecutor (java.util.concurrent.ThreadPoolExecutor)2 TimeUnit (java.util.concurrent.TimeUnit)2 Collectors (java.util.stream.Collectors)2 SerializeUtil (org.codelibs.core.io.SerializeUtil)2 StreamUtil.stream (org.codelibs.core.stream.StreamUtil.stream)2 Constants (org.codelibs.fess.Constants)2 RequestDataBuilder (org.codelibs.fess.crawler.builder.RequestDataBuilder)2