Search in sources :

Example 1 with MultipleCrawlingAccessException

use of org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException in project fess by codelibs.

the class CsvDataStoreImpl method processCsv.

/**
 * Reads one CSV file record by record and passes a document built from each record to
 * the index callback.
 *
 * <p>For every record a parameter map is assembled from {@code paramMap}, the file path
 * and name, the crawling config and the cell values. Each cell is exposed under
 * {@code CELL_PREFIX + (1-based column index)} and, when a header line is present and the
 * header for that column is not blank, under the header name as well. The entries of
 * {@code scriptMap} are then evaluated against that map via {@code convertValue} and the
 * non-null results are put into the document, on top of {@code defaultDataMap}.</p>
 *
 * <p>Records whose values are all blank are skipped. Failures raised by the callback are
 * logged and recorded through {@link FailureUrlService}; they do not stop the loop unless
 * the failure is a {@link DataStoreCrawlingException} reporting {@code aborted()}.</p>
 *
 * @param dataConfig      crawling configuration, exposed to scripts as {@code crawlingConfig}
 *                        and used when recording failures
 * @param callback        receives {@code paramMap} and the document for each record
 * @param paramMap        data store parameters, copied into the per-record parameter map
 * @param scriptMap       document field name to script/expression
 * @param defaultDataMap  initial content of every document
 * @param csvConfig       configuration passed to the {@link CsvReader}
 * @param csvFile         file to read
 * @param readInterval    if positive, passed to {@code sleep} after each processed record
 * @param csvFileEncoding charset name used to decode the file
 * @param hasHeaderLine   whether the first record holds column names
 * @throws DataStoreException if opening or reading the file fails
 */
protected void processCsv(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap, final CsvConfig csvConfig, final File csvFile, final long readInterval, final String csvFileEncoding, final boolean hasHeaderLine) {
    logger.info("Loading " + csvFile.getAbsolutePath());
    // Kept in its own variable so it can be released even when building the reader chain
    // fails (e.g. an unsupported encoding name) and csvReader is therefore still null.
    FileInputStream fileStream = null;
    CsvReader csvReader = null;
    try {
        fileStream = new FileInputStream(csvFile);
        csvReader = new CsvReader(new BufferedReader(new InputStreamReader(fileStream, csvFileEncoding)), csvConfig);
        List<String> headerList = null;
        if (hasHeaderLine) {
            headerList = csvReader.readValues();
        }
        List<String> list;
        boolean loop = true;
        while ((list = csvReader.readValues()) != null && loop && alive) {
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.putAll(defaultDataMap);
            final Map<String, Object> resultMap = new LinkedHashMap<>();
            resultMap.putAll(paramMap);
            resultMap.put("csvfile", csvFile.getAbsolutePath());
            resultMap.put("csvfilename", csvFile.getName());
            resultMap.put("crawlingConfig", dataConfig);
            boolean foundValues = false;
            for (int i = 0; i < list.size(); i++) {
                String value = list.get(i);
                if (value == null) {
                    value = StringUtil.EMPTY;
                }
                if (StringUtil.isNotBlank(value)) {
                    foundValues = true;
                }
                // Named access: only when the header line has a non-blank name for this column.
                if (headerList != null && headerList.size() > i) {
                    final String headerName = headerList.get(i);
                    if (StringUtil.isNotBlank(headerName)) {
                        resultMap.put(headerName, value);
                    }
                }
                // Positional access is always available, numbered from 1.
                resultMap.put(CELL_PREFIX + Integer.toString(i + 1), value);
            }
            if (!foundValues) {
                logger.debug("No data in line: {}", resultMap);
                continue;
            }
            if (logger.isDebugEnabled()) {
                for (final Map.Entry<String, Object> entry : resultMap.entrySet()) {
                    logger.debug("{}={}", entry.getKey(), entry.getValue());
                }
            }
            // Scripts can reach the document under construction through crawlingContext.doc.
            final Map<String, Object> crawlingContext = new HashMap<>();
            crawlingContext.put("doc", dataMap);
            resultMap.put("crawlingContext", crawlingContext);
            for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                final Object convertValue = convertValue(entry.getValue(), resultMap);
                if (convertValue != null) {
                    dataMap.put(entry.getKey(), convertValue);
                }
            }
            if (logger.isDebugEnabled()) {
                for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
                    logger.debug("{}={}", entry.getKey(), entry.getValue());
                }
            }
            try {
                callback.store(paramMap, dataMap);
            } catch (final CrawlingAccessException e) {
                logger.warn("Crawling Access Exception at : " + dataMap, e);
                // For a multi-cause exception, report the last recorded cause.
                Throwable target = e;
                if (target instanceof MultipleCrawlingAccessException) {
                    final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                    if (causes.length > 0) {
                        target = causes[causes.length - 1];
                    }
                }
                // The error name is the class of the underlying cause when there is one.
                String errorName;
                final Throwable cause = target.getCause();
                if (cause != null) {
                    errorName = cause.getClass().getCanonicalName();
                } else {
                    errorName = target.getClass().getCanonicalName();
                }
                String url;
                if (target instanceof DataStoreCrawlingException) {
                    final DataStoreCrawlingException dce = (DataStoreCrawlingException) target;
                    url = dce.getUrl();
                    if (dce.aborted()) {
                        // Stop after this record.
                        loop = false;
                    }
                } else {
                    // No URL on the exception: identify the record by file path and line number.
                    url = csvFile.getAbsolutePath() + ":" + csvReader.getLineNumber();
                }
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(dataConfig, errorName, url, target);
            } catch (final Throwable t) {
                logger.warn("Crawling Access Exception at : " + dataMap, t);
                final String url = csvFile.getAbsolutePath() + ":" + csvReader.getLineNumber();
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(dataConfig, t.getClass().getCanonicalName(), url, t);
            }
            if (readInterval > 0) {
                sleep(readInterval);
            }
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data when reading csv file.", e);
    } finally {
        IOUtils.closeQuietly(csvReader);
        // Needed when csvReader was never constructed; otherwise this is presumably a
        // second close of an already closed stream, which has no effect.
        IOUtils.closeQuietly(fileStream);
    }
}
Also used : MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) DataStoreException(org.codelibs.fess.exception.DataStoreException) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) FileInputStream(java.io.FileInputStream) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) DataStoreException(org.codelibs.fess.exception.DataStoreException) LinkedHashMap(java.util.LinkedHashMap) CsvReader(com.orangesignal.csv.CsvReader) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) BufferedReader(java.io.BufferedReader) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 2 with MultipleCrawlingAccessException

use of org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException in project fess by codelibs.

the class DatabaseDataStoreImpl method storeData.

/**
 * Runs the configured SQL query and passes one document per result row to the index
 * callback.
 *
 * <p>The JDBC driver class, URL, credentials and SQL text are all obtained from
 * {@code paramMap} through the corresponding getters. For each row the entries of
 * {@code scriptMap} are evaluated via {@code convertValue} against a
 * {@link ResultSetParamMap} and the non-null results are put into the document, on top of
 * {@code defaultDataMap}. Failures raised by the callback are logged and recorded through
 * {@link FailureUrlService}; iteration only stops early when a
 * {@link DataStoreCrawlingException} reports {@code aborted()} or {@code alive} is false.</p>
 *
 * @param config         crawling configuration, passed to the row parameter map and used
 *                       when recording failures
 * @param callback       receives {@code paramMap} and the document for each row
 * @param paramMap       data store parameters
 * @param scriptMap      document field name to script/expression
 * @param defaultDataMap initial content of every document
 * @throws DataStoreException if connecting, querying or iterating the result set fails
 */
@Override
protected void storeData(final DataConfig config, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
    final long readInterval = getReadInterval(paramMap);
    Connection connection = null;
    Statement statement = null;
    ResultSet resultSet = null;
    try {
        Class.forName(getDriverClass(paramMap));
        final String jdbcUrl = getUrl(paramMap);
        final String username = getUsername(paramMap);
        final String password = getPassword(paramMap);
        // Credentials are only supplied when a user name is configured.
        connection = StringUtil.isNotEmpty(username) ? DriverManager.getConnection(jdbcUrl, username, password)
                : DriverManager.getConnection(jdbcUrl);
        final String sql = getSql(paramMap);
        statement = connection.createStatement();
        // SQL generated by an administrator
        resultSet = statement.executeQuery(sql);
        boolean keepGoing = true;
        while (resultSet.next()) {
            if (!keepGoing || !alive) {
                break;
            }
            final Map<String, Object> document = new HashMap<>();
            document.putAll(defaultDataMap);
            // Scripts can reach the document under construction through the context's "doc" entry.
            final Map<String, Object> crawlingContext = new HashMap<>();
            crawlingContext.put("doc", document);
            for (final Map.Entry<String, String> script : scriptMap.entrySet()) {
                final Object converted = convertValue(script.getValue(), new ResultSetParamMap(config, crawlingContext, resultSet, paramMap));
                if (converted != null) {
                    document.put(script.getKey(), converted);
                }
            }
            try {
                callback.store(paramMap, document);
            } catch (final CrawlingAccessException e) {
                logger.warn("Crawling Access Exception at : " + document, e);
                // For a multi-cause exception, report the last recorded cause.
                Throwable failure = e;
                if (failure instanceof MultipleCrawlingAccessException) {
                    final Throwable[] causes = ((MultipleCrawlingAccessException) failure).getCauses();
                    if (causes.length > 0) {
                        failure = causes[causes.length - 1];
                    }
                }
                // The error name is the class of the underlying cause when there is one.
                final Throwable underlying = failure.getCause();
                final String errorName = (underlying != null ? underlying : failure).getClass().getCanonicalName();
                final String url;
                if (failure instanceof DataStoreCrawlingException) {
                    final DataStoreCrawlingException crawlingFailure = (DataStoreCrawlingException) failure;
                    url = crawlingFailure.getUrl();
                    if (crawlingFailure.aborted()) {
                        keepGoing = false;
                    }
                } else {
                    // No URL on the exception: identify the row by SQL text and row number.
                    url = sql + ":" + resultSet.getRow();
                }
                ComponentUtil.getComponent(FailureUrlService.class).store(config, errorName, url, failure);
            } catch (final Throwable t) {
                logger.warn("Crawling Access Exception at : " + document, t);
                final String url = sql + ":" + resultSet.getRow();
                ComponentUtil.getComponent(FailureUrlService.class).store(config, t.getClass().getCanonicalName(), url, t);
            }
            if (readInterval > 0) {
                sleep(readInterval);
            }
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data in DB.", e);
    } finally {
        // Close in reverse order of acquisition. The nesting ensures the later closes are
        // still attempted whatever an earlier close throws.
        try {
            if (resultSet != null) {
                resultSet.close();
            }
        } catch (final SQLException e) {
            logger.warn("Failed to close a result set.", e);
        } finally {
            try {
                if (statement != null) {
                    statement.close();
                }
            } catch (final SQLException e) {
                logger.warn("Failed to close a statement.", e);
            } finally {
                try {
                    if (connection != null) {
                        connection.close();
                    }
                } catch (final SQLException e) {
                    logger.warn("Failed to close a db connection.", e);
                }
            }
        }
    }
}
Also used : MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) DataStoreException(org.codelibs.fess.exception.DataStoreException) HashMap(java.util.HashMap) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) SQLException(java.sql.SQLException) Statement(java.sql.Statement) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) Connection(java.sql.Connection) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) FessSystemException(org.codelibs.fess.exception.FessSystemException) DataStoreException(org.codelibs.fess.exception.DataStoreException) SQLException(java.sql.SQLException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) ResultSet(java.sql.ResultSet) HashMap(java.util.HashMap) Map(java.util.Map)

Example 3 with MultipleCrawlingAccessException

use of org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException in project fess by codelibs.

the class EsDataStoreImpl method processData.

/**
 * Scrolls through the documents matched by the configured query and passes one document
 * per search hit to the index callback.
 *
 * <p>The search is built from {@code paramMap}: target indices ({@code INDEX}, default
 * {@code "_all"}), types, page size, fetched fields, query (default match-all), scroll
 * keep-alive and request timeout (both default {@code "1m"}) and preference. For every
 * hit a parameter map is assembled (index, type, id, version, the hit itself, its source
 * and the crawling config), the entries of {@code scriptMap} are evaluated against it via
 * {@code convertValue}, and the non-null results are put into the document on top of
 * {@code defaultDataMap}.</p>
 *
 * <p>Failures raised by the callback are logged and recorded through
 * {@link FailureUrlService}. When a {@link DataStoreCrawlingException} reports
 * {@code aborted()}, or {@code alive} becomes false, the current page is abandoned and no
 * further pages are requested. If {@code delete.processed.doc} is true, a delete for each
 * handled hit is executed as a bulk request per page.</p>
 *
 * @param dataConfig     crawling configuration, exposed to scripts as {@code crawlingConfig}
 *                       and used when recording failures
 * @param callback       receives {@code paramMap} and the document for each hit
 * @param paramMap       data store parameters
 * @param scriptMap      document field name to script/expression
 * @param defaultDataMap initial content of every document
 * @param readInterval   if positive, passed to {@code sleep} after each handled hit
 * @param client         client used for the search, scroll and bulk-delete requests
 * @throws DataStoreException if a search, scroll or bulk request fails
 */
protected void processData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap, final long readInterval, final Client client) {
    final boolean deleteProcessedDoc = paramMap.getOrDefault("delete.processed.doc", Constants.FALSE).equalsIgnoreCase(Constants.TRUE);
    final String[] indices;
    if (paramMap.containsKey(INDEX)) {
        indices = paramMap.get(INDEX).trim().split(",");
    } else {
        indices = new String[] { "_all" };
    }
    final String scroll = paramMap.containsKey(SCROLL) ? paramMap.get(SCROLL).trim() : "1m";
    final String timeout = paramMap.containsKey(TIMEOUT) ? paramMap.get(TIMEOUT).trim() : "1m";
    final SearchRequestBuilder builder = client.prepareSearch(indices);
    if (paramMap.containsKey(TYPE)) {
        builder.setTypes(paramMap.get(TYPE).trim().split(","));
    }
    if (paramMap.containsKey(SIZE)) {
        // Trimmed like every other parameter so surrounding whitespace does not break parsing.
        builder.setSize(Integer.parseInt(paramMap.get(SIZE).trim()));
    }
    if (paramMap.containsKey(FIELDS)) {
        builder.setFetchSource(paramMap.get(FIELDS).trim().split(","), null);
    }
    builder.setQuery(QueryBuilders.wrapperQuery(paramMap.containsKey(QUERY) ? paramMap.get(QUERY).trim() : "{\"match_all\":{}}"));
    builder.setScroll(scroll);
    builder.setPreference(paramMap.containsKey(PREFERENCE) ? paramMap.get(PREFERENCE).trim() : Constants.SEARCH_PREFERENCE_PRIMARY);
    try {
        SearchResponse response = builder.execute().actionGet(timeout);
        String scrollId = response.getScrollId();
        // NOTE(review): the scroll context is never cleared explicitly here; presumably it
        // is left to expire after the keep-alive. Confirm whether that is intended.
        while (scrollId != null) {
            final SearchHits searchHits = response.getHits();
            final SearchHit[] hits = searchHits.getHits();
            if (hits.length == 0) {
                scrollId = null;
                break;
            }
            boolean loop = true;
            final BulkRequestBuilder bulkRequest = deleteProcessedDoc ? client.prepareBulk() : null;
            for (final SearchHit hit : hits) {
                if (!alive || !loop) {
                    break;
                }
                final Map<String, Object> dataMap = new HashMap<>();
                dataMap.putAll(defaultDataMap);
                final Map<String, Object> resultMap = new LinkedHashMap<>();
                resultMap.putAll(paramMap);
                resultMap.put("index", hit.getIndex());
                resultMap.put("type", hit.getType());
                resultMap.put("id", hit.getId());
                resultMap.put("version", Long.valueOf(hit.getVersion()));
                resultMap.put("hit", hit);
                resultMap.put("source", hit.getSource());
                resultMap.put("crawlingConfig", dataConfig);
                if (logger.isDebugEnabled()) {
                    for (final Map.Entry<String, Object> entry : resultMap.entrySet()) {
                        logger.debug(entry.getKey() + "=" + entry.getValue());
                    }
                }
                // Scripts can reach the document under construction through crawlingContext.doc.
                final Map<String, Object> crawlingContext = new HashMap<>();
                crawlingContext.put("doc", dataMap);
                resultMap.put("crawlingContext", crawlingContext);
                for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                    final Object convertValue = convertValue(entry.getValue(), resultMap);
                    if (convertValue != null) {
                        dataMap.put(entry.getKey(), convertValue);
                    }
                }
                if (logger.isDebugEnabled()) {
                    for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
                        logger.debug(entry.getKey() + "=" + entry.getValue());
                    }
                }
                try {
                    callback.store(paramMap, dataMap);
                } catch (final CrawlingAccessException e) {
                    logger.warn("Crawling Access Exception at : " + dataMap, e);
                    // For a multi-cause exception, report the last recorded cause.
                    Throwable target = e;
                    if (target instanceof MultipleCrawlingAccessException) {
                        final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                        if (causes.length > 0) {
                            target = causes[causes.length - 1];
                        }
                    }
                    // The error name is the class of the underlying cause when there is one.
                    String errorName;
                    final Throwable cause = target.getCause();
                    if (cause != null) {
                        errorName = cause.getClass().getCanonicalName();
                    } else {
                        errorName = target.getClass().getCanonicalName();
                    }
                    String url;
                    if (target instanceof DataStoreCrawlingException) {
                        final DataStoreCrawlingException dce = (DataStoreCrawlingException) target;
                        url = dce.getUrl();
                        if (dce.aborted()) {
                            loop = false;
                        }
                    } else {
                        // No URL on the exception: identify the hit as index/type/id.
                        url = hit.getIndex() + "/" + hit.getType() + "/" + hit.getId();
                    }
                    final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                    failureUrlService.store(dataConfig, errorName, url, target);
                } catch (final Throwable t) {
                    logger.warn("Crawling Access Exception at : " + dataMap, t);
                    final String url = hit.getIndex() + "/" + hit.getType() + "/" + hit.getId();
                    final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                    failureUrlService.store(dataConfig, t.getClass().getCanonicalName(), url, t);
                }
                // NOTE(review): the hit is queued for deletion even when storing it failed
                // above; confirm that this is the intended behavior.
                if (bulkRequest != null) {
                    bulkRequest.add(client.prepareDelete(hit.getIndex(), hit.getType(), hit.getId()));
                }
                if (readInterval > 0) {
                    sleep(readInterval);
                }
            }
            if (bulkRequest != null && bulkRequest.numberOfActions() > 0) {
                final BulkResponse bulkResponse = bulkRequest.execute().actionGet(timeout);
                if (bulkResponse.hasFailures()) {
                    logger.warn(bulkResponse.buildFailureMessage());
                }
            }
            // An abort must end the whole crawl, not just the current page: without the
            // loop check the flag would be reset and the next page processed.
            if (!alive || !loop) {
                break;
            }
            response = client.prepareSearchScroll(scrollId).setScroll(scroll).execute().actionGet(timeout);
            scrollId = response.getScrollId();
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data when acessing elasticsearch.", e);
    }
}
Also used : MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) DataStoreException(org.codelibs.fess.exception.DataStoreException) SearchRequestBuilder(org.elasticsearch.action.search.SearchRequestBuilder) SearchHit(org.elasticsearch.search.SearchHit) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) BulkResponse(org.elasticsearch.action.bulk.BulkResponse) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) DataStoreException(org.codelibs.fess.exception.DataStoreException) SearchResponse(org.elasticsearch.action.search.SearchResponse) LinkedHashMap(java.util.LinkedHashMap) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) SearchHits(org.elasticsearch.search.SearchHits) BulkRequestBuilder(org.elasticsearch.action.bulk.BulkRequestBuilder) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Aggregations

HashMap (java.util.HashMap)3 Map (java.util.Map)3 FailureUrlService (org.codelibs.fess.app.service.FailureUrlService)3 CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException)3 MultipleCrawlingAccessException (org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException)3 DataStoreCrawlingException (org.codelibs.fess.exception.DataStoreCrawlingException)3 DataStoreException (org.codelibs.fess.exception.DataStoreException)3 LinkedHashMap (java.util.LinkedHashMap)2 CsvReader (com.orangesignal.csv.CsvReader)1 BufferedReader (java.io.BufferedReader)1 FileInputStream (java.io.FileInputStream)1 InputStreamReader (java.io.InputStreamReader)1 Connection (java.sql.Connection)1 ResultSet (java.sql.ResultSet)1 SQLException (java.sql.SQLException)1 Statement (java.sql.Statement)1 FessSystemException (org.codelibs.fess.exception.FessSystemException)1 BulkRequestBuilder (org.elasticsearch.action.bulk.BulkRequestBuilder)1 BulkResponse (org.elasticsearch.action.bulk.BulkResponse)1 SearchRequestBuilder (org.elasticsearch.action.search.SearchRequestBuilder)1