Search in sources :

Example 1 with DataStoreException

use of org.codelibs.fess.exception.DataStoreException in project fess by codelibs.

the class CsvDataStoreImpl method getCsvFileList.

/**
 * Resolves the CSV files to crawl from the data store parameters.
 *
 * <p>When {@code CSV_FILES_PARAM} is non-blank it is split on commas and every path that is a
 * regular file accepted by {@code isCsvFile} is added in the given order. Otherwise
 * {@code CSV_DIRS_PARAM} is split on commas and, for every path that is a directory, the
 * entries accepted by {@code isCsvFile} are added sorted by ascending last-modified time.
 * Paths that do not qualify are logged at warn level and skipped.</p>
 *
 * @param paramMap data store parameters
 * @return the collected files; may be empty
 * @throws DataStoreException if both {@code CSV_FILES_PARAM} and {@code CSV_DIRS_PARAM} are blank
 */
protected List<File> getCsvFileList(final Map<String, String> paramMap) {
    String value = paramMap.get(CSV_FILES_PARAM);
    final List<File> fileList = new ArrayList<>();
    if (StringUtil.isBlank(value)) {
        // No explicit file list: fall back to scanning the configured directories.
        value = paramMap.get(CSV_DIRS_PARAM);
        if (StringUtil.isBlank(value)) {
            throw new DataStoreException(CSV_FILES_PARAM + " and " + CSV_DIRS_PARAM + " are blank.");
        }
        logger.info(CSV_DIRS_PARAM + "=" + value);
        final String[] values = value.split(",");
        for (final String path : values) {
            final File dir = new File(path);
            if (dir.isDirectory()) {
                // Long.compare is used instead of casting the long difference to int: the cast
                // truncates differences larger than Integer.MAX_VALUE milliseconds and can flip
                // the sign, which yields a wrong order and an inconsistent comparator.
                stream(dir.listFiles()).of(stream -> stream.filter(f -> isCsvFile(f.getParentFile(), f.getName()))
                        .sorted((f1, f2) -> Long.compare(f1.lastModified(), f2.lastModified())).forEach(f -> fileList.add(f)));
            } else {
                logger.warn(path + " is not a directory.");
            }
        }
    } else {
        logger.info(CSV_FILES_PARAM + "=" + value);
        final String[] values = value.split(",");
        for (final String path : values) {
            final File file = new File(path);
            if (file.isFile() && isCsvFile(file.getParentFile(), file.getName())) {
                fileList.add(file);
            } else {
                logger.warn(path + " is not found.");
            }
        }
    }
    if (fileList.isEmpty() && logger.isDebugEnabled()) {
        logger.debug("No csv files in " + value);
    }
    return fileList;
}
Also used : CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) Constants(org.codelibs.fess.Constants) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) IndexUpdateCallback(org.codelibs.fess.ds.IndexUpdateCallback) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) Locale(java.util.Locale) Map(java.util.Map) StringEscapeUtils(org.apache.commons.lang3.StringEscapeUtils) DataConfig(org.codelibs.fess.es.config.exentity.DataConfig) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) StreamUtil.stream(org.codelibs.core.stream.StreamUtil.stream) Logger(org.slf4j.Logger) StringUtil(org.codelibs.core.lang.StringUtil) FileInputStream(java.io.FileInputStream) DataStoreException(org.codelibs.fess.exception.DataStoreException) InputStreamReader(java.io.InputStreamReader) File(java.io.File) CsvConfig(com.orangesignal.csv.CsvConfig) IOUtils(org.apache.commons.io.IOUtils) List(java.util.List) ComponentUtil(org.codelibs.fess.util.ComponentUtil) BufferedReader(java.io.BufferedReader) Pattern(java.util.regex.Pattern) CsvReader(com.orangesignal.csv.CsvReader) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) DataStoreException(org.codelibs.fess.exception.DataStoreException) ArrayList(java.util.ArrayList) File(java.io.File)

Example 2 with DataStoreException

use of org.codelibs.fess.exception.DataStoreException in project fess by codelibs.

the class CsvDataStoreImpl method processCsv.

/**
 * Reads one CSV file row by row and passes each row that contains data to the index callback.
 *
 * <p>For every row a result map is built from {@code paramMap}, the file path and name, the
 * data config, and the cell values. Each cell is stored under {@code CELL_PREFIX + (index + 1)}
 * and, when a header line was read and its name is non-blank, under the header name as well.
 * Rows whose cells are all blank are skipped. Every {@code scriptMap} entry is evaluated with
 * {@code convertValue} against the result map and non-null results are put into the document
 * map, which starts as a copy of {@code defaultDataMap}. Failures raised by
 * {@code callback.store} are logged and recorded through {@link FailureUrlService}; a
 * {@link DataStoreCrawlingException} reporting {@code aborted()} stops the loop.</p>
 *
 * @param dataConfig      data config stored in the result map and passed to the failure service
 * @param callback        receiver of each produced document map
 * @param paramMap        data store parameters, copied into the result map of each row
 * @param scriptMap       target field name to script, evaluated per row
 * @param defaultDataMap  values copied into every document map before scripts run
 * @param csvConfig       configuration passed to the {@link CsvReader}
 * @param csvFile         file to read
 * @param readInterval    value passed to {@code sleep} after each processed row when positive
 * @param csvFileEncoding charset name used to decode the file
 * @param hasHeaderLine   whether the first row is read as the header
 * @throws DataStoreException if reading or processing fails with an exception not handled per row
 */
protected void processCsv(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap, final CsvConfig csvConfig, final File csvFile, final long readInterval, final String csvFileEncoding, final boolean hasHeaderLine) {
    logger.info("Loading " + csvFile.getAbsolutePath());
    // The raw stream is tracked separately: if building the reader chain fails (for example
    // InputStreamReader rejects csvFileEncoding), csvReader is still null and closing it alone
    // would leave the already opened file handle dangling.
    FileInputStream fileStream = null;
    CsvReader csvReader = null;
    try {
        fileStream = new FileInputStream(csvFile);
        csvReader = new CsvReader(new BufferedReader(new InputStreamReader(fileStream, csvFileEncoding)), csvConfig);
        List<String> headerList = null;
        if (hasHeaderLine) {
            headerList = csvReader.readValues();
        }
        List<String> list;
        boolean loop = true;
        while ((list = csvReader.readValues()) != null && loop && alive) {
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.putAll(defaultDataMap);
            final Map<String, Object> resultMap = new LinkedHashMap<>();
            resultMap.putAll(paramMap);
            resultMap.put("csvfile", csvFile.getAbsolutePath());
            resultMap.put("csvfilename", csvFile.getName());
            resultMap.put("crawlingConfig", dataConfig);
            boolean foundValues = false;
            for (int i = 0; i < list.size(); i++) {
                String key = null;
                String value = list.get(i);
                if (value == null) {
                    value = StringUtil.EMPTY;
                }
                if (StringUtil.isNotBlank(value)) {
                    foundValues = true;
                }
                // Expose the cell under its header name when one exists for this column...
                if (headerList != null && headerList.size() > i) {
                    key = headerList.get(i);
                    if (StringUtil.isNotBlank(key)) {
                        resultMap.put(key, value);
                    }
                }
                // ...and always under its 1-based positional key.
                key = CELL_PREFIX + Integer.toString(i + 1);
                resultMap.put(key, value);
            }
            if (!foundValues) {
                logger.debug("No data in line: {}", resultMap);
                continue;
            }
            if (logger.isDebugEnabled()) {
                for (final Map.Entry<String, Object> entry : resultMap.entrySet()) {
                    logger.debug(entry.getKey() + "=" + entry.getValue());
                }
            }
            // Scripts can reach the document being built through crawlingContext.doc.
            final Map<String, Object> crawlingContext = new HashMap<>();
            crawlingContext.put("doc", dataMap);
            resultMap.put("crawlingContext", crawlingContext);
            for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                final Object convertValue = convertValue(entry.getValue(), resultMap);
                if (convertValue != null) {
                    dataMap.put(entry.getKey(), convertValue);
                }
            }
            if (logger.isDebugEnabled()) {
                for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
                    logger.debug(entry.getKey() + "=" + entry.getValue());
                }
            }
            try {
                callback.store(paramMap, dataMap);
            } catch (final CrawlingAccessException e) {
                logger.warn("Crawling Access Exception at : " + dataMap, e);
                // For a multi-cause exception, report the last collected cause.
                Throwable target = e;
                if (target instanceof MultipleCrawlingAccessException) {
                    final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                    if (causes.length > 0) {
                        target = causes[causes.length - 1];
                    }
                }
                String errorName;
                final Throwable cause = target.getCause();
                if (cause != null) {
                    errorName = cause.getClass().getCanonicalName();
                } else {
                    errorName = target.getClass().getCanonicalName();
                }
                String url;
                if (target instanceof DataStoreCrawlingException) {
                    final DataStoreCrawlingException dce = (DataStoreCrawlingException) target;
                    url = dce.getUrl();
                    if (dce.aborted()) {
                        loop = false;
                    }
                } else {
                    // No URL available: identify the failure by file path and reader line number.
                    url = csvFile.getAbsolutePath() + ":" + csvReader.getLineNumber();
                }
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(dataConfig, errorName, url, target);
            } catch (final Throwable t) {
                logger.warn("Crawling Access Exception at : " + dataMap, t);
                final String url = csvFile.getAbsolutePath() + ":" + csvReader.getLineNumber();
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(dataConfig, t.getClass().getCanonicalName(), url, t);
            }
            if (readInterval > 0) {
                sleep(readInterval);
            }
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data when reading csv file.", e);
    } finally {
        IOUtils.closeQuietly(csvReader);
        // Closing a FileInputStream that is already closed is a no-op, so this is safe even
        // when closing csvReader has closed the underlying stream.
        IOUtils.closeQuietly(fileStream);
    }
}
Also used : MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) DataStoreException(org.codelibs.fess.exception.DataStoreException) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) FileInputStream(java.io.FileInputStream) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) DataStoreException(org.codelibs.fess.exception.DataStoreException) LinkedHashMap(java.util.LinkedHashMap) CsvReader(com.orangesignal.csv.CsvReader) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) BufferedReader(java.io.BufferedReader) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Example 3 with DataStoreException

use of org.codelibs.fess.exception.DataStoreException in project fess by codelibs.

the class CsvListDataStoreImpl method storeData.

/**
 * Wraps the given callback in a {@code FileListIndexUpdateCallbackImpl} and delegates the
 * crawl to the parent implementation, committing the wrapper afterwards.
 *
 * <p>The thread count handed to the wrapper is read from {@code Constants.NUM_OF_THREADS};
 * it stays at 1 when the key is missing or its value is not an int (a warning is logged in
 * the latter case).</p>
 *
 * @param dataConfig     data config whose client factory is initialized before crawling
 * @param callback       underlying index update callback
 * @param paramMap       data store parameters
 * @param scriptMap      field scripts forwarded to the parent implementation
 * @param defaultDataMap default document values forwarded to the parent implementation
 * @throws DataStoreException wrapping any exception raised while creating the wrapper,
 *                            crawling, or committing
 */
@Override
protected void storeData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
    int threadCount = 1;
    if (paramMap.containsKey(Constants.NUM_OF_THREADS)) {
        final String configured = paramMap.get(Constants.NUM_OF_THREADS);
        try {
            threadCount = Integer.parseInt(configured);
        } catch (final NumberFormatException nfe) {
            // Keep the single-thread default when the setting cannot be parsed.
            logger.warn(Constants.NUM_OF_THREADS + " is not int value.", nfe);
        }
    }

    final CrawlerClientFactory clientFactory = ComponentUtil.getCrawlerClientFactory();
    dataConfig.initializeClientFactory(clientFactory);

    try {
        final FileListIndexUpdateCallbackImpl wrappedCallback =
                new FileListIndexUpdateCallbackImpl(callback, clientFactory, threadCount);
        super.storeData(dataConfig, wrappedCallback, paramMap, scriptMap, defaultDataMap);
        wrappedCallback.commit();
    } catch (final Exception failure) {
        throw new DataStoreException(failure);
    }
}
Also used : DataStoreException(org.codelibs.fess.exception.DataStoreException) CrawlerClientFactory(org.codelibs.fess.crawler.client.CrawlerClientFactory) DataStoreException(org.codelibs.fess.exception.DataStoreException)

Example 4 with DataStoreException

use of org.codelibs.fess.exception.DataStoreException in project fess by codelibs.

the class DatabaseDataStoreImpl method storeData.

/**
 * Runs the configured SQL query over JDBC and passes one document per result row to the
 * index callback.
 *
 * <p>The driver class, JDBC URL, credentials and SQL text are obtained from {@code paramMap}
 * through the {@code getDriverClass}, {@code getUrl}, {@code getUsername},
 * {@code getPassword} and {@code getSql} helpers. For each row, every {@code scriptMap} entry
 * is evaluated with {@code convertValue} against a {@code ResultSetParamMap} and non-null
 * results are put into the document map, which starts as a copy of {@code defaultDataMap}.
 * Failures raised by {@code callback.store} are logged and recorded through
 * {@link FailureUrlService}; a {@link DataStoreCrawlingException} reporting {@code aborted()}
 * stops the loop.</p>
 *
 * @param config         data config passed to the row parameter map and the failure service
 * @param callback       receiver of each produced document map
 * @param paramMap       data store parameters
 * @param scriptMap      target field name to script, evaluated per row
 * @param defaultDataMap values copied into every document map before scripts run
 * @throws DataStoreException if connecting, querying or row processing fails with an
 *                            exception not handled per row
 */
@Override
protected void storeData(final DataConfig config, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap) {
    final long readInterval = getReadInterval(paramMap);
    Connection con = null;
    Statement stmt = null;
    ResultSet rs = null;
    try {
        // Load the configured JDBC driver class before asking DriverManager for a connection.
        Class.forName(getDriverClass(paramMap));
        final String jdbcUrl = getUrl(paramMap);
        final String username = getUsername(paramMap);
        final String password = getPassword(paramMap);
        // Credentials are only passed when a username is configured.
        if (StringUtil.isNotEmpty(username)) {
            con = DriverManager.getConnection(jdbcUrl, username, password);
        } else {
            con = DriverManager.getConnection(jdbcUrl);
        }
        final String sql = getSql(paramMap);
        stmt = con.createStatement();
        // SQL generated by an administrator
        rs = stmt.executeQuery(sql);
        boolean loop = true;
        while (rs.next() && loop && alive) {
            final Map<String, Object> dataMap = new HashMap<>();
            dataMap.putAll(defaultDataMap);
            // Scripts can reach the document being built through crawlingContext.doc.
            final Map<String, Object> crawlingContext = new HashMap<>();
            crawlingContext.put("doc", dataMap);
            for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                final Object convertValue = convertValue(entry.getValue(), new ResultSetParamMap(config, crawlingContext, rs, paramMap));
                if (convertValue != null) {
                    dataMap.put(entry.getKey(), convertValue);
                }
            }
            try {
                callback.store(paramMap, dataMap);
            } catch (final CrawlingAccessException e) {
                logger.warn("Crawling Access Exception at : " + dataMap, e);
                // For a multi-cause exception, report the last collected cause.
                Throwable target = e;
                if (target instanceof MultipleCrawlingAccessException) {
                    final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                    if (causes.length > 0) {
                        target = causes[causes.length - 1];
                    }
                }
                // Prefer the class name of the underlying cause as the error name.
                String errorName;
                final Throwable cause = target.getCause();
                if (cause != null) {
                    errorName = cause.getClass().getCanonicalName();
                } else {
                    errorName = target.getClass().getCanonicalName();
                }
                String url;
                if (target instanceof DataStoreCrawlingException) {
                    final DataStoreCrawlingException dce = (DataStoreCrawlingException) target;
                    url = dce.getUrl();
                    // An aborted crawl ends the row loop after this failure is recorded.
                    if (dce.aborted()) {
                        loop = false;
                    }
                } else {
                    // No URL available: identify the failure by SQL text and current row number.
                    url = sql + ":" + rs.getRow();
                }
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(config, errorName, url, target);
            } catch (final Throwable t) {
                // Any other failure is recorded the same way and the loop continues.
                logger.warn("Crawling Access Exception at : " + dataMap, t);
                final String url = sql + ":" + rs.getRow();
                final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                failureUrlService.store(config, t.getClass().getCanonicalName(), url, t);
            }
            if (readInterval > 0) {
                sleep(readInterval);
            }
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data in DB.", e);
    } finally {
        // Close in reverse order of acquisition; the nested finally blocks ensure each close
        // is attempted even if an earlier one throws.
        try {
            if (rs != null) {
                rs.close();
            }
        } catch (final SQLException e) {
            logger.warn("Failed to close a result set.", e);
        } finally {
            try {
                if (stmt != null) {
                    stmt.close();
                }
            } catch (final SQLException e) {
                logger.warn("Failed to close a statement.", e);
            } finally {
                try {
                    if (con != null) {
                        con.close();
                    }
                } catch (final SQLException e) {
                    logger.warn("Failed to close a db connection.", e);
                }
            }
        }
    }
}
Also used : MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) DataStoreException(org.codelibs.fess.exception.DataStoreException) HashMap(java.util.HashMap) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) SQLException(java.sql.SQLException) Statement(java.sql.Statement) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) Connection(java.sql.Connection) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) FessSystemException(org.codelibs.fess.exception.FessSystemException) DataStoreException(org.codelibs.fess.exception.DataStoreException) SQLException(java.sql.SQLException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) ResultSet(java.sql.ResultSet) HashMap(java.util.HashMap) Map(java.util.Map)

Example 5 with DataStoreException

use of org.codelibs.fess.exception.DataStoreException in project fess by codelibs.

the class EsDataStoreImpl method processData.

/**
 * Scrolls through the configured Elasticsearch indices and passes one document per hit to the
 * index callback.
 *
 * <p>Search settings are read from {@code paramMap}: indices (default {@code "_all"}), types,
 * page size, source fields, query (default match-all), preference (default
 * {@code Constants.SEARCH_PREFERENCE_PRIMARY}), and scroll keep-alive and request timeout
 * (both default {@code "1m"}). For each hit a result map is built from {@code paramMap} and
 * the hit's index, type, id, version and source. Every {@code scriptMap} entry is evaluated
 * with {@code convertValue} against it, and non-null results are put into the document map,
 * which starts as a copy of {@code defaultDataMap}. Failures raised by {@code callback.store}
 * are logged and recorded through {@link FailureUrlService}.</p>
 *
 * <p>When {@code delete.processed.doc} is true, every hit that was passed to the callback is
 * queued for deletion and the deletes are sent as one bulk request per page. Hits are queued
 * whether or not {@code callback.store} succeeded.</p>
 *
 * <p>The crawl ends when a page has no hits, when {@code alive} becomes false, or when a
 * {@link DataStoreCrawlingException} reports {@code aborted()}.</p>
 *
 * @param dataConfig     data config stored in the result map and passed to the failure service
 * @param callback       receiver of each produced document map
 * @param paramMap       data store parameters
 * @param scriptMap      target field name to script, evaluated per hit
 * @param defaultDataMap values copied into every document map before scripts run
 * @param readInterval   value passed to {@code sleep} after each processed hit when positive
 * @param client         Elasticsearch client used for search, scroll and bulk delete requests
 * @throws DataStoreException if a search, scroll or bulk request fails
 */
protected void processData(final DataConfig dataConfig, final IndexUpdateCallback callback, final Map<String, String> paramMap, final Map<String, String> scriptMap, final Map<String, Object> defaultDataMap, final long readInterval, final Client client) {
    final boolean deleteProcessedDoc = paramMap.getOrDefault("delete.processed.doc", Constants.FALSE).equalsIgnoreCase(Constants.TRUE);
    final String[] indices;
    if (paramMap.containsKey(INDEX)) {
        indices = paramMap.get(INDEX).trim().split(",");
    } else {
        indices = new String[] { "_all" };
    }
    final String scroll = paramMap.containsKey(SCROLL) ? paramMap.get(SCROLL).trim() : "1m";
    final String timeout = paramMap.containsKey(TIMEOUT) ? paramMap.get(TIMEOUT).trim() : "1m";
    final SearchRequestBuilder builder = client.prepareSearch(indices);
    if (paramMap.containsKey(TYPE)) {
        builder.setTypes(paramMap.get(TYPE).trim().split(","));
    }
    if (paramMap.containsKey(SIZE)) {
        builder.setSize(Integer.parseInt(paramMap.get(SIZE)));
    }
    if (paramMap.containsKey(FIELDS)) {
        builder.setFetchSource(paramMap.get(FIELDS).trim().split(","), null);
    }
    builder.setQuery(QueryBuilders.wrapperQuery(paramMap.containsKey(QUERY) ? paramMap.get(QUERY).trim() : "{\"match_all\":{}}"));
    builder.setScroll(scroll);
    builder.setPreference(paramMap.containsKey(PREFERENCE) ? paramMap.get(PREFERENCE).trim() : Constants.SEARCH_PREFERENCE_PRIMARY);
    try {
        SearchResponse response = builder.execute().actionGet(timeout);
        String scrollId = response.getScrollId();
        while (scrollId != null) {
            final SearchHits searchHits = response.getHits();
            final SearchHit[] hits = searchHits.getHits();
            // An empty page marks the end of the scroll.
            if (hits.length == 0) {
                scrollId = null;
                break;
            }
            boolean loop = true;
            final BulkRequestBuilder bulkRequest = deleteProcessedDoc ? client.prepareBulk() : null;
            for (final SearchHit hit : hits) {
                if (!alive || !loop) {
                    break;
                }
                final Map<String, Object> dataMap = new HashMap<>();
                dataMap.putAll(defaultDataMap);
                final Map<String, Object> resultMap = new LinkedHashMap<>();
                resultMap.putAll(paramMap);
                resultMap.put("index", hit.getIndex());
                resultMap.put("type", hit.getType());
                resultMap.put("id", hit.getId());
                resultMap.put("version", Long.valueOf(hit.getVersion()));
                resultMap.put("hit", hit);
                resultMap.put("source", hit.getSource());
                resultMap.put("crawlingConfig", dataConfig);
                if (logger.isDebugEnabled()) {
                    for (final Map.Entry<String, Object> entry : resultMap.entrySet()) {
                        logger.debug(entry.getKey() + "=" + entry.getValue());
                    }
                }
                // Scripts can reach the document being built through crawlingContext.doc.
                final Map<String, Object> crawlingContext = new HashMap<>();
                crawlingContext.put("doc", dataMap);
                resultMap.put("crawlingContext", crawlingContext);
                for (final Map.Entry<String, String> entry : scriptMap.entrySet()) {
                    final Object convertValue = convertValue(entry.getValue(), resultMap);
                    if (convertValue != null) {
                        dataMap.put(entry.getKey(), convertValue);
                    }
                }
                if (logger.isDebugEnabled()) {
                    for (final Map.Entry<String, Object> entry : dataMap.entrySet()) {
                        logger.debug(entry.getKey() + "=" + entry.getValue());
                    }
                }
                try {
                    callback.store(paramMap, dataMap);
                } catch (final CrawlingAccessException e) {
                    logger.warn("Crawling Access Exception at : " + dataMap, e);
                    // For a multi-cause exception, report the last collected cause.
                    Throwable target = e;
                    if (target instanceof MultipleCrawlingAccessException) {
                        final Throwable[] causes = ((MultipleCrawlingAccessException) target).getCauses();
                        if (causes.length > 0) {
                            target = causes[causes.length - 1];
                        }
                    }
                    String errorName;
                    final Throwable cause = target.getCause();
                    if (cause != null) {
                        errorName = cause.getClass().getCanonicalName();
                    } else {
                        errorName = target.getClass().getCanonicalName();
                    }
                    String url;
                    if (target instanceof DataStoreCrawlingException) {
                        final DataStoreCrawlingException dce = (DataStoreCrawlingException) target;
                        url = dce.getUrl();
                        if (dce.aborted()) {
                            loop = false;
                        }
                    } else {
                        // No URL available: identify the failure by index/type/id.
                        url = hit.getIndex() + "/" + hit.getType() + "/" + hit.getId();
                    }
                    final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                    failureUrlService.store(dataConfig, errorName, url, target);
                } catch (final Throwable t) {
                    logger.warn("Crawling Access Exception at : " + dataMap, t);
                    final String url = hit.getIndex() + "/" + hit.getType() + "/" + hit.getId();
                    final FailureUrlService failureUrlService = ComponentUtil.getComponent(FailureUrlService.class);
                    failureUrlService.store(dataConfig, t.getClass().getCanonicalName(), url, t);
                }
                if (bulkRequest != null) {
                    bulkRequest.add(client.prepareDelete(hit.getIndex(), hit.getType(), hit.getId()));
                }
                if (readInterval > 0) {
                    sleep(readInterval);
                }
            }
            // Flush the deletes for the hits processed on this page, including a partial page.
            if (bulkRequest != null && bulkRequest.numberOfActions() > 0) {
                final BulkResponse bulkResponse = bulkRequest.execute().actionGet(timeout);
                if (bulkResponse.hasFailures()) {
                    logger.warn(bulkResponse.buildFailureMessage());
                }
            }
            // Stop scrolling on shutdown or abort. Checking only alive here would fetch the
            // next page and reset loop, so an abort would merely skip the rest of one page.
            if (!alive || !loop) {
                break;
            }
            response = client.prepareSearchScroll(scrollId).setScroll(scroll).execute().actionGet(timeout);
            scrollId = response.getScrollId();
        }
    } catch (final Exception e) {
        throw new DataStoreException("Failed to crawl data when acessing elasticsearch.", e);
    }
}
Also used : MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) DataStoreException(org.codelibs.fess.exception.DataStoreException) SearchRequestBuilder(org.elasticsearch.action.search.SearchRequestBuilder) SearchHit(org.elasticsearch.search.SearchHit) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) FailureUrlService(org.codelibs.fess.app.service.FailureUrlService) BulkResponse(org.elasticsearch.action.bulk.BulkResponse) CrawlingAccessException(org.codelibs.fess.crawler.exception.CrawlingAccessException) MultipleCrawlingAccessException(org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) DataStoreException(org.codelibs.fess.exception.DataStoreException) SearchResponse(org.elasticsearch.action.search.SearchResponse) LinkedHashMap(java.util.LinkedHashMap) DataStoreCrawlingException(org.codelibs.fess.exception.DataStoreCrawlingException) SearchHits(org.elasticsearch.search.SearchHits) BulkRequestBuilder(org.elasticsearch.action.bulk.BulkRequestBuilder) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Map(java.util.Map)

Aggregations

DataStoreException (org.codelibs.fess.exception.DataStoreException)7 HashMap (java.util.HashMap)4 Map (java.util.Map)4 FailureUrlService (org.codelibs.fess.app.service.FailureUrlService)4 CrawlingAccessException (org.codelibs.fess.crawler.exception.CrawlingAccessException)4 MultipleCrawlingAccessException (org.codelibs.fess.crawler.exception.MultipleCrawlingAccessException)4 DataStoreCrawlingException (org.codelibs.fess.exception.DataStoreCrawlingException)4 LinkedHashMap (java.util.LinkedHashMap)3 CsvReader (com.orangesignal.csv.CsvReader)2 BufferedReader (java.io.BufferedReader)2 FileInputStream (java.io.FileInputStream)2 InputStreamReader (java.io.InputStreamReader)2 CrawlerClientFactory (org.codelibs.fess.crawler.client.CrawlerClientFactory)2 CsvConfig (com.orangesignal.csv.CsvConfig)1 File (java.io.File)1 Connection (java.sql.Connection)1 ResultSet (java.sql.ResultSet)1 SQLException (java.sql.SQLException)1 Statement (java.sql.Statement)1 ArrayList (java.util.ArrayList)1