Search in sources :

Example 16 with StandardAnalyzer

use of org.apache.lucene.analysis.standard.StandardAnalyzer in project OpenGrok by OpenGrok.

The class IndexDatabase, method optimize.

/**
     * Optimize the index database by force-merging it into a single segment.
     * If another thread is already updating or optimizing the index, a warning
     * is logged and the method returns without doing anything.
     */
public void optimize() {
    synchronized (lock) {
        if (running) {
            LOGGER.warning("Optimize terminated... Someone else is updating / optimizing it!");
            return;
        }
        // Claim the index for this thread; released in the finally block below.
        running = true;
    }
    IndexWriter wrt = null;
    Analyzer analyzer = null;
    try {
        LOGGER.info("Optimizing the index ... ");
        analyzer = new StandardAnalyzer();
        IndexWriterConfig conf = new IndexWriterConfig(analyzer);
        conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
        wrt = new IndexWriter(indexDirectory, conf);
        // Merge all index segments into one. This is an expensive operation,
        // but it produces the most compact index for searching.
        wrt.forceMerge(1);
        LOGGER.info("done");
        synchronized (lock) {
            // The index is optimized, so the on-disk "dirty" marker can go away.
            if (dirtyFile.exists() && !dirtyFile.delete()) {
                LOGGER.log(Level.FINE, "Failed to remove \"dirty-file\": {0}", dirtyFile.getAbsolutePath());
            }
            dirty = false;
        }
    } catch (IOException e) {
        LOGGER.log(Level.SEVERE, "ERROR: optimizing index: {0}", e);
    } finally {
        if (wrt != null) {
            try {
                wrt.close();
            } catch (IOException e) {
                LOGGER.log(Level.WARNING, "An error occured while closing writer", e);
            }
        }
        if (analyzer != null) {
            // Analyzer is Closeable; release it only after the writer is closed.
            analyzer.close();
        }
        synchronized (lock) {
            running = false;
        }
    }
}
Also used : IndexWriter(org.apache.lucene.index.IndexWriter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) IOException(java.io.IOException) FileAnalyzer(org.opensolaris.opengrok.analysis.FileAnalyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 17 with StandardAnalyzer

use of org.apache.lucene.analysis.standard.StandardAnalyzer in project Openfire by igniterealtime.

The class ChatSearchManager, method loadAnalyzer.

/**
     * Load the search analyzer. A custom analyzer class will be used if it is
     * defined for the workgroup; otherwise the globally configured analyzer is
     * tried, and finally Lucene's {@link StandardAnalyzer} is used as fallback
     * (with the configured stop words, if any). The result is stored in
     * {@code indexerAnalyzer}.
     */
private void loadAnalyzer() {
    Analyzer analyzer = null;
    String analyzerClass = null;
    String words = null;
    // First check if the workgroup should use a special Analyzer
    analyzerClass = workgroup.getProperties().getProperty("search.analyzer.className");
    if (analyzerClass != null) {
        words = workgroup.getProperties().getProperty("search.analyzer.stopWordList");
    } else {
        // Use the global analyzer
        analyzerClass = getAnalyzerClass();
        words = JiveGlobals.getProperty("workgroup.search.analyzer.stopWordList");
    }
    // Parse the comma-separated stop word list, if one was configured.
    List<String> stopWords = new ArrayList<String>();
    if (words != null) {
        StringTokenizer st = new StringTokenizer(words, ",");
        while (st.hasMoreTokens()) {
            stopWords.add(st.nextToken().trim());
        }
    }
    try {
        analyzer = getAnalyzerInstance(analyzerClass, stopWords);
    } catch (Exception e) {
        Log.error("Error loading custom search analyzer: " + analyzerClass, e);
    }
    // If the custom analyzer could not be created, fall back to the standard
    // analyzer, passing along any configured stop words.
    if (analyzer == null && !stopWords.isEmpty()) {
        analyzer = new StandardAnalyzer(stopWords.toArray(new String[stopWords.size()]));
    } else if (analyzer == null) {
        analyzer = new StandardAnalyzer();
    }
    indexerAnalyzer = analyzer;
}
Also used : StringTokenizer(java.util.StringTokenizer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) ArrayList(java.util.ArrayList) Analyzer(org.apache.lucene.analysis.Analyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) SQLException(java.sql.SQLException) DocumentException(org.dom4j.DocumentException) IOException(java.io.IOException)

Example 18 with StandardAnalyzer

use of org.apache.lucene.analysis.standard.StandardAnalyzer in project Openfire by igniterealtime.

The class ArchiveIndexer, method rebuildIndex.

/**
     * Rebuilds the search index with all archived conversation data. This method returns
     * a Future that represents the status of the index rebuild process (also available
     * via {@link #getIndexRebuildProgress()}). The integer value
     * (values 0 through 100) represents the percentage of work done. If message archiving
     * is disabled, this method will return <tt>null</tt>.
     *
     * @return a Future to indicate the status of rebuilding the index or <tt>null</tt> if
     *      rebuilding the index is not possible.
     */
public synchronized Future<Integer> rebuildIndex() {
    // Immediately return if the service has been stopped.
    if (stopped) {
        return null;
    }
    // If a rebuild is already happening, return.
    if (rebuildInProgress) {
        return null;
    }
    // Do nothing if archiving is disabled. This check must come BEFORE claiming
    // the rebuild flag: setting rebuildInProgress first and then returning here
    // would leave the flag stuck at true, permanently blocking future rebuilds.
    if (!conversationManager.isArchivingEnabled()) {
        return null;
    }
    rebuildInProgress = true;
    // Create a future to track the index rebuild progress.
    rebuildFuture = new RebuildFuture();
    // Create a runnable that will perform the actual rebuild work.
    Runnable rebuildTask = new Runnable() {

        public void run() {
            List<Long> conversationIDs = new ArrayList<Long>();
            Map<Long, Boolean> externalMetaData = new HashMap<Long, Boolean>();
            Connection con = null;
            PreparedStatement pstmt = null;
            ResultSet rs = null;
            try {
                // Load the ID of every archived conversation, plus a 0/1 flag
                // stored per conversation in the second column.
                con = DbConnectionManager.getConnection();
                pstmt = con.prepareStatement(ALL_CONVERSATIONS);
                rs = pstmt.executeQuery();
                while (rs.next()) {
                    long conversationID = rs.getLong(1);
                    conversationIDs.add(conversationID);
                    externalMetaData.put(conversationID, rs.getInt(2) == 1);
                }
            } catch (SQLException sqle) {
                Log.error(sqle.getMessage(), sqle);
            } finally {
                DbConnectionManager.closeConnection(rs, pstmt, con);
            }
            if (!conversationIDs.isEmpty()) {
                // Index the conversations. The final 'true' argument recreates
                // the index from scratch rather than appending.
                writerLock.lock();
                IndexModifier writer = null;
                try {
                    writer = new IndexModifier(directory, new StandardAnalyzer(), true);
                    long newestDate = indexConversations(conversationIDs, externalMetaData, writer, true);
                    writer.optimize();
                    // Done indexing so store a last modified date.
                    if (newestDate != -1) {
                        lastModified = newestDate;
                        indexProperties.setProperty("lastModified", Long.toString(lastModified));
                    }
                } catch (IOException ioe) {
                    Log.error(ioe.getMessage(), ioe);
                } finally {
                    if (writer != null) {
                        try {
                            writer.close();
                        } catch (Exception e) {
                            Log.error(e.getMessage(), e);
                        }
                    }
                    writerLock.unlock();
                }
            }
            // Done rebuilding the index, so reset state.
            rebuildFuture = null;
            rebuildInProgress = false;
        }
    };
    taskEngine.submit(rebuildTask);
    return rebuildFuture;
}
Also used : HashMap(java.util.HashMap) SQLException(java.sql.SQLException) ArrayList(java.util.ArrayList) Connection(java.sql.Connection) PreparedStatement(java.sql.PreparedStatement) IOException(java.io.IOException) TimeoutException(java.util.concurrent.TimeoutException) SQLException(java.sql.SQLException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) IndexModifier(org.apache.lucene.index.IndexModifier) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) ResultSet(java.sql.ResultSet)

Example 19 with StandardAnalyzer

use of org.apache.lucene.analysis.standard.StandardAnalyzer in project Openfire by igniterealtime.

The class ArchiveIndexer, method updateIndex.

/**
     * Updates the search index with all new conversation data since the last index update.
     * Does nothing when the service has been stopped, when archiving is disabled, or
     * while a full index rebuild is in progress.
     */
public void updateIndex() {
    // Immediately return if the service has been stopped.
    if (stopped) {
        return;
    }
    // Do nothing if archiving is disabled.
    if (!conversationManager.isArchivingEnabled()) {
        return;
    }
    // If we're currently rebuilding the index, return.
    if (rebuildInProgress) {
        return;
    }
    writerLock.lock();
    IndexModifier writer = null;
    try {
        // Open the existing index for modification; the final 'false' argument
        // appends to the index rather than recreating it.
        writer = new IndexModifier(directory, new StandardAnalyzer(), false);
        // Find every conversation touched since the last recorded update time.
        List<Long> conversationIDs = new ArrayList<Long>();
        Connection con = null;
        PreparedStatement pstmt = null;
        ResultSet rs = null;
        try {
            con = DbConnectionManager.getConnection();
            pstmt = con.prepareStatement(NEW_CONVERSATIONS);
            pstmt.setLong(1, lastModified);
            rs = pstmt.executeQuery();
            while (rs.next()) {
                conversationIDs.add(rs.getLong(1));
            }
        } catch (SQLException sqle) {
            Log.error(sqle.getMessage(), sqle);
        } finally {
            DbConnectionManager.closeConnection(rs, pstmt, con);
        }
        // Remove any existing index entries for conversations that were
        // updated since then; they are re-indexed in full below.
        for (long conversationID : conversationIDs) {
            writer.deleteDocuments(new Term("conversationID", Long.toString(conversationID)));
        }
        // Load meta-data for each conversation.
        // NOTE(review): this opens a new connection per conversation ID.
        Map<Long, Boolean> externalMetaData = new HashMap<Long, Boolean>();
        for (long conversationID : conversationIDs) {
            try {
                con = DbConnectionManager.getConnection();
                pstmt = con.prepareStatement(CONVERSATION_METADATA);
                pstmt.setLong(1, conversationID);
                rs = pstmt.executeQuery();
                while (rs.next()) {
                    // Column 1 is a 0/1 flag stored per conversation -- its exact
                    // meaning is defined by CONVERSATION_METADATA; confirm against
                    // the schema.
                    externalMetaData.put(conversationID, rs.getInt(1) == 1);
                }
            } catch (SQLException sqle) {
                Log.error(sqle.getMessage(), sqle);
            } finally {
                DbConnectionManager.closeConnection(rs, pstmt, con);
            }
        }
        // Now index all the new conversations.
        long newestDate = indexConversations(conversationIDs, externalMetaData, writer, false);
        writer.optimize();
        // Done indexing so store a last modified date.
        if (newestDate != -1) {
            lastModified = newestDate;
            indexProperties.setProperty("lastModified", Long.toString(lastModified));
        }
    } catch (IOException ioe) {
        Log.error(ioe.getMessage(), ioe);
    } finally {
        // Always close the writer and release the lock, even on failure.
        if (writer != null) {
            try {
                writer.close();
            } catch (Exception e) {
                Log.error(e.getMessage(), e);
            }
        }
        writerLock.unlock();
    }
}
Also used : SQLException(java.sql.SQLException) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Connection(java.sql.Connection) PreparedStatement(java.sql.PreparedStatement) Term(org.apache.lucene.index.Term) IOException(java.io.IOException) TimeoutException(java.util.concurrent.TimeoutException) SQLException(java.sql.SQLException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) IndexModifier(org.apache.lucene.index.IndexModifier) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) ResultSet(java.sql.ResultSet)

Example 20 with StandardAnalyzer

use of org.apache.lucene.analysis.standard.StandardAnalyzer in project languagetool by languagetool-org.

The class SentenceSourceIndexer, method main.

/**
     * Command-line entry point: indexes sentences from Wikipedia/Tatoeba dump
     * files into a Lucene index directory.
     *
     * Expects exactly five arguments: data files, index directory, language
     * code, maximum sentence count (0 = unlimited), and whether to index POS
     * tags (1) or plain text only (0).
     */
public static void main(String... args) throws Exception {
    if (args.length != 5) {
        System.out.println("Usage: " + SentenceSourceIndexer.class.getSimpleName() + " <dataFile...> <indexDir> <languageCode> <maxSentences> <indexPosTags>");
        System.out.println("\t<dataFiles> comma-separated list of a Wikipedia XML dump (*.xml) and/or Tatoeba files (tatoeba-*)");
        System.out.println("\t<indexDir> directory where Lucene index will be written to, existing index content will be removed");
        System.out.println("\t<languageCode> short code like en for English, de for German etc");
        System.out.println("\t<maxSentences> maximum number of sentences to be indexed, use 0 for no limit");
        System.out.println("\t<indexPosTags> 1 to also index POS tags (i.e. analyze text by LT), 0 to index only the plain text");
        System.exit(1);
    }
    List<String> dumpFilesNames = Arrays.asList(args[0].split(","));
    File indexDir = new File(args[1]);
    String languageCode = args[2];
    int maxSentences = Integer.parseInt(args[3]);
    Language language = Languages.getLanguageForShortCode(languageCode);
    if (maxSentences == 0) {
        System.out.println("Going to index contents from " + dumpFilesNames);
    } else {
        System.out.println("Going to index up to " + maxSentences + " sentences from " + dumpFilesNames);
    }
    System.out.println("Output index dir: " + indexDir);
    long start = System.currentTimeMillis();
    Analyzer analyzer;
    String indexPos = args[4];
    if (indexPos.equals("1")) {
        // this will use LanguageToolAnalyzer
        analyzer = null;
    } else if (indexPos.equals("0")) {
        // Empty stop-word set: keep every token of the plain text.
        analyzer = new StandardAnalyzer(new CharArraySet(Collections.emptyList(), false));
    } else {
        throw new IllegalArgumentException("Unknown value '" + indexPos + "' for indexPosTags parameter, use 0 or 1");
    }
    try (FSDirectory fsDirectory = FSDirectory.open(indexDir.toPath());
        SentenceSourceIndexer indexer = new SentenceSourceIndexer(fsDirectory, language, maxSentences, analyzer)) {
        try {
            indexer.run(dumpFilesNames, language);
        } catch (DocumentLimitReachedException e) {
            System.out.println("Sentence limit (" + e.getLimit() + ") reached, stopping indexing");
        } finally {
            indexer.writeMetaDocuments();
        }
    } finally {
        // Close the analyzer even when indexing fails; closing only on the
        // success path would leak it on any exception above.
        if (analyzer != null) {
            analyzer.close();
        }
    }
    long end = System.currentTimeMillis();
    float minutes = (end - start) / (float) (1000 * 60);
    System.out.printf("Indexing took %.2f minutes\n", minutes);
}
Also used : CharArraySet(org.apache.lucene.analysis.util.CharArraySet) FSDirectory(org.apache.lucene.store.FSDirectory) Analyzer(org.apache.lucene.analysis.Analyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) Language(org.languagetool.Language) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) File(java.io.File)

Aggregations

StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)112 Analyzer (org.apache.lucene.analysis.Analyzer)37 IndexWriter (org.apache.lucene.index.IndexWriter)36 Document (org.apache.lucene.document.Document)29 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)29 IndexSearcher (org.apache.lucene.search.IndexSearcher)24 Term (org.apache.lucene.index.Term)22 RAMDirectory (org.apache.lucene.store.RAMDirectory)21 Test (org.junit.Test)21 Query (org.apache.lucene.search.Query)20 BooleanQuery (org.apache.lucene.search.BooleanQuery)19 TermQuery (org.apache.lucene.search.TermQuery)19 IOException (java.io.IOException)16 Before (org.junit.Before)15 IndexReader (org.apache.lucene.index.IndexReader)14 HashMap (java.util.HashMap)13 Field (org.apache.lucene.document.Field)13 ArrayList (java.util.ArrayList)12 QueryParser (org.apache.lucene.queryparser.classic.QueryParser)12 Directory (org.apache.lucene.store.Directory)12