Search in sources :

Example 6 with LevensteinDistance

Use of org.apache.lucene.search.spell.LevensteinDistance in project lucene-solr by Apache.

The class SolrSpellChecker, method mergeSuggestions.

/**
 * Integrate spelling suggestions from the various shards in a distributed environment.
 *
 * <p>For each original term in {@code mergeData.origVsSuggested}, every candidate suggestion is
 * re-scored against the original with this checker's {@link StringDistance} (falling back to
 * {@link LevensteinDistance} when none is available), candidates scoring below the accuracy
 * threshold are dropped, and the best {@code count} survivors are added to the result.
 * Terms that were original to the query but were not reported by every responding shard are
 * skipped entirely.
 *
 * <p>Note: the {@code score} field of each {@link SuggestWord} held in
 * {@code mergeData.suggestedVsWord} is overwritten as a side effect of re-scoring.
 *
 * @param mergeData the per-shard suggestion data collected for this request
 * @param numSug capacity of the per-term suggestion queue
 * @param count maximum number of suggestions to return per term
 * @param extendedResults when {@code true}, the original term's frequency (if known) and each
 *     suggestion's frequency are added to the result; otherwise only the suggestion strings are
 * @return the merged suggestions, one entry per retained original term
 */
public SpellingResult mergeSuggestions(SpellCheckMergeData mergeData, int numSug, int count, boolean extendedResults) {
    float accuracy = 0.5f;
    try {
        accuracy = getAccuracy();
    } catch (UnsupportedOperationException uoe) {
        //just use .5 as a default
    }
    StringDistance sd;
    try {
        StringDistance configured = getStringDistance();
        sd = configured == null ? new LevensteinDistance() : configured;
    } catch (UnsupportedOperationException uoe) {
        sd = new LevensteinDistance();
    }
    SpellingResult result = new SpellingResult();
    for (Map.Entry<String, HashSet<String>> entry : mergeData.origVsSuggested.entrySet()) {
        String original = entry.getKey();
        //Only use this suggestion if all shards reported it as misspelled, 
        //unless it was not a term original to the user's query
        //(WordBreakSolrSpellChecker can add new terms to the response, and we want to keep these)
        Integer numShards = mergeData.origVsShards.get(original);
        if (numShards < mergeData.totalNumberShardResponses && mergeData.isOriginalToQuery(original)) {
            continue;
        }
        HashSet<String> suggested = entry.getValue();
        SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
        // The threshold starts from the configured accuracy for EVERY term. It is raised below
        // once this term's queue fills up, and that raised value must not leak into the
        // filtering of the next term.
        float min = accuracy;
        for (String suggestion : suggested) {
            SuggestWord sug = mergeData.suggestedVsWord.get(suggestion);
            sug.score = sd.getDistance(original, sug.string);
            if (sug.score < min) {
                continue;
            }
            sugQueue.insertWithOverflow(sug);
            if (sugQueue.size() == numSug) {
                // if queue full, maintain the minScore score
                min = sugQueue.top().score;
            }
        }
        // create token
        SpellCheckResponse.Suggestion suggestion = mergeData.origVsSuggestion.get(original);
        Token token = new Token(original, suggestion.getStartOffset(), suggestion.getEndOffset());
        // get top 'count' suggestions out of 'sugQueue.size()' candidates
        int kept = Math.min(count, sugQueue.size());
        SuggestWord[] suggestions = new SuggestWord[kept];
        // Skip the surplus candidates that pop first. The surplus is computed once, up front:
        // the queue shrinks on every pop, so re-reading its size in the loop condition would
        // stop after discarding only about half of them.
        int surplus = sugQueue.size() - kept;
        for (int k = 0; k < surplus; k++) {
            sugQueue.pop();
        }
        // now collect the top 'count' responses, filling the array back to front
        for (int k = kept - 1; k >= 0; k--) {
            suggestions[k] = sugQueue.pop();
        }
        if (extendedResults) {
            Integer o = mergeData.origVsFreq.get(original);
            if (o != null) {
                result.addFrequency(token, o);
            }
            for (SuggestWord word : suggestions) {
                result.add(token, word.string, word.freq);
            }
        } else {
            List<String> words = new ArrayList<>(suggestions.length);
            for (SuggestWord word : suggestions) {
                words.add(word.string);
            }
            result.add(token, words);
        }
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Token(org.apache.lucene.analysis.Token) LevensteinDistance(org.apache.lucene.search.spell.LevensteinDistance) SpellCheckResponse(org.apache.solr.client.solrj.response.SpellCheckResponse) StringDistance(org.apache.lucene.search.spell.StringDistance) SuggestWordQueue(org.apache.lucene.search.spell.SuggestWordQueue) SuggestWord(org.apache.lucene.search.spell.SuggestWord) Map(java.util.Map) HashSet(java.util.HashSet)

Aggregations

LevensteinDistance (org.apache.lucene.search.spell.LevensteinDistance)6 ArrayList (java.util.ArrayList)4 HashSet (java.util.HashSet)3 IOException (java.io.IOException)2 Collections (java.util.Collections)2 List (java.util.List)2 Map (java.util.Map)2 Set (java.util.Set)2 Collectors (java.util.stream.Collectors)2 StringDistance (org.apache.lucene.search.spell.StringDistance)2 SuggestWord (org.apache.lucene.search.spell.SuggestWord)2 Tuple (org.elasticsearch.common.collect.Tuple)2 BufferedReader (java.io.BufferedReader)1 File (java.io.File)1 InputStream (java.io.InputStream)1 InputStreamReader (java.io.InputStreamReader)1 OutputStream (java.io.OutputStream)1 URL (java.net.URL)1 URLConnection (java.net.URLConnection)1 URLDecoder (java.net.URLDecoder)1