Search in sources :

Example 81 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

From the class DirectSolrSpellChecker, the method getSuggestions:

/**
 * Looks up spelling suggestions for each token in the request, using the
 * DirectSpellChecker against the configured field.
 *
 * @param options carries the tokens to check, the index reader, the requested
 *                counts, and the accuracy threshold
 * @return a SpellingResult mapping each token to its suggestions (with index
 *         frequencies) and its own document frequency
 * @throws IOException if reading term frequencies from the index fails
 */
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    // Parameterized logging: the message string is only built when debug is enabled.
    LOG.debug("getSuggestions: {}", options.tokens);
    SpellingResult result = new SpellingResult();
    // Float.MIN_VALUE is the "not set" sentinel; fall back to the checker's configured accuracy.
    float accuracy = (options.accuracy == Float.MIN_VALUE) ? checker.getAccuracy() : options.accuracy;
    for (Token token : options.tokens) {
        String tokenText = token.toString();
        Term term = new Term(field, tokenText);
        int freq = options.reader.docFreq(term);
        // When the original term exists in the index and alternatives were requested,
        // ask for alternativeTermCount suggestions instead of the normal count.
        int count = (options.alternativeTermCount > 0 && freq > 0) ? options.alternativeTermCount : options.count;
        SuggestWord[] suggestions = checker.suggestSimilar(term, count, options.reader, options.suggestMode, accuracy);
        result.addFrequency(token, freq);
        // If the term exists in the index, keep the original as a viable suggestion
        // unless the checker already returned it.
        if (options.alternativeTermCount > 0 && freq > 0) {
            boolean foundOriginal = false;
            for (SuggestWord suggestion : suggestions) {
                if (suggestion.string.equals(tokenText)) {
                    foundOriginal = true;
                    break;
                }
            }
            if (!foundOriginal) {
                // Prepend the original term, preserving the checker's ranking for the rest.
                // (Allocate only when actually needed.)
                SuggestWord[] suggestionsWithOrig = new SuggestWord[suggestions.length + 1];
                SuggestWord orig = new SuggestWord();
                orig.freq = freq;
                orig.string = tokenText;
                suggestionsWithOrig[0] = orig;
                System.arraycopy(suggestions, 0, suggestionsWithOrig, 1, suggestions.length);
                suggestions = suggestionsWithOrig;
            }
        }
        if (suggestions.length == 0 && freq == 0) {
            // Term is absent from the index and nothing similar was found:
            // report it as misspelled with an empty suggestion list.
            List<String> empty = Collections.emptyList();
            result.add(token, empty);
        } else {
            for (SuggestWord suggestion : suggestions) {
                result.add(token, suggestion.string, suggestion.freq);
            }
        }
    }
    return result;
}
Also used : SuggestWord(org.apache.lucene.search.spell.SuggestWord) Token(org.apache.lucene.analysis.Token) Term(org.apache.lucene.index.Term)

Example 82 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

From the class SolrSpellChecker, the method mergeSuggestions:

/**
   * Integrate spelling suggestions from the various shards in a distributed environment.
   * <p>
   * Candidates for each original term are re-scored by string distance to the
   * original, ranked in a bounded priority queue of capacity {@code numSug},
   * and the top {@code count} survivors are reported per term.
   *
   * @param mergeData aggregated per-shard data: original terms, candidate suggestions,
   *        frequencies, and how many shards reported each original
   * @param numSug capacity of the ranking queue (candidates kept while merging)
   * @param count maximum number of suggestions returned per original term
   * @param extendedResults if true, include document frequencies in the result
   */
public SpellingResult mergeSuggestions(SpellCheckMergeData mergeData, int numSug, int count, boolean extendedResults) {
    // Minimum acceptable similarity score; defaults to 0.5 if the checker
    // does not expose an accuracy setting.
    float min = 0.5f;
    try {
        min = getAccuracy();
    } catch (UnsupportedOperationException uoe) {
    //just use .5 as a default
    }
    // String-distance measure for re-scoring; fall back to Levenstein when the
    // checker provides none (or does not support the operation).
    StringDistance sd = null;
    try {
        sd = getStringDistance() == null ? new LevensteinDistance() : getStringDistance();
    } catch (UnsupportedOperationException uoe) {
        sd = new LevensteinDistance();
    }
    SpellingResult result = new SpellingResult();
    for (Map.Entry<String, HashSet<String>> entry : mergeData.origVsSuggested.entrySet()) {
        String original = entry.getKey();
        //Only use this suggestion if all shards reported it as misspelled, 
        //unless it was not a term original to the user's query
        //(WordBreakSolrSpellChecker can add new terms to the response, and we want to keep these)
        // NOTE(review): assumes origVsShards has an entry for every original —
        // unboxing the Integer would NPE otherwise. Confirm against callers.
        Integer numShards = mergeData.origVsShards.get(original);
        if (numShards < mergeData.totalNumberShardResponses && mergeData.isOriginalToQuery(original)) {
            continue;
        }
        HashSet<String> suggested = entry.getValue();
        SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
        for (String suggestion : suggested) {
            // NOTE(review): assumes every suggested term has an entry in
            // suggestedVsWord — NPE otherwise. Confirm against callers.
            SuggestWord sug = mergeData.suggestedVsWord.get(suggestion);
            // Re-score by distance to the original term (shard scores are not comparable).
            sug.score = sd.getDistance(original, sug.string);
            // Drop candidates below the accuracy threshold.
            if (sug.score < min)
                continue;
            sugQueue.insertWithOverflow(sug);
            if (sugQueue.size() == numSug) {
                // if queue full, maintain the minScore score
                min = sugQueue.top().score;
            }
        }
        // create token
        SpellCheckResponse.Suggestion suggestion = mergeData.origVsSuggestion.get(original);
        Token token = new Token(original, suggestion.getStartOffset(), suggestion.getEndOffset());
        // get top 'count' suggestions out of 'sugQueue.size()' candidates
        SuggestWord[] suggestions = new SuggestWord[Math.min(count, sugQueue.size())];
        // skip the first sugQueue.size() - count elements
        for (int k = 0; k < sugQueue.size() - count; k++) sugQueue.pop();
        // now collect the top 'count' responses
        // (queue pops lowest-scored first, so fill the array back-to-front
        // to end up in descending score order)
        for (int k = Math.min(count, sugQueue.size()) - 1; k >= 0; k--) {
            suggestions[k] = sugQueue.pop();
        }
        if (extendedResults) {
            Integer o = mergeData.origVsFreq.get(original);
            if (o != null)
                result.addFrequency(token, o);
            for (SuggestWord word : suggestions) result.add(token, word.string, word.freq);
        } else {
            List<String> words = new ArrayList<>(sugQueue.size());
            for (SuggestWord word : suggestions) words.add(word.string);
            result.add(token, words);
        }
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Token(org.apache.lucene.analysis.Token) LevensteinDistance(org.apache.lucene.search.spell.LevensteinDistance) SpellCheckResponse(org.apache.solr.client.solrj.response.SpellCheckResponse) StringDistance(org.apache.lucene.search.spell.StringDistance) SuggestWordQueue(org.apache.lucene.search.spell.SuggestWordQueue) SuggestWord(org.apache.lucene.search.spell.SuggestWord) Map(java.util.Map) HashSet(java.util.HashSet)

Example 83 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

From the class AbstractLuceneSpellChecker, the method getSuggestions:

/**
 * Looks up spelling suggestions for each token using the Lucene SpellChecker,
 * optionally consulting the main index for document frequencies
 * (extendedResults) and keeping the original term when it exists in the index.
 */
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    SpellingResult result = new SpellingResult(options.tokens);
    IndexReader reader = determineReader(options.reader);
    Term term = field != null ? new Term(field, "") : null;
    // Float.MIN_VALUE is the "not set" sentinel; fall back to the checker's accuracy.
    float theAccuracy = (options.accuracy == Float.MIN_VALUE) ? spellChecker.getAccuracy() : options.accuracy;
    // Always ask for at least DEFAULT_SUGGESTION_COUNT candidates.
    int count = Math.max(options.count, AbstractLuceneSpellChecker.DEFAULT_SUGGESTION_COUNT);
    for (Token token : options.tokens) {
        String tokenText = new String(token.buffer(), 0, token.length());
        term = new Term(field, tokenText);
        int docFreq = 0;
        if (reader != null) {
            docFreq = reader.docFreq(term);
        }
        String[] suggestions = spellChecker.suggestSimilar(tokenText, ((options.alternativeTermCount == 0 || docFreq == 0) ? count : // workaround LUCENE-1295
        options.alternativeTermCount), // workaround LUCENE-1295
        field != null ? reader : null, field, options.suggestMode, theAccuracy);
        if (suggestions.length == 1 && suggestions[0].equals(tokenText) && options.alternativeTermCount == 0) {
            // These are spelled the same, continue on
            continue;
        }
        // If the term exists in the index, keep the
        // original as a viable suggestion.
        if (options.alternativeTermCount > 0 && docFreq > 0) {
            boolean foundOriginal = false;
            String[] suggestionsWithOrig = new String[suggestions.length + 1];
            // Copy suggestions into slots 1..n; if the original is found, the
            // partially-filled array is simply discarded (foundOriginal guards below),
            // so the early break is safe.
            for (int i = 0; i < suggestions.length; i++) {
                if (suggestions[i].equals(tokenText)) {
                    foundOriginal = true;
                    break;
                }
                suggestionsWithOrig[i + 1] = suggestions[i];
            }
            if (!foundOriginal) {
                // Prepend the original term ahead of the checker's suggestions.
                suggestionsWithOrig[0] = tokenText;
                suggestions = suggestionsWithOrig;
            }
        }
        if (options.extendedResults == true && reader != null && field != null) {
            // Extended results: report each suggestion with its document frequency.
            result.addFrequency(token, docFreq);
            int countLimit = Math.min(options.count, suggestions.length);
            if (countLimit > 0) {
                for (int i = 0; i < countLimit; i++) {
                    term = new Term(field, suggestions[i]);
                    result.add(token, suggestions[i], reader.docFreq(term));
                }
            } else {
                List<String> suggList = Collections.emptyList();
                result.add(token, suggList);
            }
        } else {
            // Plain results: report at most options.count suggestion strings.
            if (suggestions.length > 0) {
                List<String> suggList = Arrays.asList(suggestions);
                if (suggestions.length > options.count) {
                    suggList = suggList.subList(0, options.count);
                }
                result.add(token, suggList);
            } else {
                List<String> suggList = Collections.emptyList();
                result.add(token, suggList);
            }
        }
    }
    return result;
}
Also used : IndexReader(org.apache.lucene.index.IndexReader) Token(org.apache.lucene.analysis.Token) Term(org.apache.lucene.index.Term)

Example 84 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

From the class DummyCustomParamSpellChecker, the method getSuggestions:

/**
 * Test stub: echoes each custom parameter back as a one-element suggestion
 * list, keyed by a synthetic token. Parameter names are sorted so the output
 * ordering is deterministic.
 */
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    SpellingResult result = new SpellingResult();
    // Collect the custom parameter names, then sort for predictable ordering.
    List<String> paramNames = new ArrayList<>();
    for (Iterator<String> it = options.customParams.getParameterNamesIterator(); it.hasNext(); ) {
        paramNames.add(it.next());
    }
    Collections.sort(paramNames);
    // Emit each param as token(name, offset, offset+1) -> [value], stepping offsets by 2.
    int offset = 0;
    for (String paramName : paramNames) {
        Token key = new Token(paramName, offset, offset + 1);
        result.add(key, Collections.singletonList(options.customParams.get(paramName)));
        offset += 2;
    }
    return result;
}
Also used : SpellingResult(org.apache.solr.spelling.SpellingResult) ArrayList(java.util.ArrayList) Token(org.apache.lucene.analysis.Token)

Example 85 with Token

use of org.apache.lucene.analysis.Token in project lucene-solr by apache.

From the class TestTermAutomatonQuery, the helper method token:

/** Builds a Token spanning offsets [0, term.length()) with the given position increment and position length. */
private static Token token(String term, int posInc, int posLength) {
    Token result = new Token(term, 0, term.length());
    result.setPositionIncrement(posInc);
    result.setPositionLength(posLength);
    return result;
}
Also used : Token(org.apache.lucene.analysis.Token)

Aggregations

Token (org.apache.lucene.analysis.Token)100 CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)39 TokenStream (org.apache.lucene.analysis.TokenStream)31 Directory (org.apache.lucene.store.Directory)24 Test (org.junit.Test)23 Document (org.apache.lucene.document.Document)19 TextField (org.apache.lucene.document.TextField)19 BytesRef (org.apache.lucene.util.BytesRef)16 NamedList (org.apache.solr.common.util.NamedList)16 StringReader (java.io.StringReader)15 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)15 Analyzer (org.apache.lucene.analysis.Analyzer)14 ArrayList (java.util.ArrayList)13 Map (java.util.Map)13 Field (org.apache.lucene.document.Field)13 FieldType (org.apache.lucene.document.FieldType)11 IndexReader (org.apache.lucene.index.IndexReader)11 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)10 Tokenizer (org.apache.lucene.analysis.Tokenizer)9 Date (java.util.Date)8