Search in sources:

Example 6 with SuggestWord

use of org.apache.lucene.search.spell.SuggestWord in project lucene-solr by apache.

From the class DirectSolrSpellChecker, method init.

/**
 * Initializes this spell checker from its configuration and pushes the resulting
 * settings into the underlying {@code checker}.
 *
 * <p>The comparator and string distance are read from {@code config}; the numeric
 * settings are read through a {@link SolrParams} view of the same list. Any setting
 * that is absent falls back to the corresponding {@code DEFAULT_*} constant.
 *
 * @param config the configuration entries for this spell checker
 * @param core   the core whose resource loader instantiates custom comparator or
 *               string-distance classes
 * @return the name returned by {@code super.init(config, core)}
 */
@Override
public String init(NamedList config, SolrCore core) {
    SolrParams params = SolrParams.toSolrParams(config);
    LOG.info("init: " + config);
    String name = super.init(config, core);

    // Comparator: absent or "score" keeps the default, "freq" sorts by frequency,
    // anything else is treated as a fully qualified class name.
    String comparatorName = (String) config.get(COMPARATOR_CLASS);
    Comparator<SuggestWord> comparator;
    if (comparatorName == null || comparatorName.equalsIgnoreCase(SCORE_COMP)) {
        comparator = SuggestWordQueue.DEFAULT_COMPARATOR;
    } else if (comparatorName.equalsIgnoreCase(FREQ_COMP)) {
        comparator = new SuggestWordFrequencyComparator();
    } else {
        comparator = (Comparator<SuggestWord>) core.getResourceLoader().newInstance(comparatorName, Comparator.class);
    }

    // String distance: absent or the "internal" marker keeps the built-in Levenshtein;
    // any other value is loaded as a class.
    String distanceName = (String) config.get(STRING_DISTANCE);
    StringDistance distance;
    if (distanceName == null || distanceName.equalsIgnoreCase(INTERNAL_DISTANCE)) {
        distance = DirectSpellChecker.INTERNAL_LEVENSHTEIN;
    } else {
        distance = core.getResourceLoader().newInstance(distanceName, StringDistance.class);
    }

    // Numeric settings: each getter yields null when the parameter is not configured.
    Float configuredAccuracy = params.getFloat(ACCURACY);
    float accuracyValue = (configuredAccuracy != null) ? configuredAccuracy : DEFAULT_ACCURACY;

    Integer configuredEdits = params.getInt(MAXEDITS);
    int editsValue = (configuredEdits != null) ? configuredEdits : DEFAULT_MAXEDITS;

    Integer configuredPrefix = params.getInt(MINPREFIX);
    int prefixValue = (configuredPrefix != null) ? configuredPrefix : DEFAULT_MINPREFIX;

    Integer configuredInspections = params.getInt(MAXINSPECTIONS);
    int inspectionsValue = (configuredInspections != null) ? configuredInspections : DEFAULT_MAXINSPECTIONS;

    Float configuredThreshold = params.getFloat(THRESHOLD_TOKEN_FREQUENCY);
    float thresholdValue = (configuredThreshold != null) ? configuredThreshold : DEFAULT_THRESHOLD_TOKEN_FREQUENCY;

    Integer configuredQueryLength = params.getInt(MINQUERYLENGTH);
    int queryLengthValue = (configuredQueryLength != null) ? configuredQueryLength : DEFAULT_MINQUERYLENGTH;

    Float configuredQueryFrequency = params.getFloat(MAXQUERYFREQUENCY);
    float queryFrequencyValue = (configuredQueryFrequency != null) ? configuredQueryFrequency : DEFAULT_MAXQUERYFREQUENCY;

    // Apply everything only after all values have been resolved.
    checker.setComparator(comparator);
    checker.setDistance(distance);
    checker.setMaxEdits(editsValue);
    checker.setMinPrefix(prefixValue);
    checker.setAccuracy(accuracyValue);
    checker.setThresholdFrequency(thresholdValue);
    checker.setMaxInspections(inspectionsValue);
    checker.setMinQueryLength(queryLengthValue);
    checker.setMaxQueryFrequency(queryFrequencyValue);
    checker.setLowerCaseTerms(false);
    return name;
}
Also used : SuggestWordFrequencyComparator(org.apache.lucene.search.spell.SuggestWordFrequencyComparator) StringDistance(org.apache.lucene.search.spell.StringDistance) SuggestWord(org.apache.lucene.search.spell.SuggestWord) SolrParams(org.apache.solr.common.params.SolrParams)

Example 7 with SuggestWord

use of org.apache.lucene.search.spell.SuggestWord in project lucene-solr by apache.

From the class IndexBasedSpellCheckerTest, method testComparator.

/**
 * Checks that the spell checkers registered as "freq" and "fqcn" on the
 * "spellcheck" component expose the expected comparator implementations.
 */
@Test
public void testComparator() throws Exception {
    SpellCheckComponent component = (SpellCheckComponent) h.getCore().getSearchComponent("spellcheck");
    assertNotNull(component);

    // The "freq" checker must use the frequency comparator.
    AbstractLuceneSpellChecker freqChecker = (AbstractLuceneSpellChecker) component.getSpellChecker("freq");
    assertNotNull(freqChecker);
    Comparator<SuggestWord> freqComparator = freqChecker.getSpellChecker().getComparator();
    assertNotNull(freqComparator);
    assertTrue(freqComparator instanceof SuggestWordFrequencyComparator);

    // The "fqcn" checker must use the SampleComparator class.
    AbstractLuceneSpellChecker fqcnChecker = (AbstractLuceneSpellChecker) component.getSpellChecker("fqcn");
    assertNotNull(fqcnChecker);
    Comparator<SuggestWord> fqcnComparator = fqcnChecker.getSpellChecker().getComparator();
    assertNotNull(fqcnComparator);
    assertTrue(fqcnComparator instanceof SampleComparator);
}
Also used : SuggestWordFrequencyComparator(org.apache.lucene.search.spell.SuggestWordFrequencyComparator) SuggestWord(org.apache.lucene.search.spell.SuggestWord) SpellCheckComponent(org.apache.solr.handler.component.SpellCheckComponent) Test(org.junit.Test)

Example 8 with SuggestWord

use of org.apache.lucene.search.spell.SuggestWord in project lucene-solr by apache.

From the class DirectSolrSpellChecker, method getSuggestions.

/**
 * Produces spelling suggestions for every token in {@code options.tokens}.
 *
 * <p>For each token the document frequency of the term is recorded in the result.
 * When {@code options.alternativeTermCount} is positive and the term occurs in the
 * index, the original term is placed in front of the suggestions unless it is
 * already one of them. A token with no suggestions and zero frequency is recorded
 * with an empty suggestion list.
 *
 * @param options the request options (tokens, reader, counts, suggest mode, accuracy)
 * @return the collected suggestions and term frequencies
 * @throws IOException if reading from the index fails
 */
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    LOG.debug("getSuggestions: " + options.tokens);
    SpellingResult result = new SpellingResult();

    // Float.MIN_VALUE is the "not specified" marker: fall back to the checker's accuracy.
    float accuracy = options.accuracy;
    if (accuracy == Float.MIN_VALUE) {
        accuracy = checker.getAccuracy();
    }

    for (Token token : options.tokens) {
        String text = token.toString();
        Term term = new Term(field, text);
        int docFreq = options.reader.docFreq(term);

        // Alternatives for a term that exists in the index are only requested
        // when an alternative-term count was supplied.
        boolean wantAlternatives = options.alternativeTermCount > 0 && docFreq > 0;
        int howMany = wantAlternatives ? options.alternativeTermCount : options.count;

        SuggestWord[] suggestions = checker.suggestSimilar(term, howMany, options.reader, options.suggestMode, accuracy);
        result.addFrequency(token, docFreq);

        // If considering alternatives to an existing term, add the original
        // as a viable suggestion (first in the list) unless it is already there.
        if (wantAlternatives) {
            boolean alreadyPresent = false;
            for (SuggestWord candidate : suggestions) {
                if (candidate.string.equals(text)) {
                    alreadyPresent = true;
                    break;
                }
            }
            if (!alreadyPresent) {
                SuggestWord original = new SuggestWord();
                original.freq = docFreq;
                original.string = text;
                SuggestWord[] withOriginal = new SuggestWord[suggestions.length + 1];
                withOriginal[0] = original;
                System.arraycopy(suggestions, 0, withOriginal, 1, suggestions.length);
                suggestions = withOriginal;
            }
        }

        if (suggestions.length == 0 && docFreq == 0) {
            List<String> none = Collections.emptyList();
            result.add(token, none);
            continue;
        }
        for (SuggestWord suggestion : suggestions) {
            result.add(token, suggestion.string, suggestion.freq);
        }
    }
    return result;
}
Also used : SuggestWord(org.apache.lucene.search.spell.SuggestWord) Token(org.apache.lucene.analysis.Token) Term(org.apache.lucene.index.Term)

Example 9 with SuggestWord

use of org.apache.lucene.search.spell.SuggestWord in project lucene-solr by apache.

From the class SolrSpellChecker, method mergeSuggestions.

/**
 * Integrate spelling suggestions from the various shards in a distributed environment.
 *
 * <p>For every original term in {@code mergeData.origVsSuggested}, each suggested word
 * is re-scored with this checker's string distance (Levenshtein when none is available),
 * words scoring below the accuracy threshold are dropped, and the best {@code count}
 * of the remaining words are added to the result in descending queue order.
 *
 * @param mergeData       the per-shard data collected for the request
 * @param numSug          capacity of the per-term priority queue used while ranking
 * @param count           maximum number of suggestions to return per original term
 * @param extendedResults when true, the original term's frequency (if known) and each
 *                        suggestion's frequency are added; otherwise only the words
 * @return the merged suggestions
 */
public SpellingResult mergeSuggestions(SpellCheckMergeData mergeData, int numSug, int count, boolean extendedResults) {
    float baseMin = 0.5f;
    try {
        baseMin = getAccuracy();
    } catch (UnsupportedOperationException uoe) {
        //just use .5 as a default
    }
    StringDistance sd = null;
    try {
        sd = getStringDistance();
    } catch (UnsupportedOperationException uoe) {
        // this checker exposes no distance; the default below is used
    }
    if (sd == null) {
        sd = new LevensteinDistance();
    }
    SpellingResult result = new SpellingResult();
    for (Map.Entry<String, HashSet<String>> entry : mergeData.origVsSuggested.entrySet()) {
        String original = entry.getKey();
        //Only use this suggestion if all shards reported it as misspelled, 
        //unless it was not a term original to the user's query
        //(WordBreakSolrSpellChecker can add new terms to the response, and we want to keep these)
        // NOTE(review): assumes every key of origVsSuggested is also present in
        // origVsShards; a missing entry would fail on unboxing here - confirm.
        Integer numShards = mergeData.origVsShards.get(original);
        if (numShards < mergeData.totalNumberShardResponses && mergeData.isOriginalToQuery(original)) {
            continue;
        }
        HashSet<String> suggested = entry.getValue();
        SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
        // The threshold starts from the configured accuracy for EVERY original term;
        // the raised value from a previous term's full queue must not carry over.
        float min = baseMin;
        for (String suggestion : suggested) {
            SuggestWord sug = mergeData.suggestedVsWord.get(suggestion);
            sug.score = sd.getDistance(original, sug.string);
            if (sug.score < min)
                continue;
            sugQueue.insertWithOverflow(sug);
            if (sugQueue.size() == numSug) {
                // if queue full, maintain the minScore score
                min = sugQueue.top().score;
            }
        }
        // create token
        SpellCheckResponse.Suggestion suggestion = mergeData.origVsSuggestion.get(original);
        Token token = new Token(original, suggestion.getStartOffset(), suggestion.getEndOffset());
        // get top 'count' suggestions out of 'sugQueue.size()' candidates
        int candidates = sugQueue.size();
        SuggestWord[] suggestions = new SuggestWord[Math.min(count, candidates)];
        // skip the first candidates - count elements; the bound is fixed up front
        // because pop() shrinks the queue while the loop runs
        int toSkip = candidates - count;
        for (int k = 0; k < toSkip; k++) sugQueue.pop();
        // now collect the top 'count' responses (popped lowest first, so fill backwards)
        for (int k = suggestions.length - 1; k >= 0; k--) {
            suggestions[k] = sugQueue.pop();
        }
        if (extendedResults) {
            Integer o = mergeData.origVsFreq.get(original);
            if (o != null)
                result.addFrequency(token, o);
            for (SuggestWord word : suggestions) result.add(token, word.string, word.freq);
        } else {
            List<String> words = new ArrayList<>(suggestions.length);
            for (SuggestWord word : suggestions) words.add(word.string);
            result.add(token, words);
        }
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Token(org.apache.lucene.analysis.Token) LevensteinDistance(org.apache.lucene.search.spell.LevensteinDistance) SpellCheckResponse(org.apache.solr.client.solrj.response.SpellCheckResponse) StringDistance(org.apache.lucene.search.spell.StringDistance) SuggestWordQueue(org.apache.lucene.search.spell.SuggestWordQueue) SuggestWord(org.apache.lucene.search.spell.SuggestWord) Map(java.util.Map) HashSet(java.util.HashSet)

Example 10 with SuggestWord

use of org.apache.lucene.search.spell.SuggestWord in project lucene-solr by apache.

From the class WordBreakSolrSpellChecker, method getSuggestions.

/**
 * Builds word-break and word-combination suggestions for the request tokens and
 * interleaves them into a single {@link SpellingResult}.
 *
 * <p>Tokens are turned into terms on {@code field}. A separator term is inserted
 * between two adjacent tokens whenever their prohibited/required flags differ or the
 * earlier token is flagged as preceding a new boolean operator. When {@code breakWords}
 * is set, break suggestions are collected per token; when {@code combineWords} is set,
 * combination suggestions are collected over the whole term sequence. The two lists are
 * then merged, preferring the entry with the higher frequency at each step.
 *
 * @param options the request options (tokens, reader, count, suggest mode)
 * @return the interleaved break and combine suggestions
 * @throws IOException if reading from the index fails
 */
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
    IndexReader ir = options.reader;
    int numSuggestions = options.count;
    StringBuilder sb = new StringBuilder();
    Token[] tokenArr = options.tokens.toArray(new Token[options.tokens.size()]);
    // Parallel lists: termArr holds the terms (including separators) and
    // tokenArrWithSeparators holds the matching token, or null for a separator.
    List<Token> tokenArrWithSeparators = new ArrayList<>(options.tokens.size() + 2);
    List<Term> termArr = new ArrayList<>(options.tokens.size() + 2);
    List<ResultEntry> breakSuggestionList = new ArrayList<>();
    List<ResultEntry> noBreakSuggestionList = new ArrayList<>();
    boolean lastOneProhibited = false;
    boolean lastOneRequired = false;
    boolean lastOnePrecedesNewBooleanOp = false;
    for (int i = 0; i < tokenArr.length; i++) {
        boolean prohibited = (tokenArr[i].getFlags() & QueryConverter.PROHIBITED_TERM_FLAG) == QueryConverter.PROHIBITED_TERM_FLAG;
        boolean required = (tokenArr[i].getFlags() & QueryConverter.REQUIRED_TERM_FLAG) == QueryConverter.REQUIRED_TERM_FLAG;
        boolean precedesNewBooleanOp = (tokenArr[i].getFlags() & QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG) == QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
        // Insert a separator when this token's flags differ from the previous
        // token's, or when the previous token preceded a new boolean operator.
        if (i > 0 && (prohibited != lastOneProhibited || required != lastOneRequired || lastOnePrecedesNewBooleanOp)) {
            termArr.add(WordBreakSpellChecker.SEPARATOR_TERM);
            tokenArrWithSeparators.add(null);
        }
        lastOneProhibited = prohibited;
        lastOneRequired = required;
        lastOnePrecedesNewBooleanOp = precedesNewBooleanOp;
        Term thisTerm = new Term(field, tokenArr[i].toString());
        termArr.add(thisTerm);
        tokenArrWithSeparators.add(tokenArr[i]);
        if (breakWords) {
            SuggestWord[][] breakSuggestions = wbsp.suggestWordBreaks(thisTerm, numSuggestions, ir, options.suggestMode, sortMethod);
            if (breakSuggestions.length == 0) {
                // Remember tokens without any break suggestion; they are appended
                // after all real break suggestions below.
                noBreakSuggestionList.add(new ResultEntry(tokenArr[i], null, 0));
            }
            for (SuggestWord[] breakSuggestion : breakSuggestions) {
                sb.delete(0, sb.length());
                boolean firstOne = true;
                int freq = 0;
                for (SuggestWord word : breakSuggestion) {
                    if (!firstOne) {
                        sb.append(" ");
                    }
                    firstOne = false;
                    sb.append(word.string);
                    // The entry's frequency is either the maximum or the sum of the
                    // part frequencies, depending on the configured sort method.
                    if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
                        freq = Math.max(freq, word.freq);
                    } else {
                        freq += word.freq;
                    }
                }
                breakSuggestionList.add(new ResultEntry(tokenArr[i], sb.toString(), freq));
            }
        }
    }
    breakSuggestionList.addAll(noBreakSuggestionList);
    List<ResultEntry> combineSuggestionList = Collections.emptyList();
    if (combineWords) {
        // Only query the index for combinations when combining is enabled; the
        // result would be discarded otherwise.
        CombineSuggestion[] combineSuggestions = wbsp.suggestWordCombinations(termArr.toArray(new Term[termArr.size()]), numSuggestions, ir, options.suggestMode);
        combineSuggestionList = new ArrayList<>(combineSuggestions.length);
        for (CombineSuggestion cs : combineSuggestions) {
            int firstTermIndex = cs.originalTermIndexes[0];
            int lastTermIndex = cs.originalTermIndexes[cs.originalTermIndexes.length - 1];
            sb.delete(0, sb.length());
            // Rebuild the original text covered by this combination.
            // NOTE(review): assumes a combination never spans a separator slot
            // (a null entry in tokenArrWithSeparators) - confirm.
            for (int i = firstTermIndex; i <= lastTermIndex; i++) {
                if (i > firstTermIndex) {
                    sb.append(" ");
                }
                sb.append(tokenArrWithSeparators.get(i).toString());
            }
            Token token = new Token(sb.toString(), tokenArrWithSeparators.get(firstTermIndex).startOffset(), tokenArrWithSeparators.get(lastTermIndex).endOffset());
            combineSuggestionList.add(new ResultEntry(token, cs.suggestion.string, cs.suggestion.freq));
        }
    }
    // Interleave the two lists of suggestions into one SpellingResult
    SpellingResult result = new SpellingResult();
    Iterator<ResultEntry> breakIter = breakSuggestionList.iterator();
    Iterator<ResultEntry> combineIter = combineSuggestionList.iterator();
    ResultEntry lastBreak = breakIter.hasNext() ? breakIter.next() : null;
    ResultEntry lastCombine = combineIter.hasNext() ? combineIter.next() : null;
    int breakCount = 0;
    int combineCount = 0;
    while (lastBreak != null || lastCombine != null) {
        // Emit exactly one entry per iteration: whichever side is left, else the
        // one with the higher frequency, else decide the tie by the counters.
        if (lastBreak == null) {
            addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
            lastCombine = null;
        } else if (lastCombine == null) {
            addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
            lastBreak = null;
        } else if (lastBreak.freq < lastCombine.freq) {
            addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
            lastCombine = null;
        } else if (lastCombine.freq < lastBreak.freq) {
            addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
            lastBreak = null;
        } else if (breakCount >= combineCount) {
            //TODO: Should reverse >= to < ??
            addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
            lastCombine = null;
        } else {
            addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
            lastBreak = null;
        }
        // Refill whichever side was just consumed.
        if (lastBreak == null && breakIter.hasNext()) {
            lastBreak = breakIter.next();
            breakCount++;
        }
        if (lastCombine == null && combineIter.hasNext()) {
            lastCombine = combineIter.next();
            combineCount++;
        }
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) CombineSuggestion(org.apache.lucene.search.spell.CombineSuggestion) Token(org.apache.lucene.analysis.Token) Term(org.apache.lucene.index.Term) IndexReader(org.apache.lucene.index.IndexReader) SuggestWord(org.apache.lucene.search.spell.SuggestWord)

Aggregations

SuggestWord (org.apache.lucene.search.spell.SuggestWord)12 ArrayList (java.util.ArrayList)4 IndexReader (org.apache.lucene.index.IndexReader)4 IOException (java.io.IOException)3 Token (org.apache.lucene.analysis.Token)3 Term (org.apache.lucene.index.Term)3 SuggestWordFrequencyComparator (org.apache.lucene.search.spell.SuggestWordFrequencyComparator)3 AbstractIterator (com.google.common.collect.AbstractIterator)2 Deque (java.util.Deque)2 HashSet (java.util.HashSet)2 Map (java.util.Map)2 Set (java.util.Set)2 PathStoredFieldVisitor (org.apache.jackrabbit.oak.plugins.index.lucene.util.PathStoredFieldVisitor)2 SpellcheckHelper (org.apache.jackrabbit.oak.plugins.index.lucene.util.SpellcheckHelper)2 SuggestHelper (org.apache.jackrabbit.oak.plugins.index.lucene.util.SuggestHelper)2 Filter (org.apache.jackrabbit.oak.spi.query.Filter)2 PropertyRestriction (org.apache.jackrabbit.oak.spi.query.Filter.PropertyRestriction)2 QueryLimits (org.apache.jackrabbit.oak.spi.query.QueryLimits)2 Analyzer (org.apache.lucene.analysis.Analyzer)2 Document (org.apache.lucene.document.Document)2