use of org.apache.lucene.search.spell.SuggestWord in project lucene-solr by apache.
the class DirectSolrSpellChecker method init.
/**
 * Initializes this spell checker from its configuration and pushes the resulting
 * settings onto the underlying {@code checker}.
 *
 * <p>Settings read from {@code config}: {@code COMPARATOR_CLASS}, {@code STRING_DISTANCE},
 * {@code ACCURACY}, {@code MAXEDITS}, {@code MINPREFIX}, {@code MAXINSPECTIONS},
 * {@code THRESHOLD_TOKEN_FREQUENCY}, {@code MINQUERYLENGTH} and {@code MAXQUERYFREQUENCY}.
 * Any setting that is absent falls back to the matching {@code DEFAULT_*} constant
 * (or the built-in comparator / string distance).
 *
 * @param config the configuration entries for this spell checker
 * @param core the core whose resource loader instantiates custom comparator or
 *     string-distance classes
 * @return the name returned by {@code super.init(config, core)}
 */
@Override
public String init(NamedList config, SolrCore core) {
  SolrParams params = SolrParams.toSolrParams(config);
  LOG.info("init: " + config);
  String name = super.init(config, core);

  // Comparator: the score/frequency keywords select built-ins; anything else is
  // treated as a fully-qualified class name and loaded through the resource loader.
  Comparator<SuggestWord> comp;
  String compClass = (String) config.get(COMPARATOR_CLASS);
  if (compClass == null || compClass.equalsIgnoreCase(SCORE_COMP)) {
    comp = SuggestWordQueue.DEFAULT_COMPARATOR;
  } else if (compClass.equalsIgnoreCase(FREQ_COMP)) {
    comp = new SuggestWordFrequencyComparator();
  } else {
    comp = (Comparator<SuggestWord>) core.getResourceLoader().newInstance(compClass, Comparator.class);
  }

  // String distance: keep the internal implementation unless another class is named.
  StringDistance sd;
  String distClass = (String) config.get(STRING_DISTANCE);
  if (distClass == null || distClass.equalsIgnoreCase(INTERNAL_DISTANCE)) {
    sd = DirectSpellChecker.INTERNAL_LEVENSHTEIN;
  } else {
    sd = core.getResourceLoader().newInstance(distClass, StringDistance.class);
  }

  // Numeric settings: each one is looked up and replaced by its default when missing.
  Float accuracy = params.getFloat(ACCURACY);
  float minAccuracy = (accuracy == null) ? DEFAULT_ACCURACY : accuracy;

  Integer edits = params.getInt(MAXEDITS);
  int maxEdits = (edits == null) ? DEFAULT_MAXEDITS : edits;

  Integer prefix = params.getInt(MINPREFIX);
  int minPrefix = (prefix == null) ? DEFAULT_MINPREFIX : prefix;

  Integer inspections = params.getInt(MAXINSPECTIONS);
  int maxInspections = (inspections == null) ? DEFAULT_MAXINSPECTIONS : inspections;

  Float threshold = params.getFloat(THRESHOLD_TOKEN_FREQUENCY);
  float minThreshold = (threshold == null) ? DEFAULT_THRESHOLD_TOKEN_FREQUENCY : threshold;

  Integer queryLength = params.getInt(MINQUERYLENGTH);
  int minQueryLength = (queryLength == null) ? DEFAULT_MINQUERYLENGTH : queryLength;

  Float queryFreq = params.getFloat(MAXQUERYFREQUENCY);
  float maxQueryFrequency = (queryFreq == null) ? DEFAULT_MAXQUERYFREQUENCY : queryFreq;

  checker.setComparator(comp);
  checker.setDistance(sd);
  checker.setMaxEdits(maxEdits);
  checker.setMinPrefix(minPrefix);
  checker.setAccuracy(minAccuracy);
  checker.setThresholdFrequency(minThreshold);
  checker.setMaxInspections(maxInspections);
  checker.setMinQueryLength(minQueryLength);
  checker.setMaxQueryFrequency(maxQueryFrequency);
  checker.setLowerCaseTerms(false);
  return name;
}
use of org.apache.lucene.search.spell.SuggestWord in project lucene-solr by apache.
the class IndexBasedSpellCheckerTest method testComparator.
/**
 * Verifies the comparator types exposed by two configured spell checkers: the one
 * registered as "freq" must use a {@link SuggestWordFrequencyComparator}, and the one
 * registered as "fqcn" must use a {@code SampleComparator}.
 */
@Test
public void testComparator() throws Exception {
  SpellCheckComponent spellcheckComponent =
      (SpellCheckComponent) h.getCore().getSearchComponent("spellcheck");
  assertNotNull(spellcheckComponent);

  // Checker registered under the name "freq".
  AbstractLuceneSpellChecker freqChecker =
      (AbstractLuceneSpellChecker) spellcheckComponent.getSpellChecker("freq");
  assertNotNull(freqChecker);
  Comparator<SuggestWord> freqComparator = freqChecker.getSpellChecker().getComparator();
  assertNotNull(freqComparator);
  assertTrue(freqComparator instanceof SuggestWordFrequencyComparator);

  // Checker registered under the name "fqcn".
  AbstractLuceneSpellChecker fqcnChecker =
      (AbstractLuceneSpellChecker) spellcheckComponent.getSpellChecker("fqcn");
  assertNotNull(fqcnChecker);
  Comparator<SuggestWord> fqcnComparator = fqcnChecker.getSpellChecker().getComparator();
  assertNotNull(fqcnComparator);
  assertTrue(fqcnComparator instanceof SampleComparator);
}
use of org.apache.lucene.search.spell.SuggestWord in project lucene-solr by apache.
the class DirectSolrSpellChecker method getSuggestions.
/**
 * Produces spelling suggestions for every token in {@code options.tokens}.
 *
 * <p>For each token the document frequency of the corresponding term is recorded on the
 * result. When {@code options.alternativeTermCount} is positive and the token itself occurs
 * in the index, {@code alternativeTermCount} suggestions are requested instead of
 * {@code options.count}, and the original token is placed at the front of the suggestion
 * list unless the checker already returned it.
 *
 * @param options the tokens, reader, counts, suggest mode and accuracy to use; an accuracy
 *     of {@code Float.MIN_VALUE} means "use the checker's configured accuracy"
 * @return the suggestions and token frequencies collected for all tokens
 * @throws IOException declared for the index reads performed through {@code options.reader}
 */
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
  LOG.debug("getSuggestions: " + options.tokens);
  SpellingResult result = new SpellingResult();

  float accuracy = options.accuracy;
  if (accuracy == Float.MIN_VALUE) {
    accuracy = checker.getAccuracy();
  }

  for (Token token : options.tokens) {
    String tokenText = token.toString();
    Term term = new Term(field, tokenText);
    int freq = options.reader.docFreq(term);

    // Alternatives are only considered for tokens that already exist in the index.
    boolean considerAlternatives = options.alternativeTermCount > 0 && freq > 0;
    int count = considerAlternatives ? options.alternativeTermCount : options.count;

    SuggestWord[] suggestions =
        checker.suggestSimilar(term, count, options.reader, options.suggestMode, accuracy);
    result.addFrequency(token, freq);

    // If considering alternatives to an indexed term, add the original as a viable
    // suggestion (at the front) when the checker did not return it itself.
    if (considerAlternatives) {
      boolean originalPresent = false;
      for (SuggestWord candidate : suggestions) {
        if (candidate.string.equals(tokenText)) {
          originalPresent = true;
          break;
        }
      }
      if (!originalPresent) {
        SuggestWord orig = new SuggestWord();
        orig.freq = freq;
        orig.string = tokenText;
        SuggestWord[] withOrig = new SuggestWord[suggestions.length + 1];
        withOrig[0] = orig;
        System.arraycopy(suggestions, 0, withOrig, 1, suggestions.length);
        suggestions = withOrig;
      }
    }

    if (suggestions.length > 0 || freq != 0) {
      for (SuggestWord suggestion : suggestions) {
        result.add(token, suggestion.string, suggestion.freq);
      }
    } else {
      // Unknown token with nothing to suggest: register it with an empty list.
      List<String> empty = Collections.emptyList();
      result.add(token, empty);
    }
  }
  return result;
}
use of org.apache.lucene.search.spell.SuggestWord in project lucene-solr by apache.
the class SolrSpellChecker method mergeSuggestions.
/**
 * Integrate spelling suggestions from the various shards in a distributed environment.
 *
 * <p>For every original term in {@code mergeData.origVsSuggested}, the shard-supplied
 * suggestions are re-scored against the original with this checker's string distance,
 * filtered by the minimum accuracy, ranked in a queue bounded to {@code numSug} entries,
 * and the best {@code count} of them are added to the result.
 *
 * <p>The accuracy falls back to 0.5 and the string distance to {@link LevensteinDistance}
 * when this checker does not support (or does not provide) them.
 *
 * @param mergeData the per-term data gathered from the shard responses
 * @param numSug the capacity of the ranking queue used for each original term
 * @param count the maximum number of suggestions to return per original term
 * @param extendedResults when true, the original's frequency (if known) and each
 *     suggestion's frequency are added to the result; otherwise only the suggestion strings
 * @return the merged suggestions
 */
public SpellingResult mergeSuggestions(SpellCheckMergeData mergeData, int numSug, int count, boolean extendedResults) {
  float minAccuracy = 0.5f;
  try {
    minAccuracy = getAccuracy();
  } catch (UnsupportedOperationException ignored) {
    // just use .5 as a default
  }

  StringDistance sd;
  try {
    sd = getStringDistance();
    if (sd == null) {
      sd = new LevensteinDistance();
    }
  } catch (UnsupportedOperationException uoe) {
    sd = new LevensteinDistance();
  }

  SpellingResult result = new SpellingResult();
  for (Map.Entry<String, HashSet<String>> entry : mergeData.origVsSuggested.entrySet()) {
    String original = entry.getKey();

    // Only use this suggestion if all shards reported it as misspelled,
    // unless it was not a term original to the user's query
    // (WordBreakSolrSpellChecker can add new terms to the response, and we want to keep these)
    Integer numShards = mergeData.origVsShards.get(original);
    if (numShards < mergeData.totalNumberShardResponses && mergeData.isOriginalToQuery(original)) {
      continue;
    }

    // The score threshold starts at the configured accuracy for EVERY original term.
    // It is raised while this term's queue is full, and that raised value must not
    // carry over into the filtering of the next term.
    float min = minAccuracy;
    HashSet<String> suggested = entry.getValue();
    SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
    for (String suggestion : suggested) {
      SuggestWord sug = mergeData.suggestedVsWord.get(suggestion);
      sug.score = sd.getDistance(original, sug.string);
      if (sug.score < min) {
        continue;
      }
      sugQueue.insertWithOverflow(sug);
      if (sugQueue.size() == numSug) {
        // if queue full, maintain the minScore score
        min = sugQueue.top().score;
      }
    }

    // create token
    SpellCheckResponse.Suggestion suggestion = mergeData.origVsSuggestion.get(original);
    Token token = new Token(original, suggestion.getStartOffset(), suggestion.getEndOffset());

    // get top 'count' suggestions out of 'sugQueue.size()' candidates
    int kept = Math.min(count, sugQueue.size());
    SuggestWord[] suggestions = new SuggestWord[kept];
    // Skip the surplus elements. The number to skip is fixed up front: pop() shrinks the
    // queue, so re-reading size() in the loop condition would stop after about half of them.
    for (int surplus = sugQueue.size() - kept; surplus > 0; surplus--) {
      sugQueue.pop();
    }
    // now collect the top 'count' responses, filling the array from the back
    for (int k = kept - 1; k >= 0; k--) {
      suggestions[k] = sugQueue.pop();
    }

    if (extendedResults) {
      Integer o = mergeData.origVsFreq.get(original);
      if (o != null) {
        result.addFrequency(token, o);
      }
      for (SuggestWord word : suggestions) {
        result.add(token, word.string, word.freq);
      }
    } else {
      List<String> words = new ArrayList<>(suggestions.length);
      for (SuggestWord word : suggestions) {
        words.add(word.string);
      }
      result.add(token, words);
    }
  }
  return result;
}
use of org.apache.lucene.search.spell.SuggestWord in project lucene-solr by apache.
the class WordBreakSolrSpellChecker method getSuggestions.
/**
 * Builds word-break and word-combination suggestions for the query tokens and interleaves
 * them into a single {@link SpellingResult}.
 *
 * <p>Steps:
 * <ol>
 *   <li>Walk the tokens, building a term list for {@code wbsp}. A
 *       {@code WordBreakSpellChecker.SEPARATOR_TERM} (with a {@code null} placeholder in the
 *       parallel token list) is inserted whenever the prohibited/required flag differs from
 *       the previous token's, or the previous token preceded a new boolean operator.</li>
 *   <li>When {@code breakWords} is set, ask {@code wbsp} for break suggestions per token.
 *       Tokens with no break suggestion get an entry with a {@code null} suggestion and
 *       frequency 0, appended after all real break suggestions.</li>
 *   <li>When {@code combineWords} is set, ask {@code wbsp} for combinations over the term
 *       list and build one entry per combination, spanning the combined tokens.</li>
 *   <li>Merge the two lists head-to-head: the entry with the higher frequency is emitted
 *       first; on a tie, the combine entry goes first while at least as many break entries
 *       as combine entries have been advanced past, otherwise the break entry.</li>
 * </ol>
 *
 * @param options the tokens, reader, suggestion count and suggest mode to use
 * @return the interleaved break and combine suggestions
 * @throws IOException declared for the index reads performed through {@code options.reader}
 */
@Override
public SpellingResult getSuggestions(SpellingOptions options) throws IOException {
  IndexReader ir = options.reader;
  int numSuggestions = options.count;
  StringBuilder sb = new StringBuilder();
  Token[] tokenArr = options.tokens.toArray(new Token[options.tokens.size()]);
  // Parallel lists: index i of tokenArrWithSeparators is the token for termArr[i],
  // or null where termArr[i] is a separator.
  List<Token> tokenArrWithSeparators = new ArrayList<>(options.tokens.size() + 2);
  List<Term> termArr = new ArrayList<>(options.tokens.size() + 2);
  List<ResultEntry> breakSuggestionList = new ArrayList<>();
  List<ResultEntry> noBreakSuggestionList = new ArrayList<>();

  boolean lastOneProhibited = false;
  boolean lastOneRequired = false;
  boolean lastOnePrecedesNewBooleanOp = false;
  for (int i = 0; i < tokenArr.length; i++) {
    Token current = tokenArr[i];
    int flags = current.getFlags();
    boolean prohibited = (flags & QueryConverter.PROHIBITED_TERM_FLAG) == QueryConverter.PROHIBITED_TERM_FLAG;
    boolean required = (flags & QueryConverter.REQUIRED_TERM_FLAG) == QueryConverter.REQUIRED_TERM_FLAG;
    boolean precedesNewBooleanOp = (flags & QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG) == QueryConverter.TERM_PRECEDES_NEW_BOOLEAN_OPERATOR_FLAG;
    if (i > 0 && (prohibited != lastOneProhibited || required != lastOneRequired || lastOnePrecedesNewBooleanOp)) {
      termArr.add(WordBreakSpellChecker.SEPARATOR_TERM);
      tokenArrWithSeparators.add(null);
    }
    lastOneProhibited = prohibited;
    lastOneRequired = required;
    lastOnePrecedesNewBooleanOp = precedesNewBooleanOp;

    Term thisTerm = new Term(field, current.toString());
    termArr.add(thisTerm);
    tokenArrWithSeparators.add(current);

    if (breakWords) {
      SuggestWord[][] breakSuggestions = wbsp.suggestWordBreaks(thisTerm, numSuggestions, ir, options.suggestMode, sortMethod);
      if (breakSuggestions.length == 0) {
        noBreakSuggestionList.add(new ResultEntry(current, null, 0));
      }
      for (SuggestWord[] breakSuggestion : breakSuggestions) {
        sb.setLength(0);
        boolean firstOne = true;
        int freq = 0;
        for (SuggestWord word : breakSuggestion) {
          if (!firstOne) {
            sb.append(" ");
          }
          firstOne = false;
          sb.append(word.string);
          // The entry's frequency is the max or the sum of its parts, per sort method.
          if (sortMethod == BreakSuggestionSortMethod.NUM_CHANGES_THEN_MAX_FREQUENCY) {
            freq = Math.max(freq, word.freq);
          } else {
            freq += word.freq;
          }
        }
        breakSuggestionList.add(new ResultEntry(current, sb.toString(), freq));
      }
    }
  }
  breakSuggestionList.addAll(noBreakSuggestionList);

  List<ResultEntry> combineSuggestionList = Collections.emptyList();
  if (combineWords) {
    // Only consult wbsp for combinations when they are enabled; the result is unused otherwise.
    CombineSuggestion[] combineSuggestions = wbsp.suggestWordCombinations(termArr.toArray(new Term[termArr.size()]), numSuggestions, ir, options.suggestMode);
    combineSuggestionList = new ArrayList<>(combineSuggestions.length);
    for (CombineSuggestion cs : combineSuggestions) {
      int firstTermIndex = cs.originalTermIndexes[0];
      int lastTermIndex = cs.originalTermIndexes[cs.originalTermIndexes.length - 1];
      sb.setLength(0);
      for (int i = firstTermIndex; i <= lastTermIndex; i++) {
        if (i > firstTermIndex) {
          sb.append(" ");
        }
        sb.append(tokenArrWithSeparators.get(i).toString());
      }
      // The combined token covers the first term's start through the last term's end.
      Token token = new Token(sb.toString(), tokenArrWithSeparators.get(firstTermIndex).startOffset(), tokenArrWithSeparators.get(lastTermIndex).endOffset());
      combineSuggestionList.add(new ResultEntry(token, cs.suggestion.string, cs.suggestion.freq));
    }
  }

  // Interleave the two lists of suggestions into one SpellingResult
  SpellingResult result = new SpellingResult();
  Iterator<ResultEntry> breakIter = breakSuggestionList.iterator();
  Iterator<ResultEntry> combineIter = combineSuggestionList.iterator();
  ResultEntry lastBreak = breakIter.hasNext() ? breakIter.next() : null;
  ResultEntry lastCombine = combineIter.hasNext() ? combineIter.next() : null;
  int breakCount = 0;
  int combineCount = 0;
  while (lastBreak != null || lastCombine != null) {
    // Decide which pending entry to emit next.
    boolean takeCombine;
    if (lastBreak == null) {
      takeCombine = true;
    } else if (lastCombine == null) {
      takeCombine = false;
    } else if (lastBreak.freq < lastCombine.freq) {
      takeCombine = true;
    } else if (lastCombine.freq < lastBreak.freq) {
      takeCombine = false;
    } else {
      //TODO: Should reverse >= to < ??
      takeCombine = breakCount >= combineCount;
    }

    if (takeCombine) {
      addToResult(result, lastCombine.token, getCombineFrequency(ir, lastCombine.token), lastCombine.suggestion, lastCombine.freq);
      lastCombine = null;
    } else {
      addToResult(result, lastBreak.token, ir.docFreq(new Term(field, lastBreak.token.toString())), lastBreak.suggestion, lastBreak.freq);
      lastBreak = null;
    }

    // Refill whichever slot was just consumed.
    if (lastBreak == null && breakIter.hasNext()) {
      lastBreak = breakIter.next();
      breakCount++;
    }
    if (lastCombine == null && combineIter.hasNext()) {
      lastCombine = combineIter.next();
      combineCount++;
    }
  }
  return result;
}
Aggregations