use of org.apache.lucene.search.spell.StringDistance in project lucene-solr by apache.
the class DirectSolrSpellChecker method init.
/**
 * Reads the spell checker settings from {@code config} and applies them to {@code checker}.
 *
 * <p>The comparator and string distance are resolved first (instantiating a class through the
 * core's resource loader when a custom class name is given), then every numeric option is read,
 * and only after all reads have succeeded are the values pushed onto {@code checker}. Options
 * that are absent from the configuration fall back to the matching {@code DEFAULT_*} constant.
 *
 * @param config the configuration for this spell checker
 * @param core the core whose resource loader is used to instantiate custom classes
 * @return the name returned by {@code super.init(config, core)}
 */
@Override
public String init(NamedList config, SolrCore core) {
  SolrParams params = SolrParams.toSolrParams(config);
  LOG.info("init: " + config);
  String name = super.init(config, core);

  // Suggestion ordering: score based unless configured otherwise.
  String comparatorName = (String) config.get(COMPARATOR_CLASS);
  Comparator<SuggestWord> comparator;
  if (comparatorName == null || comparatorName.equalsIgnoreCase(SCORE_COMP)) {
    comparator = SuggestWordQueue.DEFAULT_COMPARATOR;
  } else if (comparatorName.equalsIgnoreCase(FREQ_COMP)) {
    comparator = new SuggestWordFrequencyComparator();
  } else {
    // anything else must be a fully qualified class name
    comparator = (Comparator<SuggestWord>) core.getResourceLoader().newInstance(comparatorName, Comparator.class);
  }

  // String distance: the internal Levenshtein unless a custom class is named.
  String distanceName = (String) config.get(STRING_DISTANCE);
  StringDistance distance;
  if (distanceName == null || distanceName.equalsIgnoreCase(INTERNAL_DISTANCE)) {
    distance = DirectSpellChecker.INTERNAL_LEVENSHTEIN;
  } else {
    distance = core.getResourceLoader().newInstance(distanceName, StringDistance.class);
  }

  // Read every optional setting up front; null means "not configured".
  Float configuredAccuracy = params.getFloat(ACCURACY);
  Integer configuredMaxEdits = params.getInt(MAXEDITS);
  Integer configuredMinPrefix = params.getInt(MINPREFIX);
  Integer configuredMaxInspections = params.getInt(MAXINSPECTIONS);
  Float configuredThreshold = params.getFloat(THRESHOLD_TOKEN_FREQUENCY);
  Integer configuredMinQueryLength = params.getInt(MINQUERYLENGTH);
  Float configuredMaxQueryFrequency = params.getFloat(MAXQUERYFREQUENCY);

  checker.setComparator(comparator);
  checker.setDistance(distance);
  checker.setMaxEdits(configuredMaxEdits != null ? configuredMaxEdits : DEFAULT_MAXEDITS);
  checker.setMinPrefix(configuredMinPrefix != null ? configuredMinPrefix : DEFAULT_MINPREFIX);
  checker.setAccuracy(configuredAccuracy != null ? configuredAccuracy : DEFAULT_ACCURACY);
  checker.setThresholdFrequency(
      configuredThreshold != null ? configuredThreshold : DEFAULT_THRESHOLD_TOKEN_FREQUENCY);
  checker.setMaxInspections(
      configuredMaxInspections != null ? configuredMaxInspections : DEFAULT_MAXINSPECTIONS);
  checker.setMinQueryLength(
      configuredMinQueryLength != null ? configuredMinQueryLength : DEFAULT_MINQUERYLENGTH);
  checker.setMaxQueryFrequency(
      configuredMaxQueryFrequency != null ? configuredMaxQueryFrequency : DEFAULT_MAXQUERYFREQUENCY);
  checker.setLowerCaseTerms(false);
  return name;
}
use of org.apache.lucene.search.spell.StringDistance in project lucene-solr by apache.
the class IndexBasedSpellCheckerTest method testAlternateDistance.
/**
 * Configures the spell checker with {@link JaroWinklerDistance} as its string distance and
 * verifies that, after a build, the underlying {@code SpellChecker} reports a distance of
 * that type.
 */
@Test
public void testAlternateDistance() throws Exception {
  TestSpellChecker checker = new TestSpellChecker();

  NamedList args = new NamedList();
  args.add("classname", IndexBasedSpellChecker.class.getName());
  String indexPath = createTempDir().toFile().getAbsolutePath();
  args.add(AbstractLuceneSpellChecker.INDEX_DIR, indexPath);
  args.add(AbstractLuceneSpellChecker.FIELD, "title");
  // the configuration list is registered inside itself under the spell checker argument name
  args.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, args);
  args.add(AbstractLuceneSpellChecker.STRING_DISTANCE, JaroWinklerDistance.class.getName());

  SolrCore core = h.getCore();
  String dictName = checker.init(args, core);
  String expectedName = SolrSpellChecker.DEFAULT_DICTIONARY_NAME;
  assertTrue(dictName + " is not equal to " + expectedName, dictName.equals(expectedName));

  RefCounted<SolrIndexSearcher> searcherRef = core.getSearcher();
  SolrIndexSearcher searcher = searcherRef.get();
  try {
    checker.build(core, searcher);

    SpellChecker lucene = checker.getSpellChecker();
    assertTrue("sc is null and it shouldn't be", lucene != null);

    StringDistance distance = lucene.getStringDistance();
    assertTrue("sd is null and it shouldn't be", distance != null);
    assertTrue("sd is not an instance of " + JaroWinklerDistance.class.getName(),
        distance instanceof JaroWinklerDistance);
  } finally {
    // always release the searcher reference taken above
    searcherRef.decref();
  }
}
use of org.apache.lucene.search.spell.StringDistance in project molgenis by molgenis.
the class SemanticSearchServiceImpl method bestMatchingSynonym.
/**
 * Finds the synonym of an {@link OntologyTerm} that best matches a set of search terms.
 *
 * <p>Each synonym is scored with {@code distanceFrom(synonym, searchTerms, stemmer)}, using one
 * {@link Stemmer} instance shared across all synonyms, and wrapped in a {@link Hit}. The hit that
 * is greatest according to the natural ordering of {@link Hit} is returned. (The original
 * documentation describes the score as a {@link StringDistance} where 0 means disjunct and
 * 1 means identical.)
 *
 * @param ontologyTerm the {@link OntologyTerm} whose synonyms are scored
 * @param searchTerms the search terms
 * @return the highest-ranked {@link Hit}, holding the synonym and its score
 * @throws java.util.NoSuchElementException if the ontology term has no synonyms
 */
public Hit<String> bestMatchingSynonym(OntologyTerm ontologyTerm, Set<String> searchTerms) {
  final Stemmer sharedStemmer = new Stemmer();
  return ontologyTerm.getSynonyms().stream()
      .map(candidate -> Hit.create(candidate, distanceFrom(candidate, searchTerms, sharedStemmer)))
      .max(Comparator.naturalOrder())
      .get();
}
use of org.apache.lucene.search.spell.StringDistance in project lucene-solr by apache.
the class SolrSpellChecker method mergeSuggestions.
/**
 * Integrate spelling suggestions from the various shards in a distributed environment.
 *
 * <p>For every original term in {@code mergeData.origVsSuggested}, each suggested word is
 * re-scored against the original with this checker's {@link StringDistance} (falling back to
 * {@link LevensteinDistance} when none is available), filtered by the minimum accuracy, and
 * ranked in a bounded queue. The best {@code count} suggestions per term are added to the result
 * in descending rank order.
 *
 * @param mergeData the suggestions collected from the shards
 * @param numSug capacity of the per-term ranking queue
 * @param count maximum number of suggestions to return per term
 * @param extendedResults when {@code true}, the original term's frequency (if known) and each
 *     suggestion's frequency are recorded; otherwise only the suggested strings are
 * @return the merged suggestions
 */
public SpellingResult mergeSuggestions(SpellCheckMergeData mergeData, int numSug, int count, boolean extendedResults) {
  float minAccuracy = 0.5f;
  try {
    minAccuracy = getAccuracy();
  } catch (UnsupportedOperationException ignored) {
    // just use .5 as a default
  }

  // Ask for the configured distance once; a missing or unsupported one means Levenshtein.
  StringDistance sd = null;
  try {
    sd = getStringDistance();
  } catch (UnsupportedOperationException ignored) {
    // handled by the null check below
  }
  if (sd == null) {
    sd = new LevensteinDistance();
  }

  SpellingResult result = new SpellingResult();
  for (Map.Entry<String, HashSet<String>> entry : mergeData.origVsSuggested.entrySet()) {
    String original = entry.getKey();

    // Only use this suggestion if all shards reported it as misspelled,
    // unless it was not a term original to the user's query
    // (WordBreakSolrSpellChecker can add new terms to the response, and we want to keep these)
    Integer numShards = mergeData.origVsShards.get(original);
    if (numShards < mergeData.totalNumberShardResponses && mergeData.isOriginalToQuery(original)) {
      continue;
    }

    // The cut-off is tracked per original term: it starts at the configured accuracy and is
    // only raised by this term's own queue, so one term cannot suppress another's suggestions.
    float threshold = minAccuracy;
    SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
    for (String candidate : entry.getValue()) {
      SuggestWord sug = mergeData.suggestedVsWord.get(candidate);
      sug.score = sd.getDistance(original, sug.string);
      if (sug.score < threshold) {
        continue;
      }
      sugQueue.insertWithOverflow(sug);
      if (sugQueue.size() == numSug) {
        // if queue full, maintain the minScore score
        threshold = sugQueue.top().score;
      }
    }

    // create token
    SpellCheckResponse.Suggestion shardSuggestion = mergeData.origVsSuggestion.get(original);
    Token token = new Token(original, shardSuggestion.getStartOffset(), shardSuggestion.getEndOffset());

    // get top 'count' suggestions out of 'sugQueue.size()' candidates
    SuggestWord[] suggestions = new SuggestWord[Math.min(count, sugQueue.size())];
    // Discard the lowest-ranked surplus. The surplus is computed once up front: the queue
    // shrinks with every pop, so re-reading size() in the loop condition would stop early
    // and leave low-ranked entries to be returned in place of the best ones.
    int surplus = sugQueue.size() - count;
    for (int k = 0; k < surplus; k++) {
      sugQueue.pop();
    }
    // now collect the top 'count' responses; pop() yields the lowest first, so fill backwards
    for (int k = suggestions.length - 1; k >= 0; k--) {
      suggestions[k] = sugQueue.pop();
    }

    if (extendedResults) {
      Integer originalFreq = mergeData.origVsFreq.get(original);
      if (originalFreq != null) {
        result.addFrequency(token, originalFreq);
      }
      for (SuggestWord word : suggestions) {
        result.add(token, word.string, word.freq);
      }
    } else {
      List<String> words = new ArrayList<>(suggestions.length);
      for (SuggestWord word : suggestions) {
        words.add(word.string);
      }
      result.add(token, words);
    }
  }
  return result;
}
Aggregations