Search in sources :

Example 1 with StringDistance

use of org.apache.lucene.search.spell.StringDistance in project lucene-solr by apache.

the class DirectSolrSpellChecker method init.

/**
 * Initialises this spell checker from the supplied configuration.
 * <p>
 * After delegating to {@code super.init}, the method resolves the suggestion comparator and the
 * string distance implementation, reads the optional numeric tuning parameters (falling back to the
 * {@code DEFAULT_*} constants when a parameter is absent), and finally pushes every resolved value
 * into {@code checker}. All values are resolved before the first setter is called.
 *
 * @param config the configuration entries for this spell checker
 * @param core   the core whose resource loader is used to instantiate custom classes
 * @return the name returned by {@code super.init(config, core)}
 */
@Override
public String init(NamedList config, SolrCore core) {
    SolrParams params = SolrParams.toSolrParams(config);
    LOG.info("init: " + config);
    String name = super.init(config, core);

    // Comparator: absent or SCORE_COMP -> default; FREQ_COMP -> frequency based; anything else is
    // treated as a fully qualified class name and loaded through the core's resource loader.
    String comparatorName = (String) config.get(COMPARATOR_CLASS);
    Comparator<SuggestWord> comparator;
    if (comparatorName == null || comparatorName.equalsIgnoreCase(SCORE_COMP)) {
        comparator = SuggestWordQueue.DEFAULT_COMPARATOR;
    } else if (comparatorName.equalsIgnoreCase(FREQ_COMP)) {
        comparator = new SuggestWordFrequencyComparator();
    } else {
        comparator = (Comparator<SuggestWord>) core.getResourceLoader().newInstance(comparatorName, Comparator.class);
    }

    // String distance: absent or INTERNAL_DISTANCE keeps the built-in Levenshtein implementation.
    String distanceName = (String) config.get(STRING_DISTANCE);
    StringDistance distance;
    if (distanceName == null || distanceName.equalsIgnoreCase(INTERNAL_DISTANCE)) {
        distance = DirectSpellChecker.INTERNAL_LEVENSHTEIN;
    } else {
        distance = core.getResourceLoader().newInstance(distanceName, StringDistance.class);
    }

    // Optional numeric parameters; a missing parameter yields null and selects the default.
    Float accuracyParam = params.getFloat(ACCURACY);
    float accuracy = accuracyParam == null ? DEFAULT_ACCURACY : accuracyParam;

    Integer maxEditsParam = params.getInt(MAXEDITS);
    int maxEdits = maxEditsParam == null ? DEFAULT_MAXEDITS : maxEditsParam;

    Integer minPrefixParam = params.getInt(MINPREFIX);
    int minPrefix = minPrefixParam == null ? DEFAULT_MINPREFIX : minPrefixParam;

    Integer maxInspectionsParam = params.getInt(MAXINSPECTIONS);
    int maxInspections = maxInspectionsParam == null ? DEFAULT_MAXINSPECTIONS : maxInspectionsParam;

    Float thresholdParam = params.getFloat(THRESHOLD_TOKEN_FREQUENCY);
    float thresholdFrequency = thresholdParam == null ? DEFAULT_THRESHOLD_TOKEN_FREQUENCY : thresholdParam;

    Integer minQueryLengthParam = params.getInt(MINQUERYLENGTH);
    int minQueryLength = minQueryLengthParam == null ? DEFAULT_MINQUERYLENGTH : minQueryLengthParam;

    Float maxQueryFrequencyParam = params.getFloat(MAXQUERYFREQUENCY);
    float maxQueryFrequency = maxQueryFrequencyParam == null ? DEFAULT_MAXQUERYFREQUENCY : maxQueryFrequencyParam;

    // Apply the resolved settings in one go.
    checker.setComparator(comparator);
    checker.setDistance(distance);
    checker.setMaxEdits(maxEdits);
    checker.setMinPrefix(minPrefix);
    checker.setAccuracy(accuracy);
    checker.setThresholdFrequency(thresholdFrequency);
    checker.setMaxInspections(maxInspections);
    checker.setMinQueryLength(minQueryLength);
    checker.setMaxQueryFrequency(maxQueryFrequency);
    checker.setLowerCaseTerms(false);
    return name;
}
Also used : SuggestWordFrequencyComparator(org.apache.lucene.search.spell.SuggestWordFrequencyComparator) StringDistance(org.apache.lucene.search.spell.StringDistance) SuggestWord(org.apache.lucene.search.spell.SuggestWord) SolrParams(org.apache.solr.common.params.SolrParams)

Example 2 with StringDistance

use of org.apache.lucene.search.spell.StringDistance in project lucene-solr by apache.

the class IndexBasedSpellCheckerTest method testAlternateDistance.

/**
 * Checks that a spell checker configured with {@link JaroWinklerDistance} as its string distance
 * reports the default dictionary name from {@code init} and, once built, exposes a
 * {@link JaroWinklerDistance} instance through its underlying {@link SpellChecker}.
 */
@Test
public void testAlternateDistance() throws Exception {
    TestSpellChecker spellChecker = new TestSpellChecker();

    // Assemble the spell checker configuration.
    NamedList config = new NamedList();
    config.add("classname", IndexBasedSpellChecker.class.getName());
    String indexPath = createTempDir().toFile().getAbsolutePath();
    config.add(AbstractLuceneSpellChecker.INDEX_DIR, indexPath);
    config.add(AbstractLuceneSpellChecker.FIELD, "title");
    // The configuration list is registered under its own argument name, exactly as before.
    config.add(AbstractLuceneSpellChecker.SPELLCHECKER_ARG_NAME, config);
    config.add(AbstractLuceneSpellChecker.STRING_DISTANCE, JaroWinklerDistance.class.getName());

    SolrCore core = h.getCore();
    String dictionaryName = spellChecker.init(config, core);
    assertTrue(dictionaryName + " is not equal to " + SolrSpellChecker.DEFAULT_DICTIONARY_NAME,
            dictionaryName.equals(SolrSpellChecker.DEFAULT_DICTIONARY_NAME));

    RefCounted<SolrIndexSearcher> searcherRef = core.getSearcher();
    SolrIndexSearcher searcher = searcherRef.get();
    try {
        spellChecker.build(core, searcher);

        SpellChecker luceneChecker = spellChecker.getSpellChecker();
        assertTrue("sc is null and it shouldn't be", luceneChecker != null);

        StringDistance distance = luceneChecker.getStringDistance();
        assertTrue("sd is null and it shouldn't be", distance != null);
        assertTrue("sd is not an instance of " + JaroWinklerDistance.class.getName(),
                distance instanceof JaroWinklerDistance);
    } finally {
        // Always release the searcher reference obtained above.
        searcherRef.decref();
    }
}
Also used : JaroWinklerDistance(org.apache.lucene.search.spell.JaroWinklerDistance) StringDistance(org.apache.lucene.search.spell.StringDistance) NamedList(org.apache.solr.common.util.NamedList) SolrCore(org.apache.solr.core.SolrCore) SpellChecker(org.apache.lucene.search.spell.SpellChecker) SolrIndexSearcher(org.apache.solr.search.SolrIndexSearcher) File(java.io.File) Test(org.junit.Test)

Example 3 with StringDistance

use of org.apache.lucene.search.spell.StringDistance in project molgenis by molgenis.

the class SemanticSearchServiceImpl method bestMatchingSynonym.

/**
 * Computes the best matching synonym which is closest to a set of search terms.
 * <p>
 * Will stem the {@link OntologyTerm}'s synonyms and the search terms, and then compute the
 * {@link StringDistance} between each synonym and the search terms. 0 means disjunct, 1 means
 * identical. Each synonym is wrapped with its score in a {@link Hit}, and the greatest hit according
 * to the natural ordering of {@link Hit} is returned.
 *
 * @param ontologyTerm the {@link OntologyTerm} whose synonyms are scored
 * @param searchTerms  the search terms
 * @return the {@link Hit} holding the best matching synonym together with its score
 * @throws java.util.NoSuchElementException if the ontology term has no synonyms
 */
public Hit<String> bestMatchingSynonym(OntologyTerm ontologyTerm, Set<String> searchTerms) {
    Stemmer stemmer = new Stemmer();
    return ontologyTerm.getSynonyms().stream()
            .map(synonym -> Hit.create(synonym, distanceFrom(synonym, searchTerms, stemmer)))
            .max(Comparator.naturalOrder())
            // Same exception type an unchecked Optional.get() would raise, but with a message that
            // tells the caller what was actually missing.
            .orElseThrow(() -> new java.util.NoSuchElementException(
                    "Cannot determine best matching synonym: ontology term has no synonyms"));
}
Also used : NGramDistanceAlgorithm(org.molgenis.semanticsearch.string.NGramDistanceAlgorithm) java.util(java.util) StringDistance(org.apache.lucene.search.spell.StringDistance) Operator(org.molgenis.data.QueryRule.Operator) LoggerFactory(org.slf4j.LoggerFactory) SemanticSearchService(org.molgenis.semanticsearch.service.SemanticSearchService) QueryImpl(org.molgenis.data.support.QueryImpl) StringUtils(org.apache.commons.lang3.StringUtils) Attribute(org.molgenis.data.meta.model.Attribute) MetaDataService(org.molgenis.data.meta.MetaDataService) Lists(com.google.common.collect.Lists) FluentIterable(com.google.common.collect.FluentIterable) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) OntologyTerm(org.molgenis.ontology.core.model.OntologyTerm) Objects.requireNonNull(java.util.Objects.requireNonNull) AttributeMetadata(org.molgenis.data.meta.model.AttributeMetadata) ExplainedAttribute(org.molgenis.semanticsearch.explain.bean.ExplainedAttribute) OntologyService(org.molgenis.ontology.core.service.OntologyService) Splitter(com.google.common.base.Splitter) ATTRIBUTE_META_DATA(org.molgenis.data.meta.model.AttributeMetadata.ATTRIBUTE_META_DATA) Hit(org.molgenis.semanticsearch.semantic.Hit) Stemmer(org.molgenis.semanticsearch.string.Stemmer) Explanation(org.apache.lucene.search.Explanation) Logger(org.slf4j.Logger) ElasticSearchExplainService(org.molgenis.semanticsearch.explain.service.ElasticSearchExplainService) ExplainedQueryString(org.molgenis.semanticsearch.explain.bean.ExplainedQueryString) EntityType(org.molgenis.data.meta.model.EntityType) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) Ontology(org.molgenis.ontology.core.model.Ontology) Stream(java.util.stream.Stream) Ordering(com.google.common.collect.Ordering) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) DataService(org.molgenis.data.DataService) Query(org.molgenis.data.Query) QueryRule(org.molgenis.data.QueryRule) Joiner(com.google.common.base.Joiner) 
Entity(org.molgenis.data.Entity) Hit(org.molgenis.semanticsearch.semantic.Hit) Stemmer(org.molgenis.semanticsearch.string.Stemmer)

Example 4 with StringDistance

use of org.apache.lucene.search.spell.StringDistance in project lucene-solr by apache.

the class SolrSpellChecker method mergeSuggestions.

/**
 * Integrate spelling suggestions from the various shards in a distributed environment.
 * <p>
 * For every original term in {@code mergeData.origVsSuggested}, the suggested words are re-scored
 * against the original with this checker's string distance (falling back to
 * {@link LevensteinDistance} when none is available), collected in a bounded
 * {@link SuggestWordQueue}, and the best {@code count} of them are added to the result.
 *
 * @param mergeData       the suggestion data gathered from the shard responses
 * @param numSug          capacity of the per-term suggestion queue
 * @param count           maximum number of suggestions to return per term
 * @param extendedResults when {@code true}, frequencies are added to the result alongside the
 *                        suggested words
 * @return the merged spelling result
 */
public SpellingResult mergeSuggestions(SpellCheckMergeData mergeData, int numSug, int count, boolean extendedResults) {
    float minAccuracy = 0.5f;
    try {
        minAccuracy = getAccuracy();
    } catch (UnsupportedOperationException ignored) {
        // just use .5 as a default
    }
    StringDistance sd;
    try {
        sd = getStringDistance();
    } catch (UnsupportedOperationException ignored) {
        sd = null;
    }
    if (sd == null) {
        sd = new LevensteinDistance();
    }
    SpellingResult result = new SpellingResult();
    for (Map.Entry<String, HashSet<String>> entry : mergeData.origVsSuggested.entrySet()) {
        String original = entry.getKey();
        //Only use this suggestion if all shards reported it as misspelled,
        //unless it was not a term original to the user's query
        //(WordBreakSolrSpellChecker can add new terms to the response, and we want to keep these)
        Integer numShards = mergeData.origVsShards.get(original);
        if (numShards < mergeData.totalNumberShardResponses && mergeData.isOriginalToQuery(original)) {
            continue;
        }
        HashSet<String> suggested = entry.getValue();
        SuggestWordQueue sugQueue = new SuggestWordQueue(numSug);
        // The score threshold starts at the configured accuracy for EACH original term; a threshold
        // raised while filling one term's queue must not filter the suggestions of the next term.
        float min = minAccuracy;
        for (String suggestion : suggested) {
            SuggestWord sug = mergeData.suggestedVsWord.get(suggestion);
            sug.score = sd.getDistance(original, sug.string);
            if (sug.score < min) {
                continue;
            }
            sugQueue.insertWithOverflow(sug);
            if (sugQueue.size() == numSug) {
                // if queue full, maintain the minScore score
                min = sugQueue.top().score;
            }
        }
        // create token
        SpellCheckResponse.Suggestion suggestion = mergeData.origVsSuggestion.get(original);
        Token token = new Token(original, suggestion.getStartOffset(), suggestion.getEndOffset());
        // get top 'count' suggestions out of 'sugQueue.size()' candidates
        int queued = sugQueue.size();
        SuggestWord[] suggestions = new SuggestWord[Math.min(count, queued)];
        // skip the first 'queued - count' elements; the surplus is computed once because
        // sugQueue.size() shrinks with every pop()
        int surplus = queued - count;
        for (int k = 0; k < surplus; k++) {
            sugQueue.pop();
        }
        // now collect the top 'count' responses, filling from the back so the last popped
        // element ends up first
        for (int k = suggestions.length - 1; k >= 0; k--) {
            suggestions[k] = sugQueue.pop();
        }
        if (extendedResults) {
            Integer o = mergeData.origVsFreq.get(original);
            if (o != null) {
                result.addFrequency(token, o);
            }
            for (SuggestWord word : suggestions) {
                result.add(token, word.string, word.freq);
            }
        } else {
            List<String> words = new ArrayList<>(suggestions.length);
            for (SuggestWord word : suggestions) {
                words.add(word.string);
            }
            result.add(token, words);
        }
    }
    return result;
}
Also used : ArrayList(java.util.ArrayList) Token(org.apache.lucene.analysis.Token) LevensteinDistance(org.apache.lucene.search.spell.LevensteinDistance) SpellCheckResponse(org.apache.solr.client.solrj.response.SpellCheckResponse) StringDistance(org.apache.lucene.search.spell.StringDistance) SuggestWordQueue(org.apache.lucene.search.spell.SuggestWordQueue) SuggestWord(org.apache.lucene.search.spell.SuggestWord) Map(java.util.Map) HashSet(java.util.HashSet)

Aggregations

StringDistance (org.apache.lucene.search.spell.StringDistance)4 SuggestWord (org.apache.lucene.search.spell.SuggestWord)2 Joiner (com.google.common.base.Joiner)1 Splitter (com.google.common.base.Splitter)1 FluentIterable (com.google.common.collect.FluentIterable)1 Lists (com.google.common.collect.Lists)1 Ordering (com.google.common.collect.Ordering)1 Sets (com.google.common.collect.Sets)1 File (java.io.File)1 java.util (java.util)1 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1 Map (java.util.Map)1 Objects.requireNonNull (java.util.Objects.requireNonNull)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 Collectors (java.util.stream.Collectors)1 Stream (java.util.stream.Stream)1 StringUtils (org.apache.commons.lang3.StringUtils)1 Token (org.apache.lucene.analysis.Token)1 QueryParser (org.apache.lucene.queryparser.classic.QueryParser)1