Use of org.apache.lucene.search.FuzzyTermsEnum in project lucene-solr by Apache.
Class DirectSpellChecker, method suggestSimilar.
/**
 * Collects candidate spelling corrections for a term by enumerating its fuzzy matches in the index.
 *
 * @param term The term to suggest spelling corrections for
 * @param numSug The maximum number of spelling corrections to retain
 * @param ir The index reader the candidate spelling corrections are fetched from
 * @param docfreq Document frequency threshold: a candidate is kept only when its document frequency is
 *        strictly greater than this value
 * @param editDistance The maximum edit distance candidates are allowed to have
 * @param accuracy The minimum score a candidate needs to have in order to be included
 * @param spare a chars scratch, used to decode candidates when a non-internal distance is configured
 * @return the retained candidates, held in a priority queue ordered by <code>ScoreTerm</code>'s natural
 *         order, or an empty list when the field has no terms
 * @throws IOException If I/O related errors occur
 */
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRefBuilder spare) throws IOException {
  final AttributeSource attributes = new AttributeSource();
  final MaxNonCompetitiveBoostAttribute competitiveFloor =
      attributes.addAttribute(MaxNonCompetitiveBoostAttribute.class);

  final Terms fieldTerms = MultiFields.getTerms(ir, term.field());
  if (fieldTerms == null) {
    return Collections.emptyList();
  }

  final int prefixLength = Math.max(minPrefix, editDistance - 1);
  final FuzzyTermsEnum fuzzyEnum =
      new FuzzyTermsEnum(fieldTerms, attributes, term, editDistance, prefixLength, true);
  final BoostAttribute candidateBoost = fuzzyEnum.attributes().addAttribute(BoostAttribute.class);

  final PriorityQueue<ScoreTerm> best = new PriorityQueue<>();
  final BytesRef original = new BytesRef(term.text());

  // Scratch entry: filled in for each accepted candidate, and recycled whenever the queue evicts one.
  ScoreTerm slot = new ScoreTerm();

  for (BytesRef candidate = fuzzyEnum.next(); candidate != null; candidate = fuzzyEnum.next()) {
    // For FuzzyQuery, the boost is the score.
    float score = candidateBoost.getBoost();

    // The queue is full and this hit cannot beat its head entry: not competitive.
    if (best.size() >= numSug && score <= best.peek().boost) {
      continue;
    }

    // Never suggest the query term itself.
    if (original.bytesEquals(candidate)) {
      continue;
    }

    // Drop candidates that do not exceed the document frequency threshold.
    final int candidateDocFreq = fuzzyEnum.docFreq();
    if (candidateDocFreq <= docfreq) {
      continue;
    }

    // With the internal distance the string form stays null here; otherwise
    // the candidate is decoded now and rescored with the configured distance.
    String candidateText = null;
    if (distance != INTERNAL_LEVENSHTEIN) {
      spare.copyUTF8Bytes(candidate);
      candidateText = spare.toString();
      score = distance.getDistance(term.text(), candidateText);
    }

    if (score < accuracy) {
      continue;
    }

    // Fill the scratch entry and hand it to the queue.
    slot.term = BytesRef.deepCopyOf(candidate);
    slot.termAsString = candidateText;
    slot.docfreq = candidateDocFreq;
    slot.score = score;
    slot.boost = score;
    best.offer(slot);

    // Over capacity: evict the head and reuse it as the next scratch entry.
    if (best.size() > numSug) {
      slot = best.poll();
    } else {
      slot = new ScoreTerm();
    }

    // Publish the current competitive floor through the shared attribute.
    float floor = Float.NEGATIVE_INFINITY;
    if (best.size() >= numSug) {
      floor = best.peek().boost;
    }
    competitiveFloor.setMaxNonCompetitiveBoost(floor);
  }

  return best;
}
Use of org.apache.lucene.search.FuzzyTermsEnum in project lucene-solr by Apache.
Class FuzzyLikeThisQuery, method addTerms.
/**
 * Analyzes the query string of one field and, for every distinct token, enumerates its fuzzy
 * variants in the index and adds the best-scoring ones to the supplied queue.
 *
 * <p>Per token, at most {@code MAX_VARIANTS_PER_TERM} variants are kept (ranked by the boost the
 * fuzzy enum reports). Each kept variant is then rescored as {@code score * score * idf}, where
 * the idf is computed from a single document frequency shared by all variants of that token:
 * the token's own document frequency, or the average over all enumerated variants when the
 * token itself has a document frequency of 0.
 *
 * @param reader the index reader used for term lookup and document counts
 * @param f the field name, query string and fuzzy settings; nothing is done when its query string is null
 * @param q the queue receiving the rescored variants
 * @throws IOException if analysis or index access fails
 */
private void addTerms(IndexReader reader, FieldVals f, ScoreTermQueue q) throws IOException {
  if (f.queryString == null) {
    return;
  }
  final Terms terms = MultiFields.getTerms(reader, f.fieldName);
  if (terms == null) {
    return;
  }
  try (TokenStream ts = analyzer.tokenStream(f.fieldName, f.queryString)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    int corpusNumDocs = reader.numDocs();
    HashSet<String> processedTerms = new HashSet<>();
    ts.reset();
    while (ts.incrementToken()) {
      String term = termAtt.toString();
      // add() returns false when this token text has already been expanded.
      if (!processedTerms.add(term)) {
        continue;
      }

      // maxNum variants considered for any one term
      ScoreTermQueue variantsQ = new ScoreTermQueue(MAX_VARIANTS_PER_TERM);
      float minScore = 0;
      Term startTerm = new Term(f.fieldName, term);
      AttributeSource atts = new AttributeSource();
      MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
      FuzzyTermsEnum fe = new FuzzyTermsEnum(terms, atts, startTerm, f.maxEdits, f.prefixLength, true);

      // store the df so all variants use same idf
      int df = reader.docFreq(startTerm);
      int numVariants = 0;
      // Accumulated in a long: the sum of many per-term document frequencies
      // can exceed Integer.MAX_VALUE on a large index.
      long totalVariantDocFreqs = 0;
      BytesRef possibleMatch;
      BoostAttribute boostAtt = fe.attributes().addAttribute(BoostAttribute.class);
      while ((possibleMatch = fe.next()) != null) {
        numVariants++;
        totalVariantDocFreqs += fe.docFreq();
        float score = boostAtt.getBoost();
        if (variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore) {
          ScoreTerm st = new ScoreTerm(new Term(startTerm.field(), BytesRef.deepCopyOf(possibleMatch)), score, startTerm);
          variantsQ.insertWithOverflow(st);
          // maintain minScore
          minScore = variantsQ.top().score;
        }
        // Once the queue is full, publish its lowest score as the non-competitive bound.
        maxBoostAtt.setMaxNonCompetitiveBoost(variantsQ.size() >= MAX_VARIANTS_PER_TERM ? minScore : Float.NEGATIVE_INFINITY);
      }

      if (numVariants > 0) {
        // An average of int values always fits back into an int.
        int avgDf = (int) (totalVariantDocFreqs / numVariants);
        if (df == 0) {
          // no direct match we can use as df for all variants: use avg df of all variants
          df = avgDf;
        }
        // take the top variants (scored by edit distance) and reset the score
        // to include an IDF factor then add to the global queue for ranking
        // overall top query terms
        int size = variantsQ.size();
        for (int i = 0; i < size; i++) {
          ScoreTerm st = variantsQ.pop();
          st.score = (st.score * st.score) * sim.idf(df, corpusNumDocs);
          q.insertWithOverflow(st);
        }
      }
    }
    ts.end();
  }
}
Aggregations