Search in sources :

Example 1 with CandidateSet

use of org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet in project elasticsearch by elastic.

the class CandidateScorer method findCandidates.

/**
 * Recursively enumerates candidate paths starting at position {@code ord} and hands every
 * completed path to {@code updateTop}.
 *
 * <p>At each position the original term is tried first; the alternatives in the position's
 * candidate set are tried only while {@code numMissspellingsLeft} is positive, and picking an
 * alternative at a non-final position decrements that budget for the remaining positions.
 *
 * @param candidates           candidate sets, one per position
 * @param path                 scratch array holding the path under construction; slot {@code ord} is overwritten
 * @param ord                  position currently being filled
 * @param numMissspellingsLeft how many more positions may deviate from their original term
 * @param corrections          queue forwarded to {@code updateTop}
 * @param cutoffScore          cutoff forwarded to {@code updateTop}
 * @param pathScore            accumulated score of positions before {@code ord}
 * @throws IOException propagated from the scoring / update calls
 */
public void findCandidates(CandidateSet[] candidates, Candidate[] path, int ord, int numMissspellingsLeft, PriorityQueue<Correction> corrections, double cutoffScore, final double pathScore) throws IOException {
    final CandidateSet slot = candidates[ord];
    final boolean lastPosition = ord == candidates.length - 1;
    final boolean mayDeviate = numMissspellingsLeft > 0;

    // Always explore the path that keeps the original term at this position first.
    path[ord] = slot.originalTerm;
    final double keepOriginalScore = pathScore + scorer.score(path, candidates, ord, gramSize);
    if (lastPosition) {
        updateTop(candidates, path, corrections, cutoffScore, keepOriginalScore);
    } else {
        // Keeping the original term does not consume any of the misspelling budget.
        final int budget = mayDeviate ? numMissspellingsLeft : 0;
        findCandidates(candidates, path, ord + 1, budget, corrections, cutoffScore, keepOriginalScore);
    }

    if (!mayDeviate) {
        return;
    }

    // Budget available: try each alternative for this position.
    for (Candidate alternative : slot.candidates) {
        path[ord] = alternative;
        final double alternativeScore = pathScore + scorer.score(path, candidates, ord, gramSize);
        if (lastPosition) {
            updateTop(candidates, path, corrections, cutoffScore, alternativeScore);
        } else {
            findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore, alternativeScore);
        }
    }
}
Also used : CandidateSet(org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet)

Example 2 with CandidateSet

use of org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet in project elasticsearch by elastic.

the class NoisyChannelSpellChecker method getCorrections.

/**
 * Builds one {@link CandidateSet} per token position of {@code stream}, lets {@code generator}
 * draw candidates for each set, and returns the best corrections found by a
 * {@link CandidateScorer}.
 *
 * <p>Returns {@code Result.EMPTY} when the stream yields no candidate sets or when their number
 * reaches {@code tokenLimit}.
 *
 * <p>The cutoff score starts at {@code Double.MIN_VALUE}; when {@code confidence} is positive it
 * becomes the score of the unmodified input phrase multiplied by {@code confidence}.
 *
 * @param stream         token stream to analyze
 * @param generator      supplies term frequencies and candidates
 * @param maxErrors      forwarded to {@code CandidateScorer.findBestCandiates}
 * @param numCorrections forwarded to the {@link CandidateScorer} constructor
 * @param wordScorer     scorer used by the {@link CandidateScorer}
 * @param confidence     multiplier applied to the input phrase score to obtain the cutoff; ignored unless positive
 * @param gramSize       forwarded to the {@link CandidateScorer} constructor
 * @return the corrections together with the cutoff score that was used
 * @throws IOException           propagated from analysis, candidate generation or scoring
 * @throws IllegalStateException if unigrams are required and tokens were seen, but none of them was a unigram
 */
public Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors, int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException {
    final List<CandidateSet> candidateSetsList = new ArrayList<>();
    DirectCandidateGenerator.analyze(stream, new DirectCandidateGenerator.TokenConsumer() {

        // Set for the position currently being collected; flushed when the next position starts.
        CandidateSet currentSet = null;

        private TypeAttribute typeAttribute;

        private final BytesRefBuilder termsRef = new BytesRefBuilder();

        private boolean anyUnigram = false;

        private boolean anyTokens = false;

        @Override
        public void reset(TokenStream stream) {
            super.reset(stream);
            typeAttribute = stream.addAttribute(TypeAttribute.class);
        }

        @Override
        public void nextToken() throws IOException {
            anyTokens = true;
            BytesRef term = fillBytesRef(termsRef);
            // Token types are Strings: compare by value, not by reference, so that an equal but
            // non-interned type string is still recognised.
            if (requireUnigram && ShingleFilter.DEFAULT_TOKEN_TYPE.equals(typeAttribute.type())) {
                return;
            }
            anyUnigram = true;
            if (posIncAttr.getPositionIncrement() == 0 && SynonymFilter.TYPE_SYNONYM.equals(typeAttribute.type())) {
                // A synonym stacked on the current position: add it to the current set as an
                // extra candidate, but only if the generator reports a positive frequency.
                assert currentSet != null;
                final long freq = generator.frequency(term);
                if (freq > 0) {
                    currentSet.addOneCandidate(generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood));
                }
            } else {
                // Any other token opens a new set; flush the previous one first.
                if (currentSet != null) {
                    candidateSetsList.add(currentSet);
                }
                currentSet = new CandidateSet(Candidate.EMPTY, generator.createCandidate(BytesRef.deepCopyOf(term), true));
            }
        }

        @Override
        public void end() {
            // Flush the set that was still being collected when the stream ended.
            if (currentSet != null) {
                candidateSetsList.add(currentSet);
            }
            if (requireUnigram && !anyUnigram && anyTokens) {
                throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");
            }
        }
    });
    if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
        return Result.EMPTY;
    }
    for (CandidateSet candidateSet : candidateSetsList) {
        generator.drawCandidates(candidateSet);
    }
    double cutoffScore = Double.MIN_VALUE;
    CandidateScorer scorer = new CandidateScorer(wordScorer, numCorrections, gramSize);
    CandidateSet[] candidateSets = candidateSetsList.toArray(new CandidateSet[candidateSetsList.size()]);
    if (confidence > 0.0) {
        // Score the phrase exactly as entered (original term at every position) to derive the cutoff.
        Candidate[] candidates = new Candidate[candidateSets.length];
        for (int i = 0; i < candidates.length; i++) {
            candidates[i] = candidateSets[i].originalTerm;
        }
        double inputPhraseScore = scorer.score(candidates, candidateSets);
        cutoffScore = inputPhraseScore * confidence;
    }
    Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
    return new Result(bestCandidates, cutoffScore);
}
Also used : Candidate(org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) TokenStream(org.apache.lucene.analysis.TokenStream) ArrayList(java.util.ArrayList) IOException(java.io.IOException) CandidateSet(org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

CandidateSet (org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet)2 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 TokenStream (org.apache.lucene.analysis.TokenStream)1 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)1 BytesRef (org.apache.lucene.util.BytesRef)1 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)1 Candidate (org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate)1