Search in sources:

Example 1 with Candidate

Use of org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate in the elasticsearch project by elastic.

From the class Correction, the method join:

/**
 * Joins the terms of all candidates into {@code result}, separated by {@code separator}.
 * When {@code preTag} is non-null, runs of corrected (non-user-input) candidates are
 * wrapped in {@code preTag}/{@code postTag} so consecutive corrections share one tag pair.
 *
 * @param separator byte sequence placed between adjacent terms
 * @param result    builder that receives the joined bytes (grown up front as a capacity hint)
 * @param preTag    opening highlight tag, or {@code null} to disable highlighting
 * @param postTag   closing highlight tag (only read when {@code preTag} is non-null)
 * @return the joined byte sequence produced by {@link WordScorer#join}
 */
public BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef preTag, BytesRef postTag) {
    final int numCandidates = this.candidates.length;
    final BytesRef[] pieces = new BytesRef[numCandidates];
    // Rough capacity estimate: separator bytes plus each (possibly tagged) term,
    // accumulated below. grow() tolerates over/under-estimates.
    int capacity = separator.length * numCandidates - 1;
    for (int idx = 0; idx < numCandidates; idx++) {
        final Candidate candidate = candidates[idx];
        if (preTag == null || candidate.userInput) {
            // No highlighting requested, or the token is the user's original input — emit as-is.
            pieces[idx] = candidate.term;
        } else {
            // just allocate once
            final BytesRefBuilder tagged = new BytesRefBuilder();
            tagged.grow(preTag.length + postTag.length + candidate.term.length);
            // Open a tag only at the start of a run of corrections; close it only at the end,
            // so adjacent corrected terms share a single preTag/postTag pair.
            final boolean opensRun = idx == 0 || candidates[idx - 1].userInput;
            final boolean closesRun = idx + 1 == numCandidates || candidates[idx + 1].userInput;
            if (opensRun) {
                tagged.append(preTag);
            }
            tagged.append(candidate.term);
            if (closesRun) {
                tagged.append(postTag);
            }
            pieces[idx] = tagged.get();
        }
        capacity += pieces[idx].length;
    }
    result.grow(capacity);
    return WordScorer.join(separator, result, pieces);
}
Also used : Candidate(org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) BytesRef(org.apache.lucene.util.BytesRef)

Example 2 with Candidate

Use of org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate in the elasticsearch project by elastic.

From the class NoisyChannelSpellChecker, the method getCorrections:

/**
 * Analyzes the token stream into per-position candidate sets, scores spelling
 * corrections, and returns up to {@code numCorrections} results.
 *
 * @param stream         analyzed tokens of the user input (synonyms/shingles respected)
 * @param generator      supplies frequencies and candidate terms for each token
 * @param maxErrors      maximum fraction/number of corrected terms per suggestion
 * @param numCorrections maximum number of corrections to return
 * @param wordScorer     scorer used to rank candidate phrases
 * @param confidence     when &gt; 0, corrections must score above
 *                       {@code inputPhraseScore * confidence}
 * @param gramSize       n-gram size used by the scorer
 * @return the best corrections with the cutoff score, or {@code Result.EMPTY}
 *         when there is nothing to correct or the input exceeds {@code tokenLimit}
 * @throws IOException if the token stream or the generator fails
 */
public Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors, int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException {
    final List<CandidateSet> candidateSetsList = new ArrayList<>();
    DirectCandidateGenerator.analyze(stream, new DirectCandidateGenerator.TokenConsumer() {

        CandidateSet currentSet = null;

        private TypeAttribute typeAttribute;

        private final BytesRefBuilder termsRef = new BytesRefBuilder();

        private boolean anyUnigram = false;

        private boolean anyTokens = false;

        @Override
        public void reset(TokenStream stream) {
            super.reset(stream);
            typeAttribute = stream.addAttribute(TypeAttribute.class);
        }

        @Override
        public void nextToken() throws IOException {
            anyTokens = true;
            BytesRef term = fillBytesRef(termsRef);
            // BUGFIX: compare token-type strings with equals(), not ==. The attribute
            // value is not guaranteed to be the interned constant, so reference
            // equality can silently fail for equal strings from other filters.
            if (requireUnigram && ShingleFilter.DEFAULT_TOKEN_TYPE.equals(typeAttribute.type())) {
                // Skip shingles entirely when unigrams are required.
                return;
            }
            anyUnigram = true;
            if (posIncAttr.getPositionIncrement() == 0 && SynonymFilter.TYPE_SYNONYM.equals(typeAttribute.type())) {
                // A synonym at the same position joins the current set as an extra candidate.
                assert currentSet != null;
                long freq = 0;
                if ((freq = generator.frequency(term)) > 0) {
                    currentSet.addOneCandidate(generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood));
                }
            } else {
                // New position: flush the previous set and start a fresh one for this term.
                if (currentSet != null) {
                    candidateSetsList.add(currentSet);
                }
                currentSet = new CandidateSet(Candidate.EMPTY, generator.createCandidate(BytesRef.deepCopyOf(term), true));
            }
        }

        @Override
        public void end() {
            if (currentSet != null) {
                candidateSetsList.add(currentSet);
            }
            if (requireUnigram && !anyUnigram && anyTokens) {
                throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");
            }
        }
    });
    if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
        return Result.EMPTY;
    }
    for (CandidateSet candidateSet : candidateSetsList) {
        generator.drawCandidates(candidateSet);
    }
    // NOTE(review): Double.MIN_VALUE is the smallest POSITIVE double, not negative
    // infinity — if candidate scores can be negative (e.g. log-probabilities),
    // confirm findBestCandiates treats this sentinel as "no cutoff"; otherwise
    // Double.NEGATIVE_INFINITY is likely intended.
    double cutoffScore = Double.MIN_VALUE;
    CandidateScorer scorer = new CandidateScorer(wordScorer, numCorrections, gramSize);
    CandidateSet[] candidateSets = candidateSetsList.toArray(new CandidateSet[candidateSetsList.size()]);
    if (confidence > 0.0) {
        // Score the user's original phrase; corrections must beat it scaled by confidence.
        Candidate[] candidates = new Candidate[candidateSets.length];
        for (int i = 0; i < candidates.length; i++) {
            candidates[i] = candidateSets[i].originalTerm;
        }
        double inputPhraseScore = scorer.score(candidates, candidateSets);
        cutoffScore = inputPhraseScore * confidence;
    }
    Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
    return new Result(bestCandidates, cutoffScore);
}
Also used : Candidate(org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate) BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) TokenStream(org.apache.lucene.analysis.TokenStream) ArrayList(java.util.ArrayList) IOException(java.io.IOException) CandidateSet(org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

BytesRef (org.apache.lucene.util.BytesRef)2 BytesRefBuilder (org.apache.lucene.util.BytesRefBuilder)2 Candidate (org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate)2 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 TokenStream (org.apache.lucene.analysis.TokenStream)1 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)1 CandidateSet (org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.CandidateSet)1