Use of org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate in project elasticsearch by elastic.
From the class Correction, method join. The method joins the candidate terms with the given separator and, when preTag is non-null, wraps corrected (non-user-input) terms in the pre/post tags; a run of consecutive corrected terms shares a single pair of tags.
public BytesRef join(BytesRef separator, BytesRefBuilder result, BytesRef preTag, BytesRef postTag) {
    BytesRef[] toJoin = new BytesRef[this.candidates.length];
    // capacity estimate for the joined bytes: separators first, term lengths added below
    int len = separator.length * this.candidates.length - 1;
    for (int i = 0; i < toJoin.length; i++) {
        Candidate candidate = candidates[i];
        if (preTag == null || candidate.userInput) {
            // no highlighting requested, or the term is the user's original input
            toJoin[i] = candidate.term;
        } else {
            final int maxLen = preTag.length + postTag.length + candidate.term.length;
            // just allocate once
            final BytesRefBuilder highlighted = new BytesRefBuilder();
            highlighted.grow(maxLen);
            // open the tag only at the start of a run of corrected terms
            if (i == 0 || candidates[i - 1].userInput) {
                highlighted.append(preTag);
            }
            highlighted.append(candidate.term);
            // close the tag only at the end of a run of corrected terms
            if (toJoin.length == i + 1 || candidates[i + 1].userInput) {
                highlighted.append(postTag);
            }
            toJoin[i] = highlighted.get();
        }
        len += toJoin[i].length;
    }
    result.grow(len);
    return WordScorer.join(separator, result, toJoin);
}
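A minimal call-site sketch, not taken from the Elasticsearch source: it assumes a Correction instance named correction whose candidates are quick (user input), brown and fox (both corrected), and jumps (user input); separator and tag values are illustrative.

BytesRef separator = new BytesRef(" ");
BytesRefBuilder byteSpare = new BytesRefBuilder(); // scratch buffer reused by join
BytesRef joined = correction.join(separator, byteSpare, new BytesRef("<em>"), new BytesRef("</em>"));
// Adjacent corrected terms share one tag pair:
// joined.utf8ToString() -> "quick <em>brown fox</em> jumps"
String highlighted = joined.utf8ToString();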
Use of org.elasticsearch.search.suggest.phrase.DirectCandidateGenerator.Candidate in project elasticsearch by elastic.
From the class NoisyChannelSpellChecker, method getCorrections. The method analyzes the token stream into one CandidateSet per term (folding same-position synonyms into the current set), asks the generator to draw correction candidates for each set, and then scores candidate phrases, discarding any that do not beat the original phrase's score times the confidence factor.
public Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors,
        int numCorrections, WordScorer wordScorer, float confidence, int gramSize) throws IOException {
    final List<CandidateSet> candidateSetsList = new ArrayList<>();
    // requireUnigram, realWordLikelihood and tokenLimit are fields of NoisyChannelSpellChecker;
    // posIncAttr and fillBytesRef come from the TokenConsumer base class
    DirectCandidateGenerator.analyze(stream, new DirectCandidateGenerator.TokenConsumer() {
        CandidateSet currentSet = null;
        private TypeAttribute typeAttribute;
        private final BytesRefBuilder termsRef = new BytesRefBuilder();
        private boolean anyUnigram = false;
        private boolean anyTokens = false;

        @Override
        public void reset(TokenStream stream) {
            super.reset(stream);
            typeAttribute = stream.addAttribute(TypeAttribute.class);
        }

        @Override
        public void nextToken() throws IOException {
            anyTokens = true;
            BytesRef term = fillBytesRef(termsRef);
            // skip shingles (ngrams) when unigrams are required
            if (requireUnigram && typeAttribute.type() == ShingleFilter.DEFAULT_TOKEN_TYPE) {
                return;
            }
            anyUnigram = true;
            if (posIncAttr.getPositionIncrement() == 0 && typeAttribute.type() == SynonymFilter.TYPE_SYNONYM) {
                // a synonym at the same position: add it to the current set instead of starting a new one
                assert currentSet != null;
                long freq = 0;
                if ((freq = generator.frequency(term)) > 0) {
                    currentSet.addOneCandidate(generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood));
                }
            } else {
                // a new term: close the previous candidate set and open a new one
                if (currentSet != null) {
                    candidateSetsList.add(currentSet);
                }
                currentSet = new CandidateSet(Candidate.EMPTY, generator.createCandidate(BytesRef.deepCopyOf(term), true));
            }
        }

        @Override
        public void end() {
            if (currentSet != null) {
                candidateSetsList.add(currentSet);
            }
            if (requireUnigram && !anyUnigram && anyTokens) {
                throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");
            }
        }
    });
    if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
        return Result.EMPTY;
    }
    // ask the generator for correction candidates for every term
    for (CandidateSet candidateSet : candidateSetsList) {
        generator.drawCandidates(candidateSet);
    }
    double cutoffScore = Double.MIN_VALUE;
    CandidateScorer scorer = new CandidateScorer(wordScorer, numCorrections, gramSize);
    CandidateSet[] candidateSets = candidateSetsList.toArray(new CandidateSet[candidateSetsList.size()]);
    if (confidence > 0.0) {
        // score the phrase as the user typed it; corrections must beat inputPhraseScore * confidence
        Candidate[] candidates = new Candidate[candidateSets.length];
        for (int i = 0; i < candidates.length; i++) {
            candidates[i] = candidateSets[i].originalTerm;
        }
        double inputPhraseScore = scorer.score(candidates, candidateSets);
        cutoffScore = inputPhraseScore * confidence;
    }
    Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
    return new Result(bestCandidates, cutoffScore);
}
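A hedged call-site sketch, assuming spellChecker, generator, wordScorer and analyzer were constructed elsewhere; the field name, input text and parameter values are illustrative, and it assumes Result exposes its corrections array as in the Elasticsearch source.

try (TokenStream tokenStream = analyzer.tokenStream("body", "quikc brwn fx")) {
    NoisyChannelSpellChecker.Result result = spellChecker.getCorrections(
        tokenStream,
        generator,   // e.g. a DirectCandidateGenerator over the "body" field
        0.5f,        // maxErrors: how many terms may be corrected (fraction or absolute count)
        5,           // numCorrections: number of corrections to return
        wordScorer,  // phrase scorer, e.g. a LaplaceScorer
        0.5f,        // confidence: cutoff = score(original phrase) * confidence
        1);          // gramSize: unigram scoring
    for (Correction correction : result.corrections) {
        // render each correction with the join method shown above, highlighting changed terms
        System.out.println(correction.join(new BytesRef(" "), new BytesRefBuilder(),
                new BytesRef("<em>"), new BytesRef("</em>")).utf8ToString());
    }
}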