Use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.
Class DirectCandidateGenerator, method postFilter.
/**
 * Passes a candidate through the optional post-filter analyzer and collects the outcome.
 * <p>
 * When no post-filter is configured the candidate is appended to {@code candidates} unchanged.
 * Otherwise {@code candidate.term} is analyzed and one new candidate is appended for every token
 * handed to the consumer.
 *
 * @param candidate  the candidate to filter
 * @param spare      scratch buffer forwarded to {@code analyze}
 * @param byteSpare  scratch buffer each analyzed token is read into
 * @param candidates list that receives the resulting candidates
 * @throws IOException if analysis or the frequency lookup fails
 */
protected void postFilter(final Candidate candidate, final CharsRefBuilder spare, BytesRefBuilder byteSpare,
        final List<Candidate> candidates) throws IOException {
    if (postFilter == null) {
        // Nothing to filter with: keep the candidate as it is.
        candidates.add(candidate);
        return;
    }

    final BytesRefBuilder tokenBytes = byteSpare;
    final TokenConsumer collector = new TokenConsumer() {
        @Override
        public void nextToken() throws IOException {
            this.fillBytesRef(tokenBytes);

            if (posIncAttr.getPositionIncrement() <= 0 || !tokenBytes.get().bytesEquals(candidate.term)) {
                // The token is stacked on a previous position or differs from the candidate's term:
                // carry over the candidate's frequency and use the non-error likelihood.
                candidates.add(new Candidate(tokenBytes.toBytesRef(), candidate.frequency, nonErrorLikelihood,
                        score(candidate.frequency, candidate.stringDistance, dictSize), false));
                return;
            }

            // The analyzed token is byte-identical to the candidate's term.
            final BytesRef lookupTerm = tokenBytes.toBytesRef();
            // We should not use frequency(term) here because it will analyze the term again.
            // If preFilter and postFilter are the same analyzer it would fail.
            final long termFrequency = internalFrequency(lookupTerm);
            candidates.add(new Candidate(tokenBytes.toBytesRef(), termFrequency, candidate.stringDistance,
                    score(candidate.frequency, candidate.stringDistance, dictSize), false));
        }
    };
    analyze(postFilter, candidate.term, field, collector, spare);
}
Use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.
Class NoisyChannelSpellChecker, method getCorrections.
/**
 * Builds one candidate set per token position of {@code stream}, lets the generator draw
 * candidates for each set and returns the best-scoring corrections.
 * <p>
 * Tokens with a position increment of zero and the synonym token type are added as extra candidates
 * to the current set (only when the generator reports a frequency greater than zero); every other
 * token starts a new set. When {@code requireUnigram} is set, shingle-typed tokens are skipped.
 *
 * @param stream         the token stream to analyze
 * @param generator      supplies term frequencies and candidates
 * @param maxErrors      forwarded to the scorer's best-candidate search
 * @param numCorrections forwarded to the {@link CandidateScorer}
 * @param wordScorer     scorer used by the {@link CandidateScorer}
 * @param confidence     when greater than zero, the score of the original terms multiplied by this
 *                       value becomes the cutoff score
 * @param gramSize       forwarded to the {@link CandidateScorer}
 * @return {@code Result.EMPTY} when no candidate set was collected or the number of sets reaches
 *         {@code tokenLimit}; otherwise the best corrections together with the cutoff score
 * @throws IOException           if analysis or a frequency lookup fails
 * @throws IllegalStateException raised by the consumer's {@code end()} callback when
 *                               {@code requireUnigram} is set and tokens were seen but none of them
 *                               was a unigram
 */
public Result getCorrections(TokenStream stream, final CandidateGenerator generator, float maxErrors, int numCorrections,
        WordScorer wordScorer, float confidence, int gramSize) throws IOException {
    final List<CandidateSet> candidateSetsList = new ArrayList<>();
    DirectCandidateGenerator.analyze(stream, new DirectCandidateGenerator.TokenConsumer() {
        CandidateSet currentSet = null;
        private TypeAttribute typeAttribute;
        private final BytesRefBuilder termsRef = new BytesRefBuilder();
        private boolean anyUnigram = false;
        private boolean anyTokens = false;

        @Override
        public void reset(TokenStream stream) {
            super.reset(stream);
            typeAttribute = stream.addAttribute(TypeAttribute.class);
        }

        @Override
        public void nextToken() throws IOException {
            anyTokens = true;
            BytesRef term = fillBytesRef(termsRef);
            // Token types are compared by value, not by reference: '==' on String only matches when the
            // producer happens to hand out the very same String instance as the constant.
            final String tokenType = typeAttribute.type();
            if (requireUnigram && ShingleFilter.DEFAULT_TOKEN_TYPE.equals(tokenType)) {
                return;
            }
            anyUnigram = true;
            if (posIncAttr.getPositionIncrement() == 0 && SynonymFilter.TYPE_SYNONYM.equals(tokenType)) {
                // A synonym stacked on the previous position: extend the current set.
                assert currentSet != null;
                final long freq = generator.frequency(term);
                if (freq > 0) {
                    // deepCopyOf: 'term' is backed by the reused termsRef buffer.
                    currentSet.addOneCandidate(generator.createCandidate(BytesRef.deepCopyOf(term), freq, realWordLikelihood));
                }
            } else {
                // A new position: flush the previous set and start a fresh one for this term.
                if (currentSet != null) {
                    candidateSetsList.add(currentSet);
                }
                currentSet = new CandidateSet(Candidate.EMPTY, generator.createCandidate(BytesRef.deepCopyOf(term), true));
            }
        }

        @Override
        public void end() {
            if (currentSet != null) {
                candidateSetsList.add(currentSet);
            }
            if (requireUnigram && !anyUnigram && anyTokens) {
                throw new IllegalStateException("At least one unigram is required but all tokens were ngrams");
            }
        }
    });

    if (candidateSetsList.isEmpty() || candidateSetsList.size() >= tokenLimit) {
        return Result.EMPTY;
    }

    for (CandidateSet candidateSet : candidateSetsList) {
        generator.drawCandidates(candidateSet);
    }

    double cutoffScore = Double.MIN_VALUE;
    CandidateScorer scorer = new CandidateScorer(wordScorer, numCorrections, gramSize);
    CandidateSet[] candidateSets = candidateSetsList.toArray(new CandidateSet[candidateSetsList.size()]);
    if (confidence > 0.0) {
        // Score the phrase made of the original terms; corrections must beat a fraction of it.
        Candidate[] candidates = new Candidate[candidateSets.length];
        for (int i = 0; i < candidates.length; i++) {
            candidates[i] = candidateSets[i].originalTerm;
        }
        double inputPhraseScore = scorer.score(candidates, candidateSets);
        cutoffScore = inputPhraseScore * confidence;
    }
    Correction[] bestCandidates = scorer.findBestCandiates(candidateSets, maxErrors, cutoffScore);
    return new Result(bestCandidates, cutoffScore);
}
Use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.
Class SimpleLuceneTests, method testSimpleNumericOps.
/**
 * Indexes a single document holding a text {@code _id} field and a stored legacy int field, then
 * verifies that the stored int value reads back as {@code "2"} both when the document is found via
 * its {@code _id} term and when it is found via the prefix-coded numeric term.
 *
 * @throws Exception if indexing or searching fails
 */
public void testSimpleNumericOps() throws Exception {
    // try-with-resources closes the reader, writer and directory even when an assertion or a
    // search throws; previously only the writer was closed, and only on the success path.
    try (Directory dir = new RAMDirectory();
         IndexWriter indexWriter = new IndexWriter(dir, new IndexWriterConfig(Lucene.STANDARD_ANALYZER))) {
        Document document = new Document();
        document.add(new TextField("_id", "1", Field.Store.YES));
        document.add(new LegacyIntField("test", 2, LegacyIntField.TYPE_STORED));
        indexWriter.addDocument(document);

        try (IndexReader reader = DirectoryReader.open(indexWriter)) {
            IndexSearcher searcher = new IndexSearcher(reader);

            // Look the document up by its id and check the stored numeric value.
            TopDocs topDocs = searcher.search(new TermQuery(new Term("_id", "1")), 1);
            Document doc = searcher.doc(topDocs.scoreDocs[0].doc);
            IndexableField f = doc.getField("test");
            assertThat(f.stringValue(), equalTo("2"));

            // Look the document up by the prefix-coded form of the int value (shift 0).
            BytesRefBuilder bytes = new BytesRefBuilder();
            LegacyNumericUtils.intToPrefixCoded(2, 0, bytes);
            topDocs = searcher.search(new TermQuery(new Term("test", bytes.get())), 1);
            doc = searcher.doc(topDocs.scoreDocs[0].doc);
            f = doc.getField("test");
            assertThat(f.stringValue(), equalTo("2"));
        }
    }
}
Use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.
Class BytesBinaryDVAtomicFieldData, method getBytesValues.
/**
 * Returns a per-document view over the binary doc values.
 * <p>
 * As decoded here, a non-empty document value is a vInt value count followed, for each value, by a
 * vInt length and that many bytes. An empty value yields a count of zero. The returned instance
 * reuses its scratch buffers between documents.
 */
@Override
public SortedBinaryDocValues getBytesValues() {
    return new SortedBinaryDocValues() {
        int valueCount;
        BytesRefBuilder[] scratches = new BytesRefBuilder[0];
        final ByteArrayDataInput input = new ByteArrayDataInput();

        @Override
        public void setDocument(int docId) {
            final BytesRef encoded = values.get(docId);
            input.reset(encoded.bytes, encoded.offset, encoded.length);
            if (encoded.length == 0) {
                valueCount = 0;
                return;
            }

            valueCount = input.readVInt();
            ensureScratchCapacity(valueCount);
            for (int slot = 0; slot < valueCount; slot++) {
                final int size = input.readVInt();
                final BytesRefBuilder target = scratches[slot];
                target.grow(size);
                input.readBytes(target.bytes(), 0, size);
                target.setLength(size);
            }
        }

        /** Grows the scratch array to hold at least {@code needed} builders, filling the new slots. */
        private void ensureScratchCapacity(int needed) {
            if (needed <= scratches.length) {
                return;
            }
            final int oldLength = scratches.length;
            scratches = Arrays.copyOf(scratches, ArrayUtil.oversize(needed, RamUsageEstimator.NUM_BYTES_OBJECT_REF));
            for (int slot = oldLength; slot < scratches.length; slot++) {
                scratches[slot] = new BytesRefBuilder();
            }
        }

        @Override
        public int count() {
            return valueCount;
        }

        @Override
        public BytesRef valueAt(int index) {
            return scratches[index].get();
        }
    };
}
Use of org.apache.lucene.util.BytesRefBuilder in project elasticsearch by elastic.
Class BytesRefHashTests, method assertAllIn.
/**
 * Asserts that every string in {@code strings} is already present in {@code hash}: adding each one
 * again must leave the hash size unchanged, and the entry looked up from the returned key must
 * decode back to the same string.
 *
 * @param strings the strings expected to be in the hash
 * @param hash    the hash under test
 */
private void assertAllIn(Set<String> strings, BytesRefHash hash) {
    final BytesRefBuilder encoder = new BytesRefBuilder();
    final BytesRef spare = new BytesRef();
    final long expectedSize = hash.size();
    for (final String expected : strings) {
        encoder.copyChars(expected);
        // add again to check duplicates
        final long key = hash.add(encoder.get());
        // NOTE(review): presumably add() signals an existing entry with a negative key that encodes
        // the id as -1 - id; confirm against the BytesRefHash contract.
        final long existingId = (-key) - 1;
        assertEquals(expected, hash.get(existingId, spare).utf8ToString());
        assertEquals(expectedSize, hash.size());
        assertTrue("key: " + key + " count: " + expectedSize + " string: " + expected, key < expectedSize);
    }
}
Aggregations