Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by Apache.
From the class NumericFacets, method getCountsSingleValue:
private static NamedList<Integer> getCountsSingleValue(SolrIndexSearcher searcher, DocSet docs, String fieldName, int offset, int limit, int mincount, boolean missing, String sort) throws IOException {
boolean zeros = mincount <= 0;
mincount = Math.max(mincount, 1);
final SchemaField sf = searcher.getSchema().getField(fieldName);
final FieldType ft = sf.getType();
final NumberType numericType = ft.getNumberType();
if (numericType == null) {
throw new IllegalStateException();
}
// We don't return zeros when using PointFields or when index=false
zeros = zeros && !ft.isPointField() && sf.indexed();
final List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
// 1. accumulate
final HashTable hashTable = new HashTable(true);
final Iterator<LeafReaderContext> ctxIt = leaves.iterator();
LeafReaderContext ctx = null;
NumericDocValues longs = null;
int missingCount = 0;
for (DocIterator docsIt = docs.iterator(); docsIt.hasNext(); ) {
final int doc = docsIt.nextDoc();
if (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc()) {
do {
ctx = ctxIt.next();
} while (ctx == null || doc >= ctx.docBase + ctx.reader().maxDoc());
assert doc >= ctx.docBase;
switch(numericType) {
case LONG:
case DATE:
case INTEGER:
// Long, Date and Integer
longs = DocValues.getNumeric(ctx.reader(), fieldName);
break;
case FLOAT:
// TODO: this bit flipping should probably be moved to tie-break in the PQ comparator
longs = new FilterNumericDocValues(DocValues.getNumeric(ctx.reader(), fieldName)) {
@Override
public long longValue() throws IOException {
long bits = super.longValue();
if (bits < 0)
bits ^= 0x7fffffffffffffffL;
return bits;
}
};
break;
case DOUBLE:
// TODO: this bit flipping should probably be moved to tie-break in the PQ comparator
longs = new FilterNumericDocValues(DocValues.getNumeric(ctx.reader(), fieldName)) {
@Override
public long longValue() throws IOException {
long bits = super.longValue();
if (bits < 0)
bits ^= 0x7fffffffffffffffL;
return bits;
}
};
break;
default:
throw new AssertionError("Unexpected type: " + numericType);
}
}
int valuesDocID = longs.docID();
if (valuesDocID < doc - ctx.docBase) {
valuesDocID = longs.advance(doc - ctx.docBase);
}
if (valuesDocID == doc - ctx.docBase) {
hashTable.add(doc, longs.longValue(), 1);
} else {
++missingCount;
}
}
// 2. select top-k facet values
final int pqSize = limit < 0 ? hashTable.size : Math.min(offset + limit, hashTable.size);
final PriorityQueue<Entry> pq;
if (FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) {
pq = new PriorityQueue<Entry>(pqSize) {
@Override
protected boolean lessThan(Entry a, Entry b) {
if (a.count < b.count || (a.count == b.count && a.bits > b.bits)) {
return true;
} else {
return false;
}
}
};
} else {
pq = new PriorityQueue<Entry>(pqSize) {
@Override
protected boolean lessThan(Entry a, Entry b) {
return a.bits > b.bits;
}
};
}
Entry e = null;
for (int i = 0; i < hashTable.bits.length; ++i) {
if (hashTable.counts[i] >= mincount) {
if (e == null) {
e = new Entry();
}
e.bits = hashTable.bits[i];
e.count = hashTable.counts[i];
e.docID = hashTable.docIDs[i];
e = pq.insertWithOverflow(e);
}
}
// 4. build the NamedList
final ValueSource vs = ft.getValueSource(sf, null);
final NamedList<Integer> result = new NamedList<>();
// entries selected in the PQ may still need to be merged with zero-count terms from the terms dict
if (!zeros || FacetParams.FACET_SORT_COUNT.equals(sort) || FacetParams.FACET_SORT_COUNT_LEGACY.equals(sort)) {
// Only keep items we're interested in
final Deque<Entry> counts = new ArrayDeque<>();
while (pq.size() > offset) {
counts.addFirst(pq.pop());
}
// Entries from the PQ first, then using the terms dictionary
for (Entry entry : counts) {
final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
result.add(values.strVal(entry.docID - leaves.get(readerIdx).docBase), entry.count);
}
if (zeros && (limit < 0 || result.size() < limit)) {
// need to merge with the term dict
if (!sf.indexed() && !sf.hasDocValues()) {
throw new IllegalStateException("Cannot use " + FacetParams.FACET_MINCOUNT + "=0 on field " + sf.getName() + " which is neither indexed nor docValues");
}
// Add zeros until there are limit results
final Set<String> alreadySeen = new HashSet<>();
while (pq.size() > 0) {
Entry entry = pq.pop();
final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
alreadySeen.add(values.strVal(entry.docID - leaves.get(readerIdx).docBase));
}
for (int i = 0; i < result.size(); ++i) {
alreadySeen.add(result.getName(i));
}
final Terms terms = searcher.getSlowAtomicReader().terms(fieldName);
if (terms != null) {
final String prefixStr = TrieField.getMainValuePrefix(ft);
final BytesRef prefix;
if (prefixStr != null) {
prefix = new BytesRef(prefixStr);
} else {
prefix = new BytesRef();
}
final TermsEnum termsEnum = terms.iterator();
BytesRef term;
switch(termsEnum.seekCeil(prefix)) {
case FOUND:
case NOT_FOUND:
term = termsEnum.term();
break;
case END:
term = null;
break;
default:
throw new AssertionError();
}
final CharsRefBuilder spare = new CharsRefBuilder();
for (int skipped = hashTable.size; skipped < offset && term != null && StringHelper.startsWith(term, prefix); ) {
ft.indexedToReadable(term, spare);
final String termStr = spare.toString();
if (!alreadySeen.contains(termStr)) {
++skipped;
}
term = termsEnum.next();
}
for (; term != null && StringHelper.startsWith(term, prefix) && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
ft.indexedToReadable(term, spare);
final String termStr = spare.toString();
if (!alreadySeen.contains(termStr)) {
result.add(termStr, 0);
}
}
}
}
} else {
// zeros requested with index sort: merge the PQ and the terms dictionary on the fly
if (!sf.indexed()) {
throw new IllegalStateException("Cannot use " + FacetParams.FACET_SORT + "=" + FacetParams.FACET_SORT_INDEX + " on a field which is not indexed");
}
final Map<String, Integer> counts = new HashMap<>();
while (pq.size() > 0) {
final Entry entry = pq.pop();
final int readerIdx = ReaderUtil.subIndex(entry.docID, leaves);
final FunctionValues values = vs.getValues(Collections.emptyMap(), leaves.get(readerIdx));
counts.put(values.strVal(entry.docID - leaves.get(readerIdx).docBase), entry.count);
}
final Terms terms = searcher.getSlowAtomicReader().terms(fieldName);
if (terms != null) {
final String prefixStr = TrieField.getMainValuePrefix(ft);
final BytesRef prefix;
if (prefixStr != null) {
prefix = new BytesRef(prefixStr);
} else {
prefix = new BytesRef();
}
final TermsEnum termsEnum = terms.iterator();
BytesRef term;
switch(termsEnum.seekCeil(prefix)) {
case FOUND:
case NOT_FOUND:
term = termsEnum.term();
break;
case END:
term = null;
break;
default:
throw new AssertionError();
}
final CharsRefBuilder spare = new CharsRefBuilder();
for (int i = 0; i < offset && term != null && StringHelper.startsWith(term, prefix); ++i) {
term = termsEnum.next();
}
for (; term != null && StringHelper.startsWith(term, prefix) && (limit < 0 || result.size() < limit); term = termsEnum.next()) {
ft.indexedToReadable(term, spare);
final String termStr = spare.toString();
Integer count = counts.get(termStr);
if (count == null) {
count = 0;
}
result.add(termStr, count);
}
}
}
if (missing) {
result.add(null, missingCount);
}
return result;
}
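The CharsRefBuilder in this method is the reusable scratch buffer that FieldType.indexedToReadable fills for every term pulled from the terms dictionary, so a single char buffer serves the whole merge loop. A minimal sketch of that pattern with the faceting logic stripped out (the class and method names below are hypothetical, not part of lucene-solr):

import java.io.IOException;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.solr.schema.FieldType;

final class ReadableTermsExample {
  // Walk every indexed term of a field and print its human-readable form,
  // reusing one CharsRefBuilder exactly as getCountsSingleValue does above.
  static void dumpReadableTerms(Terms terms, FieldType ft) throws IOException {
    TermsEnum termsEnum = terms.iterator();
    CharsRefBuilder spare = new CharsRefBuilder(); // reused scratch buffer
    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
      ft.indexedToReadable(term, spare); // decode the indexed bytes into the builder
      System.out.println(spare.toString());
    }
  }
}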
Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by Apache.
From the class SolrIndexSplitter, method split:
FixedBitSet[] split(LeafReaderContext readerContext) throws IOException {
LeafReader reader = readerContext.reader();
FixedBitSet[] docSets = new FixedBitSet[numPieces];
for (int i = 0; i < docSets.length; i++) {
docSets[i] = new FixedBitSet(reader.maxDoc());
}
Bits liveDocs = reader.getLiveDocs();
Fields fields = reader.fields();
Terms terms = fields == null ? null : fields.terms(field.getName());
TermsEnum termsEnum = terms == null ? null : terms.iterator();
if (termsEnum == null)
return docSets;
BytesRef term = null;
PostingsEnum postingsEnum = null;
int[] docsMatchingRanges = null;
if (ranges != null) {
// +1 because documents can belong to *zero*, one, several or all ranges in rangesArr
docsMatchingRanges = new int[rangesArr.length + 1];
}
CharsRefBuilder idRef = new CharsRefBuilder();
for (; ; ) {
term = termsEnum.next();
if (term == null)
break;
// figure out the hash for the term
// FUTURE: if conversion to strings costs too much, we could
// specialize and use the hash function that can work over bytes.
field.getType().indexedToReadable(term, idRef);
String idString = idRef.toString();
if (splitKey != null) {
// todo have composite routers support these kind of things instead
String part1 = getRouteKey(idString);
if (part1 == null)
continue;
if (!splitKey.equals(part1)) {
continue;
}
}
int hash = 0;
if (hashRouter != null) {
hash = hashRouter.sliceHash(idString, null, null, null);
}
postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
postingsEnum = BitsFilteredPostingsEnum.wrap(postingsEnum, liveDocs);
for (; ; ) {
int doc = postingsEnum.nextDoc();
if (doc == DocIdSetIterator.NO_MORE_DOCS)
break;
if (ranges == null) {
docSets[currPartition].set(doc);
currPartition = (currPartition + 1) % numPieces;
} else {
int matchingRangesCount = 0;
for (int i = 0; i < rangesArr.length; i++) {
// inner-loop: use array here for extra speed.
if (rangesArr[i].includes(hash)) {
docSets[i].set(doc);
++matchingRangesCount;
}
}
docsMatchingRanges[matchingRangesCount]++;
}
}
}
if (docsMatchingRanges != null) {
for (int ii = 0; ii < docsMatchingRanges.length; ii++) {
if (0 == docsMatchingRanges[ii])
continue;
switch(ii) {
case 0:
// document loss
log.error("Splitting {}: {} documents belong to no shards and will be dropped", reader, docsMatchingRanges[ii]);
break;
case 1:
// normal case, each document moves to one of the sub-shards
log.info("Splitting {}: {} documents will move into a sub-shard", reader, docsMatchingRanges[ii]);
break;
default:
// document duplication
log.error("Splitting {}: {} documents will be moved to multiple ({}) sub-shards", reader, docsMatchingRanges[ii], ii);
break;
}
}
}
return docSets;
}
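split uses CharsRefBuilder the same way: each unique id term is decoded once into the reused idRef buffer and only then turned into the String the hash router needs. A rough sketch of that per-term decode-and-bucket step, with a plain hashCode standing in for the router's sliceHash (the names below are hypothetical):

import java.io.IOException;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;
import org.apache.solr.schema.FieldType;

final class BucketByIdExample {
  // Count how many unique ids would land in each of numPieces buckets.
  // The CharsRefBuilder is reused across terms; only the String handed to
  // the hash function is allocated per term.
  static int[] bucketCounts(TermsEnum idTerms, FieldType idType, int numPieces) throws IOException {
    int[] counts = new int[numPieces];
    CharsRefBuilder idRef = new CharsRefBuilder();
    for (BytesRef term = idTerms.next(); term != null; term = idTerms.next()) {
      idType.indexedToReadable(term, idRef);             // indexed bytes -> readable chars
      String id = idRef.toString();
      counts[Math.floorMod(id.hashCode(), numPieces)]++; // stand-in for hashRouter.sliceHash
    }
    return counts;
  }
}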
Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by Apache.
From the class MoreLikeThis, method addTermFrequencies:
/**
* Adds the terms and frequencies found in vector to the per-field map inside field2termFreqMap.
*
* @param field2termFreqMap a Map of terms and their frequencies per field
* @param vector the term vector (terms and their frequencies) for a doc/field
* @param fieldName the field the term vector is for; used as the key into field2termFreqMap
*/
private void addTermFrequencies(Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName) throws IOException {
Map<String, Int> termFreqMap = field2termFreqMap.get(fieldName);
if (termFreqMap == null) {
termFreqMap = new HashMap<>();
field2termFreqMap.put(fieldName, termFreqMap);
}
final TermsEnum termsEnum = vector.iterator();
final CharsRefBuilder spare = new CharsRefBuilder();
BytesRef text;
while ((text = termsEnum.next()) != null) {
spare.copyUTF8Bytes(text);
final String term = spare.toString();
if (isNoiseWord(term)) {
continue;
}
final int freq = (int) termsEnum.totalTermFreq();
// increment frequency
Int cnt = termFreqMap.get(term);
if (cnt == null) {
cnt = new Int();
termFreqMap.put(term, cnt);
cnt.x = freq;
} else {
cnt.x += freq;
}
}
}
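Here CharsRefBuilder.copyUTF8Bytes does the UTF-8 to UTF-16 conversion for every entry in the term vector, again through a single reused builder. The same pattern without the MoreLikeThis bookkeeping (noise-word filtering, the Int wrapper) might look like this sketch; the method name is ours:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

final class TermFrequenciesExample {
  // Decode each term of a single-document term vector to a String via a
  // reused CharsRefBuilder and sum its in-document frequency.
  static Map<String, Long> termFrequencies(Terms vector) throws IOException {
    Map<String, Long> freqs = new HashMap<>();
    TermsEnum termsEnum = vector.iterator();
    CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
      spare.copyUTF8Bytes(text); // UTF-8 bytes -> UTF-16 chars
      freqs.merge(spare.toString(), termsEnum.totalTermFreq(), Long::sum);
    }
    return freqs;
  }
}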
Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by Apache.
From the class FreeTextSuggester, method lookup:
/** Retrieve suggestions. */
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, int num) throws IOException {
if (contexts != null) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
if (fst == null) {
throw new IllegalStateException("Lookup not supported at this time");
}
try (TokenStream ts = queryAnalyzer.tokenStream("", key.toString())) {
TermToBytesRefAttribute termBytesAtt = ts.addAttribute(TermToBytesRefAttribute.class);
OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
PositionLengthAttribute posLenAtt = ts.addAttribute(PositionLengthAttribute.class);
PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
ts.reset();
BytesRefBuilder[] lastTokens = new BytesRefBuilder[grams];
//System.out.println("lookup: key='" + key + "'");
// Run full analysis, but save only the
// last 1gram, last 2gram, etc.:
int maxEndOffset = -1;
boolean sawRealToken = false;
while (ts.incrementToken()) {
BytesRef tokenBytes = termBytesAtt.getBytesRef();
sawRealToken |= tokenBytes.length > 0;
// TODO: this is somewhat iffy; today, ShingleFilter
// sets posLen to the gram count; maybe we should make
// a separate dedicated att for this?
int gramCount = posLenAtt.getPositionLength();
assert gramCount <= grams;
// Safety: make sure the recalculated count "agrees":
if (countGrams(tokenBytes) != gramCount) {
throw new IllegalArgumentException("tokens must not contain separator byte; got token=" + tokenBytes + " but gramCount=" + gramCount + " does not match recalculated count=" + countGrams(tokenBytes));
}
maxEndOffset = Math.max(maxEndOffset, offsetAtt.endOffset());
BytesRefBuilder b = new BytesRefBuilder();
b.append(tokenBytes);
lastTokens[gramCount - 1] = b;
}
ts.end();
if (!sawRealToken) {
throw new IllegalArgumentException("no tokens produced by analyzer, or the only tokens were empty strings");
}
// Carefully fill last tokens with _ tokens;
// ShingleFilter apparently won't emit "only hole"
// tokens:
int endPosInc = posIncAtt.getPositionIncrement();
// Note this will also be true if input is the empty
// string (in which case we saw no tokens and
// maxEndOffset is still -1), which in fact works out OK
// because we fill the unigram with an empty BytesRef
// below:
boolean lastTokenEnded = offsetAtt.endOffset() > maxEndOffset || endPosInc > 0;
if (lastTokenEnded) {
// the last token ended (e.g. trailing space): shift every gram up one slot so we predict the next token rather than complete the current one:
for (int i = grams - 1; i > 0; i--) {
BytesRefBuilder token = lastTokens[i - 1];
if (token == null) {
continue;
}
token.append(separator);
lastTokens[i] = token;
}
lastTokens[0] = new BytesRefBuilder();
}
Arc<Long> arc = new Arc<>();
BytesReader bytesReader = fst.getBytesReader();
// Try highest order models first, and if they return
// results, return that; else, fallback:
double backoff = 1.0;
List<LookupResult> results = new ArrayList<>(num);
// We only add a given suffix once, from the highest
// order model that saw it; for subsequent lower order
// models we skip it:
final Set<BytesRef> seen = new HashSet<>();
for (int gram = grams - 1; gram >= 0; gram--) {
BytesRefBuilder token = lastTokens[gram];
// Don't make unigram predictions from empty string:
if (token == null || (token.length() == 0 && key.length() > 0)) {
//System.out.println(" gram=" + gram + ": skip: not enough input");
continue;
}
if (endPosInc > 0 && gram <= endPosInc) {
//System.out.println(" break: only holes now");
break;
}
//System.out.println("try " + (gram+1) + " gram token=" + token.utf8ToString());
// TODO: we could add fuzziness here
// match the prefix portion exactly
//Pair<Long,BytesRef> prefixOutput = null;
Long prefixOutput = null;
try {
prefixOutput = lookupPrefix(fst, bytesReader, token.get(), arc);
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
if (prefixOutput == null) {
// This model never saw this prefix, e.g. the
// trigram model never saw context "purple mushroom"
backoff *= ALPHA;
continue;
}
// TODO: we could do this division at build time, and
// bake it into the FST?
// Denominator for computing scores from current
// model's predictions:
long contextCount = totTokens;
BytesRef lastTokenFragment = null;
for (int i = token.length() - 1; i >= 0; i--) {
if (token.byteAt(i) == separator) {
BytesRef context = new BytesRef(token.bytes(), 0, i);
Long output = Util.get(fst, Util.toIntsRef(context, new IntsRefBuilder()));
assert output != null;
contextCount = decodeWeight(output);
lastTokenFragment = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
break;
}
}
final BytesRefBuilder finalLastToken = new BytesRefBuilder();
if (lastTokenFragment == null) {
finalLastToken.copyBytes(token.get());
} else {
finalLastToken.copyBytes(lastTokenFragment);
}
CharsRefBuilder spare = new CharsRefBuilder();
// complete top-N
TopResults<Long> completions = null;
try {
// Because we store multiple models in one FST
// (1gram, 2gram, 3gram), we must restrict the
// search so that it only considers the current
// model. For highest order model, this is not
// necessary since all completions in the FST
// must be from this model, but for lower order
// models we have to filter out the higher order
// ones:
// Must do num+seen.size() for queue depth because we may
// reject up to seen.size() paths in acceptResult():
Util.TopNSearcher<Long> searcher = new Util.TopNSearcher<Long>(fst, num, num + seen.size(), weightComparator) {
BytesRefBuilder scratchBytes = new BytesRefBuilder();
@Override
protected void addIfCompetitive(Util.FSTPath<Long> path) {
if (path.arc.label != separator) {
//System.out.println(" keep path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
super.addIfCompetitive(path);
} else {
//System.out.println(" prevent path: " + Util.toBytesRef(path.input, new BytesRef()).utf8ToString() + "; " + path + "; arc=" + path.arc);
}
}
@Override
protected boolean acceptResult(IntsRef input, Long output) {
Util.toBytesRef(input, scratchBytes);
finalLastToken.grow(finalLastToken.length() + scratchBytes.length());
int lenSav = finalLastToken.length();
finalLastToken.append(scratchBytes);
//System.out.println(" accept? input='" + scratchBytes.utf8ToString() + "'; lastToken='" + finalLastToken.utf8ToString() + "'; return " + (seen.contains(finalLastToken) == false));
boolean ret = seen.contains(finalLastToken.get()) == false;
finalLastToken.setLength(lenSav);
return ret;
}
};
// since this search is initialized with a single start node
// it is okay to start with an empty input path here
searcher.addStartPaths(arc, prefixOutput, true, new IntsRefBuilder());
completions = searcher.search();
assert completions.isComplete;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
int prefixLength = token.length();
BytesRefBuilder suffix = new BytesRefBuilder();
nextCompletion: for (Result<Long> completion : completions) {
token.setLength(prefixLength);
// append suffix
Util.toBytesRef(completion.input, suffix);
token.append(suffix);
//System.out.println(" completion " + token.utf8ToString());
// Skip this path if a higher-order model already
// saw/predicted its last token:
BytesRef lastToken = token.get();
for (int i = token.length() - 1; i >= 0; i--) {
if (token.byteAt(i) == separator) {
assert token.length() - i - 1 > 0;
lastToken = new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
break;
}
}
if (seen.contains(lastToken)) {
//System.out.println(" skip dup " + lastToken.utf8ToString());
continue nextCompletion;
}
seen.add(BytesRef.deepCopyOf(lastToken));
spare.copyUTF8Bytes(token.get());
LookupResult result = new LookupResult(spare.toString(), (long) (Long.MAX_VALUE * backoff * ((double) decodeWeight(completion.output)) / contextCount));
results.add(result);
assert results.size() == seen.size();
//System.out.println(" add result=" + result);
}
backoff *= ALPHA;
}
Collections.sort(results, new Comparator<LookupResult>() {
@Override
public int compare(LookupResult a, LookupResult b) {
if (a.value > b.value) {
return -1;
} else if (a.value < b.value) {
return 1;
} else {
// Tie break by UTF16 sort order:
return ((String) a.key).compareTo((String) b.key);
}
}
});
if (results.size() > num) {
results.subList(num, results.size()).clear();
}
return results;
}
}
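Within lookup, the CharsRefBuilder named spare is only touched when a completion is emitted, converting the assembled UTF-8 token into the String key of the LookupResult; everything before that stays in BytesRef form, including the repeated backward scan for the separator byte that isolates the last gram. That scan, factored out as a hypothetical helper (the separator is whatever byte the suggester was configured with):

import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;

final class LastGramExample {
  // Return the bytes after the last separator, i.e. the final gram of a
  // separator-delimited ngram token, or the whole token if no separator occurs.
  static BytesRef lastGram(BytesRefBuilder token, byte separator) {
    for (int i = token.length() - 1; i >= 0; i--) {
      if (token.byteAt(i) == separator) {
        return new BytesRef(token.bytes(), i + 1, token.length() - i - 1);
      }
    }
    return token.get();
  }
}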
Use of org.apache.lucene.util.CharsRefBuilder in project lucene-solr by Apache.
From the class AnalyzingSuggester, method lookup:
@Override
public List<LookupResult> lookup(final CharSequence key, Set<BytesRef> contexts, boolean onlyMorePopular, int num) {
assert num > 0;
if (onlyMorePopular) {
throw new IllegalArgumentException("this suggester only works with onlyMorePopular=false");
}
if (contexts != null) {
throw new IllegalArgumentException("this suggester doesn't support contexts");
}
if (fst == null) {
return Collections.emptyList();
}
//System.out.println("lookup key=" + key + " num=" + num);
for (int i = 0; i < key.length(); i++) {
if (key.charAt(i) == 0x1E) {
throw new IllegalArgumentException("lookup key cannot contain HOLE character U+001E; this character is reserved");
}
if (key.charAt(i) == 0x1F) {
throw new IllegalArgumentException("lookup key cannot contain unit separator character U+001F; this character is reserved");
}
}
final BytesRef utf8Key = new BytesRef(key);
try {
Automaton lookupAutomaton = toLookupAutomaton(key);
final CharsRefBuilder spare = new CharsRefBuilder();
//System.out.println(" now intersect exactFirst=" + exactFirst);
// Intersect automaton w/ suggest wFST and get all
// prefix starting nodes & their outputs:
//final PathIntersector intersector = getPathIntersector(lookupAutomaton, fst);
//System.out.println(" prefixPaths: " + prefixPaths.size());
BytesReader bytesReader = fst.getBytesReader();
FST.Arc<Pair<Long, BytesRef>> scratchArc = new FST.Arc<>();
final List<LookupResult> results = new ArrayList<>();
List<FSTUtil.Path<Pair<Long, BytesRef>>> prefixPaths = FSTUtil.intersectPrefixPaths(convertAutomaton(lookupAutomaton), fst);
if (exactFirst) {
int count = 0;
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
count++;
}
}
// Searcher just to find the single exact only
// match, if present:
Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
searcher = new Util.TopNSearcher<>(fst, count * maxSurfaceFormsPerAnalyzedForm, count * maxSurfaceFormsPerAnalyzedForm, weightComparator);
// ...:
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
if (fst.findTargetArc(END_BYTE, path.fstNode, scratchArc, bytesReader) != null) {
// This node has END_BYTE arc leaving, meaning it's an
// "exact" match:
searcher.addStartPaths(scratchArc, fst.outputs.add(path.output, scratchArc.output), false, path.input);
}
}
TopResults<Pair<Long, BytesRef>> completions = searcher.search();
assert completions.isComplete;
// linear scan over the completions (at most count * maxSurfaceFormsPerAnalyzedForm of them) looking for the exact surface-form match:
for (Result<Pair<Long, BytesRef>> completion : completions) {
BytesRef output2 = completion.output.output2;
if (sameSurfaceForm(utf8Key, output2)) {
results.add(getLookupResult(completion.output.output1, output2, spare));
break;
}
}
if (results.size() == num) {
// That was quick:
return results;
}
}
Util.TopNSearcher<Pair<Long, BytesRef>> searcher;
searcher = new Util.TopNSearcher<Pair<Long, BytesRef>>(fst, num - results.size(), num * maxAnalyzedPathsForOneInput, weightComparator) {
private final Set<BytesRef> seen = new HashSet<>();
@Override
protected boolean acceptResult(IntsRef input, Pair<Long, BytesRef> output) {
// dedup: the analysis may produce multiple paths, so we can get duplicate surface forms:
if (seen.contains(output.output2)) {
return false;
}
seen.add(output.output2);
if (!exactFirst) {
return true;
} else {
// in exactFirst mode, don't accept paths that would create duplicate results:
if (sameSurfaceForm(utf8Key, output.output2)) {
// we already found this surface form in the first (exact) search:
assert results.size() == 1;
return false;
} else {
return true;
}
}
}
};
prefixPaths = getFullPrefixPaths(prefixPaths, lookupAutomaton, fst);
for (FSTUtil.Path<Pair<Long, BytesRef>> path : prefixPaths) {
searcher.addStartPaths(path.fstNode, path.output, true, path.input);
}
TopResults<Pair<Long, BytesRef>> completions = searcher.search();
assert completions.isComplete;
for (Result<Pair<Long, BytesRef>> completion : completions) {
LookupResult result = getLookupResult(completion.output.output1, completion.output.output2, spare);
// TODO: for fuzzy case would be nice to return
// how many edits were required
//System.out.println(" result=" + result);
results.add(result);
if (results.size() == num) {
// in the exactFirst case the search may produce one extra path, so stop once we have num results
break;
}
}
return results;
} catch (IOException bogus) {
throw new RuntimeException(bogus);
}
}
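As in FreeTextSuggester, the single spare CharsRefBuilder is threaded through getLookupResult so each surface form is decoded to a String only once, when the result is emitted. A hedged sketch of what such a helper reduces to when payloads are ignored (the real getLookupResult also splits off an optional payload and decodes the weight from the FST output; the class name here is ours):

import org.apache.lucene.search.suggest.Lookup.LookupResult;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRefBuilder;

final class SurfaceFormExample {
  // Decode a UTF-8 surface form through the shared CharsRefBuilder and
  // pair it with an already-decoded weight.
  static LookupResult toLookupResult(long weight, BytesRef surfaceForm, CharsRefBuilder spare) {
    spare.copyUTF8Bytes(surfaceForm);
    return new LookupResult(spare.toString(), weight);
  }
}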