Use of java.util.PriorityQueue in project lucene-solr by apache.
The class NearestNeighbor, method nearest().
// TODO: can we somehow share more with, or simply directly use, the LatLonPointDistanceComparator? It's really doing the same thing as
// our hitQueue...
public static NearestHit[] nearest(double pointLat, double pointLon, List<BKDReader> readers, List<Bits> liveDocs, List<Integer> docBases, final int n) throws IOException {
  //System.out.println("NEAREST: readers=" + readers + " liveDocs=" + liveDocs + " pointLat=" + pointLat + " pointLon=" + pointLon);
  // Holds closest collected points seen so far:
  // TODO: if we used lucene's PQ we could just updateTop instead of poll/offer:
  final PriorityQueue<NearestHit> hitQueue = new PriorityQueue<>(n, new Comparator<NearestHit>() {
    @Override
    public int compare(NearestHit a, NearestHit b) {
      // sort by opposite distanceMeters natural order
      int cmp = Double.compare(a.distanceMeters, b.distanceMeters);
      if (cmp != 0) {
        return -cmp;
      }
      // tie-break by higher docID:
      return b.docID - a.docID;
    }
  });

  // Holds all cells, sorted by closest to the point:
  PriorityQueue<Cell> cellQueue = new PriorityQueue<>();

  NearestVisitor visitor = new NearestVisitor(hitQueue, n, pointLat, pointLon);
  List<BKDReader.IntersectState> states = new ArrayList<>();

  // Add root cell for each reader into the queue:
  int bytesPerDim = -1;
  for (int i = 0; i < readers.size(); i++) {
    BKDReader reader = readers.get(i);
    if (bytesPerDim == -1) {
      bytesPerDim = reader.getBytesPerDimension();
    } else if (bytesPerDim != reader.getBytesPerDimension()) {
      throw new IllegalStateException("bytesPerDim changed from " + bytesPerDim + " to " + reader.getBytesPerDimension() + " across readers");
    }
    byte[] minPackedValue = reader.getMinPackedValue();
    byte[] maxPackedValue = reader.getMaxPackedValue();
    IntersectState state = reader.getIntersectState(visitor);
    states.add(state);
    cellQueue.offer(new Cell(state.index, i, reader.getMinPackedValue(), reader.getMaxPackedValue(), approxBestDistance(minPackedValue, maxPackedValue, pointLat, pointLon)));
  }

  while (cellQueue.size() > 0) {
    Cell cell = cellQueue.poll();
    //System.out.println(" visit " + cell);

    // TODO: if we replace approxBestDistance with actualBestDistance, we can put an opto here to break once this "best" cell is fully outside of the hitQueue bottom's radius:
    BKDReader reader = readers.get(cell.readerIndex);

    if (cell.index.isLeafNode()) {
      //System.out.println(" leaf");
      // Leaf block: visit all points and possibly collect them:
      visitor.curDocBase = docBases.get(cell.readerIndex);
      visitor.curLiveDocs = liveDocs.get(cell.readerIndex);
      reader.visitLeafBlockValues(cell.index, states.get(cell.readerIndex));
      //System.out.println(" now " + hitQueue.size() + " hits");
    } else {
      //System.out.println(" non-leaf");
      // Non-leaf block: split into two cells and put them back into the queue:
      double cellMinLat = decodeLatitude(cell.minPacked, 0);
      double cellMinLon = decodeLongitude(cell.minPacked, Integer.BYTES);
      double cellMaxLat = decodeLatitude(cell.maxPacked, 0);
      double cellMaxLon = decodeLongitude(cell.maxPacked, Integer.BYTES);

      if (cellMaxLat < visitor.minLat || visitor.maxLat < cellMinLat || ((cellMaxLon < visitor.minLon || visitor.maxLon < cellMinLon) && cellMaxLon < visitor.minLon2)) {
        // this cell is outside our search bbox; don't bother exploring any more
        continue;
      }

      BytesRef splitValue = BytesRef.deepCopyOf(cell.index.getSplitDimValue());
      int splitDim = cell.index.getSplitDim();

      // we must clone the index so that we can recurse left and right "concurrently":
      IndexTree newIndex = cell.index.clone();
      byte[] splitPackedValue = cell.maxPacked.clone();
      System.arraycopy(splitValue.bytes, splitValue.offset, splitPackedValue, splitDim * bytesPerDim, bytesPerDim);
      cell.index.pushLeft();
      cellQueue.offer(new Cell(cell.index, cell.readerIndex, cell.minPacked, splitPackedValue, approxBestDistance(cell.minPacked, splitPackedValue, pointLat, pointLon)));

      splitPackedValue = cell.minPacked.clone();
      System.arraycopy(splitValue.bytes, splitValue.offset, splitPackedValue, splitDim * bytesPerDim, bytesPerDim);
      newIndex.pushRight();
      cellQueue.offer(new Cell(newIndex, cell.readerIndex, splitPackedValue, cell.maxPacked, approxBestDistance(splitPackedValue, cell.maxPacked, pointLat, pointLon)));
    }
  }

  NearestHit[] hits = new NearestHit[hitQueue.size()];
  int downTo = hitQueue.size() - 1;
  while (hitQueue.size() != 0) {
    hits[downTo] = hitQueue.poll();
    downTo--;
  }
  return hits;
}
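The method relies on two queues with opposite roles: cellQueue orders BKD cells best-first by their approximate distance to the query point, while hitQueue keeps the n closest points seen so far with its comparator reversed, so the head is always the farthest collected hit. Below is a minimal sketch of such a bounded hit queue; Hit is a hypothetical stand-in for Lucene's NearestHit, and the docID tie-break of the original comparator is omitted.

import java.util.Comparator;
import java.util.PriorityQueue;

class Hit {
  final int docID;
  final double distanceMeters;

  Hit(int docID, double distanceMeters) {
    this.docID = docID;
    this.distanceMeters = distanceMeters;
  }
}

class BoundedHitQueue {
  private final int n;
  // Reversed distance order: the farthest kept hit sits at the head, ready to be evicted.
  private final PriorityQueue<Hit> queue;

  BoundedHitQueue(int n) {
    this.n = n;
    this.queue = new PriorityQueue<>(n, Comparator.comparingDouble((Hit h) -> h.distanceMeters).reversed());
  }

  // Offers a hit, keeping only the n closest seen so far.
  void collect(Hit hit) {
    if (queue.size() < n) {
      queue.offer(hit);
    } else if (hit.distanceMeters < queue.peek().distanceMeters) {
      queue.poll();     // drop the current farthest hit
      queue.offer(hit); // admit the closer one
    }
  }

  // Largest distance among the kept hits; a visitor could compare cell distances against this to prune the search.
  double worstDistance() {
    return queue.isEmpty() ? Double.POSITIVE_INFINITY : queue.peek().distanceMeters;
  }
}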
Use of java.util.PriorityQueue in project lucene-solr by apache.
The class WordBreakSpellChecker, method suggestWordCombinations().
/**
* <p>
* Generate suggestions by combining one or more of the passed-in terms into
* single words. The returned {@link CombineSuggestion} contains both a
* {@link SuggestWord} and also an array detailing which passed-in terms were
* involved in creating this combination. The scores returned are equal to the
* number of word combinations needed, which is also one less than the length
* of the array {@link CombineSuggestion#originalTermIndexes}. Generally, a
* suggestion with a lower score is preferred over a higher score.
* </p>
* <p>
* To prevent two adjacent terms from being combined (for instance, if one is
* mandatory and the other is prohibited), separate the two terms with
* {@link WordBreakSpellChecker#SEPARATOR_TERM}
* </p>
* <p>
* When suggestMode equals {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}, each
* suggestion will include at least one term not in the index.
* </p>
* <p>
* When suggestMode equals {@link SuggestMode#SUGGEST_MORE_POPULAR}, each
* suggestion will have the same or better frequency than the most-popular
* included term.
* </p>
*
* @return an array of words generated by combining original terms
* @throws IOException If there is a low-level I/O error.
*/
public CombineSuggestion[] suggestWordCombinations(Term[] terms, int maxSuggestions, IndexReader ir, SuggestMode suggestMode) throws IOException {
  if (maxSuggestions < 1) {
    return new CombineSuggestion[0];
  }
  int[] origFreqs = null;
  if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
    origFreqs = new int[terms.length];
    for (int i = 0; i < terms.length; i++) {
      origFreqs[i] = ir.docFreq(terms[i]);
    }
  }
  int queueInitialCapacity = maxSuggestions > 10 ? 10 : maxSuggestions;
  Comparator<CombineSuggestionWrapper> queueComparator = new CombinationsThenFreqComparator();
  Queue<CombineSuggestionWrapper> suggestions = new PriorityQueue<>(queueInitialCapacity, queueComparator);

  int thisTimeEvaluations = 0;
  for (int i = 0; i < terms.length - 1; i++) {
    if (terms[i].equals(SEPARATOR_TERM)) {
      continue;
    }
    String leftTermText = terms[i].text();
    int leftTermLength = leftTermText.codePointCount(0, leftTermText.length());
    if (leftTermLength > maxCombineWordLength) {
      continue;
    }
    int maxFreq = 0;
    int minFreq = Integer.MAX_VALUE;
    if (origFreqs != null) {
      maxFreq = origFreqs[i];
      minFreq = origFreqs[i];
    }
    String combinedTermText = leftTermText;
    int combinedLength = leftTermLength;
    for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
      if (terms[j].equals(SEPARATOR_TERM)) {
        break;
      }
      String rightTermText = terms[j].text();
      int rightTermLength = rightTermText.codePointCount(0, rightTermText.length());
      combinedTermText += rightTermText;
      combinedLength += rightTermLength;
      if (combinedLength > maxCombineWordLength) {
        break;
      }
      if (origFreqs != null) {
        maxFreq = Math.max(maxFreq, origFreqs[j]);
        minFreq = Math.min(minFreq, origFreqs[j]);
      }
      Term combinedTerm = new Term(terms[0].field(), combinedTermText);
      int combinedTermFreq = ir.docFreq(combinedTerm);
      if (suggestMode != SuggestMode.SUGGEST_MORE_POPULAR || combinedTermFreq >= maxFreq) {
        if (suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX || minFreq == 0) {
          if (combinedTermFreq >= minSuggestionFrequency) {
            int[] origIndexes = new int[j - i + 1];
            origIndexes[0] = i;
            for (int k = 1; k < origIndexes.length; k++) {
              origIndexes[k] = i + k;
            }
            SuggestWord word = new SuggestWord();
            word.freq = combinedTermFreq;
            word.score = origIndexes.length - 1;
            word.string = combinedTerm.text();
            CombineSuggestionWrapper suggestion = new CombineSuggestionWrapper(new CombineSuggestion(word, origIndexes), (origIndexes.length - 1));
            suggestions.offer(suggestion);
            if (suggestions.size() > maxSuggestions) {
              suggestions.poll();
            }
          }
        }
      }
      thisTimeEvaluations++;
      if (thisTimeEvaluations == maxEvaluations) {
        break;
      }
    }
  }
  CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions.size()];
  for (int i = suggestions.size() - 1; i >= 0; i--) {
    combineSuggestions[i] = suggestions.remove().combineSuggestion;
  }
  return combineSuggestions;
}
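The PriorityQueue usage above is the standard bounded top-N discipline: offer every candidate, poll whenever the size exceeds maxSuggestions so the head of the queue is dropped, and finally drain the queue back to front so the best suggestion ends up at index 0. A minimal sketch of that discipline follows; Scored is a hypothetical stand-in for CombineSuggestionWrapper, and the comparator simply treats a lower score as better, which is an assumption of the sketch rather than a restatement of CombinationsThenFreqComparator.

import java.util.Comparator;
import java.util.PriorityQueue;
import java.util.Queue;

class Scored {
  final String value;
  final int score; // lower is assumed to be better in this sketch

  Scored(String value, int score) {
    this.value = value;
    this.score = score;
  }
}

class TopNCollector {
  static Scored[] topN(Iterable<Scored> candidates, int maxN) {
    if (maxN < 1) {
      return new Scored[0];
    }
    // Reversed order puts the highest (worst) score at the head, so poll() evicts the weakest kept entry.
    Queue<Scored> queue = new PriorityQueue<>(Math.min(10, maxN), Comparator.comparingInt((Scored s) -> s.score).reversed());
    for (Scored candidate : candidates) {
      queue.offer(candidate);
      if (queue.size() > maxN) {
        queue.poll(); // over budget: drop the weakest kept entry
      }
    }
    // Drain weakest-first, filling the array back to front so index 0 holds the best candidate.
    Scored[] result = new Scored[queue.size()];
    for (int i = queue.size() - 1; i >= 0; i--) {
      result[i] = queue.remove();
    }
    return result;
  }
}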
Use of java.util.PriorityQueue in project lucene-solr by apache.
The class DirectSpellChecker, method suggestSimilar().
/**
* Provide spelling corrections based on several parameters.
*
* @param term The term to suggest spelling corrections for
* @param numSug The maximum number of spelling corrections
* @param ir The index reader to fetch the candidate spelling corrections from
* @param docfreq The minimum document frequency a potential suggestion needs to have in order to be included
* @param editDistance The maximum edit distance candidates are allowed to have
* @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included
* @param spare a chars scratch
* @return a collection of spelling corrections sorted by <code>ScoreTerm</code>'s natural order.
* @throws IOException If I/O related errors occur
*/
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRefBuilder spare) throws IOException {
  AttributeSource atts = new AttributeSource();
  MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
  Terms terms = MultiFields.getTerms(ir, term.field());
  if (terms == null) {
    return Collections.emptyList();
  }
  FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance - 1), true);
  final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();

  BytesRef queryTerm = new BytesRef(term.text());
  BytesRef candidateTerm;
  ScoreTerm st = new ScoreTerm();
  BoostAttribute boostAtt = e.attributes().addAttribute(BoostAttribute.class);
  while ((candidateTerm = e.next()) != null) {
    // For FuzzyQuery, boost is the score:
    float score = boostAtt.getBoost();
    // ignore uncompetitive hits
    if (stQueue.size() >= numSug && score <= stQueue.peek().boost) {
      continue;
    }
    // ignore exact match of the same term
    if (queryTerm.bytesEquals(candidateTerm)) {
      continue;
    }
    int df = e.docFreq();
    // check docFreq if required
    if (df <= docfreq) {
      continue;
    }
    final String termAsString;
    if (distance == INTERNAL_LEVENSHTEIN) {
      // delay creating strings until the end
      termAsString = null;
    } else {
      spare.copyUTF8Bytes(candidateTerm);
      termAsString = spare.toString();
      score = distance.getDistance(term.text(), termAsString);
    }
    if (score < accuracy) {
      continue;
    }
    // add new entry in PQ
    st.term = BytesRef.deepCopyOf(candidateTerm);
    st.boost = score;
    st.docfreq = df;
    st.termAsString = termAsString;
    st.score = score;
    stQueue.offer(st);
    // possibly drop entries from queue
    st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
    maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
  }
  return stQueue;
}
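Two details of this loop carry the PriorityQueue pattern: candidates that cannot beat the current weakest entry are skipped before any work is done, and the single ScoreTerm scratch object is recycled, because whenever the queue overflows the polled (evicted) instance becomes the scratch object for the next candidate. Below is a minimal sketch of that recycle-on-eviction pattern; ScoredCandidate and the plain input arrays are hypothetical stand-ins for ScoreTerm and FuzzyTermsEnum.

import java.util.PriorityQueue;

class ScoredCandidate implements Comparable<ScoredCandidate> {
  String term;
  float score;

  @Override
  public int compareTo(ScoredCandidate other) {
    return Float.compare(this.score, other.score); // natural order: lowest score at the head
  }
}

class CandidateCollector {
  // Collects the numSug highest-scoring candidates while recycling evicted objects.
  static PriorityQueue<ScoredCandidate> collect(String[] terms, float[] scores, int numSug) {
    PriorityQueue<ScoredCandidate> queue = new PriorityQueue<>();
    ScoredCandidate scratch = new ScoredCandidate();
    for (int i = 0; i < terms.length; i++) {
      // Once the queue is full, skip candidates that cannot beat the current weakest entry.
      if (queue.size() >= numSug && scores[i] <= queue.peek().score) {
        continue;
      }
      scratch.term = terms[i];
      scratch.score = scores[i];
      queue.offer(scratch);
      // If over budget, the evicted (lowest-scoring) entry becomes the next scratch object;
      // otherwise allocate a fresh one, because the previous scratch now lives in the queue.
      scratch = (queue.size() > numSug) ? queue.poll() : new ScoredCandidate();
    }
    return queue;
  }
}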
Use of java.util.PriorityQueue in project lucene-solr by apache.
The class FieldHighlighter, method highlightOffsetsEnums().
// algorithm: treat sentence snippets as miniature documents
// we can intersect these with the postings lists via BreakIterator.preceding(offset)
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
protected Passage[] highlightOffsetsEnums(List<OffsetsEnum> offsetsEnums) throws IOException {
  PassageScorer scorer = passageScorer;
  BreakIterator breakIterator = this.breakIterator;
  final int contentLength = breakIterator.getText().getEndIndex();

  PriorityQueue<OffsetsEnum> offsetsEnumQueue = new PriorityQueue<>(offsetsEnums.size() + 1);
  for (OffsetsEnum off : offsetsEnums) {
    off.setWeight(scorer.weight(contentLength, off.freq()));
    // go to first position
    off.nextPosition();
    offsetsEnumQueue.add(off);
  }
  // a sentinel for termination
  offsetsEnumQueue.add(new OffsetsEnum(null, EMPTY));

  PriorityQueue<Passage> passageQueue = new PriorityQueue<>(Math.min(64, maxPassages + 1), (left, right) -> {
    if (left.getScore() < right.getScore()) {
      return -1;
    } else if (left.getScore() > right.getScore()) {
      return 1;
    } else {
      return left.getStartOffset() - right.getStartOffset();
    }
  });
  // the current passage in-progress. Will either get reset or added to queue.
  Passage passage = new Passage();

  OffsetsEnum off;
  while ((off = offsetsEnumQueue.poll()) != null) {
    int start = off.startOffset();
    if (start == -1) {
      throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
    }
    int end = off.endOffset();
    // this hit spans the content limit, so pretend we never saw this term: it won't cause a passage to be added to passageQueue or anything.
    assert EMPTY.startOffset() == Integer.MAX_VALUE;
    if (start < contentLength && end > contentLength) {
      continue;
    }
    // See if this term should be part of a new passage.
    if (start >= passage.getEndOffset()) {
      if (passage.getStartOffset() >= 0) {
        // true if this passage has terms; otherwise couldn't find any (yet)
        // finalize passage
        passage.setScore(passage.getScore() * scorer.norm(passage.getStartOffset()));
        // new sentence: first add 'passage' to queue
        if (passageQueue.size() == maxPassages && passage.getScore() < passageQueue.peek().getScore()) {
          // can't compete, just reset it
          passage.reset();
        } else {
          passageQueue.offer(passage);
          if (passageQueue.size() > maxPassages) {
            passage = passageQueue.poll();
            passage.reset();
          } else {
            passage = new Passage();
          }
        }
      }
      // if we exceed limit, we are done
      if (start >= contentLength) {
        break;
      }
      // advance breakIterator
      passage.setStartOffset(Math.max(breakIterator.preceding(start + 1), 0));
      passage.setEndOffset(Math.min(breakIterator.following(start), contentLength));
    }
    // Add this term to the passage.
    int tf = 0;
    while (true) {
      tf++;
      // a reference; safe to refer to
      BytesRef term = off.getTerm();
      assert term != null;
      passage.addMatch(start, end, term);
      // see if there are multiple occurrences of this term in this passage. If so, add them.
      if (!off.hasMorePositions()) {
        // No more in the entire text. Already removed from pq; move on
        break;
      }
      off.nextPosition();
      start = off.startOffset();
      end = off.endOffset();
      if (start >= passage.getEndOffset() || end > contentLength) {
        // it's beyond this passage
        offsetsEnumQueue.offer(off);
        break;
      }
    }
    passage.setScore(passage.getScore() + off.getWeight() * scorer.tf(tf, passage.getEndOffset() - passage.getStartOffset()));
  }

  Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
  for (Passage p : passages) {
    p.sort();
  }
  // sort in ascending order
  Arrays.sort(passages, (left, right) -> left.getStartOffset() - right.getStartOffset());
  return passages;
}
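The offsetsEnumQueue is effectively a k-way merge: each OffsetsEnum is polled, consumed, and re-offered once it moves beyond the current passage, and the EMPTY sentinel (whose start offset is Integer.MAX_VALUE, as the assert shows) guarantees one final pass through the loop so the last in-progress passage is finalized before the start >= contentLength break. Below is a minimal sketch of such a sentinel-terminated merge; the hypothetical Cursor objects stand in for OffsetsEnum, and flushing a run of consecutive positions stands in for finalizing a passage.

import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

class Cursor {
  final int[] positions; // ascending positions of one term in the text
  int idx = 0;

  Cursor(int... positions) {
    this.positions = positions;
  }

  int current() {
    return idx < positions.length ? positions[idx] : Integer.MAX_VALUE;
  }

  boolean advance() {
    idx++;
    return idx < positions.length;
  }
}

class SentinelMerge {
  // Prints each maximal run of consecutive positions across all cursors.
  static void printRuns(List<Cursor> cursors) {
    PriorityQueue<Cursor> queue =
        new PriorityQueue<>(cursors.size() + 1, Comparator.comparingInt(Cursor::current));
    queue.addAll(cursors);
    // The sentinel has no positions, so current() == Integer.MAX_VALUE; it is polled last and
    // forces one extra iteration in which the final run is flushed, mirroring the EMPTY OffsetsEnum.
    queue.add(new Cursor());

    int runStart = -1;
    int runEnd = -2;
    Cursor top;
    while ((top = queue.poll()) != null) {
      int position = top.current();
      if (position > runEnd + 1) {
        if (runStart >= 0) {
          System.out.println("run [" + runStart + ".." + runEnd + "]"); // flush the finished run
        }
        if (position == Integer.MAX_VALUE) {
          break; // reached the sentinel: every real position has been consumed
        }
        runStart = position;
      }
      runEnd = position;
      if (top.advance()) {
        queue.offer(top); // re-insert so the heap re-orders this cursor by its next position
      }
    }
  }
}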
Use of java.util.PriorityQueue in project intellij-community by JetBrains.
The class LinearBekGraphBuilder, method getFragment().
@Nullable
private MergeFragment getFragment(int leftChild, int rightChild, int parent) {
  MergeFragment fragment = new MergeFragment(parent, leftChild, rightChild);

  int leftLi = myGraphLayout.getLayoutIndex(leftChild);
  int rightLi = myGraphLayout.getLayoutIndex(rightChild);

  int rowsCount = 1;
  int blockSize = 1;

  PriorityQueue<GraphEdge> queue = new PriorityQueue<>(MAX_BLOCK_SIZE, new GraphEdgeComparator());
  queue.addAll(myLinearBekGraph.getAdjacentEdges(rightChild, EdgeFilter.NORMAL_DOWN));

  @Nullable Set<Integer> magicSet = null;
  while (!queue.isEmpty()) {
    GraphEdge nextEdge = queue.poll();
    Integer next = nextEdge.getDownNodeIndex();
    Integer upNodeIndex = nextEdge.getUpNodeIndex();
    // can not happen
    assert upNodeIndex != null;

    if (next == null) {
      fragment.addTail(upNodeIndex);
      // allow very long edges down
      continue;
    }

    if (next == leftChild) {
      // found first child
      fragment.addTail(upNodeIndex);
      fragment.setMergeWithOldCommit(true);
    } else if (next == rightChild + rowsCount) {
      // all is fine, continuing
      rowsCount++;
      blockSize++;
      queue.addAll(myLinearBekGraph.getAdjacentEdges(next, EdgeFilter.NORMAL_DOWN));
      fragment.addBody(upNodeIndex);
    } else if (next > rightChild + rowsCount && next < leftChild) {
      rowsCount = next - rightChild + 1;
      blockSize++;
      queue.addAll(myLinearBekGraph.getAdjacentEdges(next, EdgeFilter.NORMAL_DOWN));
      fragment.addBody(upNodeIndex);
    } else if (next > leftChild) {
      int li = myGraphLayout.getLayoutIndex(next);
      if (leftLi > rightLi && !fragment.isMergeWithOldCommit()) {
        if (next > leftChild + MAGIC_SET_SIZE) {
          return null;
        }
        if (magicSet == null) {
          magicSet = calculateMagicSet(leftChild);
        }
        if (magicSet.contains(next)) {
          fragment.addTailEdge(upNodeIndex, next);
        } else {
          return null;
        }
      } else {
        if ((li > leftLi && li < rightLi) || (li == leftLi)) {
          fragment.addTailEdge(upNodeIndex, next);
        } else {
          if (li >= rightLi) {
            return null;
          } else {
            if (next > leftChild + MAGIC_SET_SIZE) {
              if (!fragment.hasTailEdge(upNodeIndex) && !fragment.isBody(upNodeIndex))
                return null;
            } else {
              if (magicSet == null) {
                magicSet = calculateMagicSet(leftChild);
              }
              if (magicSet.contains(next)) {
                fragment.addTailEdge(upNodeIndex, next);
              } else {
                return null;
              }
            }
          }
        }
      }
    }

    if (blockSize >= MAX_BLOCK_SIZE) {
      return null;
    }
  }

  if (fragment.getTails().isEmpty()) {
    // this can happen if we ran into initial import
    return null;
  }
  return fragment;
}
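Here the PriorityQueue drives a best-first walk over the graph's downward edges: edges are polled in the order imposed by GraphEdgeComparator (not shown), and every node accepted into the fragment feeds its own downward edges back into the queue. A minimal sketch of that expansion pattern follows; Edge and the adjacency map are hypothetical stand-ins for GraphEdge and LinearBekGraph.getAdjacentEdges, and the comparator here simply prefers the lowest-numbered target node.

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.Set;

class Edge {
  final int up;
  final int down;

  Edge(int up, int down) {
    this.up = up;
    this.down = down;
  }
}

class DownwardWalk {
  // Visits nodes reachable from start, always expanding the edge with the smallest target first.
  static List<Integer> visitInOrder(Map<Integer, List<Edge>> downEdges, int start) {
    List<Integer> visited = new ArrayList<>();
    Set<Integer> seen = new HashSet<>();
    PriorityQueue<Edge> queue = new PriorityQueue<>(Comparator.comparingInt((Edge e) -> e.down));
    queue.addAll(downEdges.getOrDefault(start, Collections.emptyList()));

    while (!queue.isEmpty()) {
      Edge next = queue.poll();          // the queued edge with the smallest target node
      if (!seen.add(next.down)) {
        continue;                        // this node was already expanded via another edge
      }
      visited.add(next.down);
      queue.addAll(downEdges.getOrDefault(next.down, Collections.emptyList())); // expand its downward edges
    }
    return visited;
  }
}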