Search in sources :

Example 21 with PriorityQueue

use of java.util.PriorityQueue in project lucene-solr by apache.

the class NearestNeighbor method nearest.

// TODO: can we somehow share more with, or simply directly use, the LatLonPointDistanceComparator?  It's really doing the same thing as
// our hitQueue...
/**
 * Collects the hits closest to a query point by walking the BKD trees of all readers.
 * Cells are pulled from a priority queue (ordered by {@code Cell}'s natural ordering, which
 * per the comments below means closest-first); leaf cells are handed to a
 * {@code NearestVisitor}, inner cells are split at their split value and both halves are
 * queued again.
 *
 * @param pointLat latitude of the query point
 * @param pointLon longitude of the query point
 * @param readers the BKD readers to search; all must report the same bytes per dimension
 * @param liveDocs live docs per reader, indexed in parallel with {@code readers}
 * @param docBases doc base per reader, indexed in parallel with {@code readers}
 * @param n initial capacity of the hit queue, also passed to the visitor (presumably as the
 *          maximum number of hits to keep -- confirm in NearestVisitor); must be at least 1
 *          because {@link java.util.PriorityQueue} rejects a smaller initial capacity
 * @return the collected hits ordered by increasing distance; hits at equal distance are
 *         ordered by increasing docID
 * @throws IllegalStateException if the readers disagree on bytes per dimension
 * @throws IOException declared for the reader calls made while visiting the tree
 */
public static NearestHit[] nearest(double pointLat, double pointLon, List<BKDReader> readers, List<Bits> liveDocs, List<Integer> docBases, final int n) throws IOException {
    // Holds closest collected points seen so far.  The ordering is reversed on purpose so that the
    // head of the queue is the farthest hit:
    // TODO: if we used lucene's PQ we could just updateTop instead of poll/offer:
    final PriorityQueue<NearestHit> hitQueue = new PriorityQueue<>(n, new Comparator<NearestHit>() {

        @Override
        public int compare(NearestHit a, NearestHit b) {
            // sort by opposite distanceMeters natural order
            int cmp = Double.compare(a.distanceMeters, b.distanceMeters);
            if (cmp != 0) {
                return -cmp;
            }
            // tie-break by higher docID; Integer.compare avoids the overflow risk of subtracting:
            return Integer.compare(b.docID, a.docID);
        }
    });
    // Holds all cells, sorted by closest to the point:
    PriorityQueue<Cell> cellQueue = new PriorityQueue<>();
    NearestVisitor visitor = new NearestVisitor(hitQueue, n, pointLat, pointLon);
    List<BKDReader.IntersectState> states = new ArrayList<>();
    // Add root cell for each reader into the queue:
    int bytesPerDim = -1;
    for (int i = 0; i < readers.size(); i++) {
        BKDReader reader = readers.get(i);
        if (bytesPerDim == -1) {
            bytesPerDim = reader.getBytesPerDimension();
        } else if (bytesPerDim != reader.getBytesPerDimension()) {
            throw new IllegalStateException("bytesPerDim changed from " + bytesPerDim + " to " + reader.getBytesPerDimension() + " across readers");
        }
        // Fetch the reader's bounds once and use them both for the root cell and its distance estimate:
        byte[] minPackedValue = reader.getMinPackedValue();
        byte[] maxPackedValue = reader.getMaxPackedValue();
        IntersectState state = reader.getIntersectState(visitor);
        states.add(state);
        cellQueue.offer(new Cell(state.index, i, minPackedValue, maxPackedValue, approxBestDistance(minPackedValue, maxPackedValue, pointLat, pointLon)));
    }
    while (cellQueue.size() > 0) {
        Cell cell = cellQueue.poll();
        // TODO: if we replace approxBestDistance with actualBestDistance, we can put an opto here to break once this "best" cell is fully outside of the hitQueue bottom's radius:
        BKDReader reader = readers.get(cell.readerIndex);
        if (cell.index.isLeafNode()) {
            // Leaf block: visit all points and possibly collect them.  The visitor is shared across
            // readers, so point it at this cell's reader first:
            visitor.curDocBase = docBases.get(cell.readerIndex);
            visitor.curLiveDocs = liveDocs.get(cell.readerIndex);
            reader.visitLeafBlockValues(cell.index, states.get(cell.readerIndex));
        } else {
            // Non-leaf block: split into two cells and put them back into the queue:
            double cellMinLat = decodeLatitude(cell.minPacked, 0);
            double cellMinLon = decodeLongitude(cell.minPacked, Integer.BYTES);
            double cellMaxLat = decodeLatitude(cell.maxPacked, 0);
            double cellMaxLon = decodeLongitude(cell.maxPacked, Integer.BYTES);
            // NOTE(review): minLon2 looks like a second longitude bound kept by the visitor (e.g. for a
            // box crossing the dateline) -- confirm in NearestVisitor.
            if (cellMaxLat < visitor.minLat || visitor.maxLat < cellMinLat || ((cellMaxLon < visitor.minLon || visitor.maxLon < cellMinLon) && cellMaxLon < visitor.minLon2)) {
                // this cell is outside our search bbox; don't bother exploring any more
                continue;
            }
            BytesRef splitValue = BytesRef.deepCopyOf(cell.index.getSplitDimValue());
            int splitDim = cell.index.getSplitDim();
            // we must clone the index so that we can recurse left and right "concurrently":
            IndexTree newIndex = cell.index.clone();
            // Left half: same minimum, maximum replaced by the split value in the split dimension:
            byte[] splitPackedValue = cell.maxPacked.clone();
            System.arraycopy(splitValue.bytes, splitValue.offset, splitPackedValue, splitDim * bytesPerDim, bytesPerDim);
            cell.index.pushLeft();
            cellQueue.offer(new Cell(cell.index, cell.readerIndex, cell.minPacked, splitPackedValue, approxBestDistance(cell.minPacked, splitPackedValue, pointLat, pointLon)));
            // Right half: same maximum, minimum replaced by the split value in the split dimension:
            splitPackedValue = cell.minPacked.clone();
            System.arraycopy(splitValue.bytes, splitValue.offset, splitPackedValue, splitDim * bytesPerDim, bytesPerDim);
            newIndex.pushRight();
            cellQueue.offer(new Cell(newIndex, cell.readerIndex, splitPackedValue, cell.maxPacked, approxBestDistance(splitPackedValue, cell.maxPacked, pointLat, pointLon)));
        }
    }
    // The queue yields the farthest hit first, so fill the array from the back to end up nearest-first:
    NearestHit[] hits = new NearestHit[hitQueue.size()];
    for (int downTo = hits.length - 1; downTo >= 0; downTo--) {
        hits[downTo] = hitQueue.poll();
    }
    return hits;
}
Also used : IndexTree(org.apache.lucene.util.bkd.BKDReader.IndexTree) IntersectState(org.apache.lucene.util.bkd.BKDReader.IntersectState) ArrayList(java.util.ArrayList) PriorityQueue(java.util.PriorityQueue) BKDReader(org.apache.lucene.util.bkd.BKDReader) BytesRef(org.apache.lucene.util.BytesRef)

Example 22 with PriorityQueue

use of java.util.PriorityQueue in project lucene-solr by apache.

the class WordBreakSpellChecker method suggestWordCombinations.

/**
 * <p>
 * Generate suggestions by combining one or more of the passed-in terms into
 * single words. The returned {@link CombineSuggestion} contains both a
 * {@link SuggestWord} and also an array detailing which passed-in terms were
 * involved in creating this combination. The scores returned are equal to the
 * number of word combinations needed, also one less than the length of the
 * array {@link CombineSuggestion#originalTermIndexes}. Generally, a
 * suggestion with a lower score is preferred over a higher score.
 * </p>
 * <p>
 * To prevent two adjacent terms from being combined (for instance, if one is
 * mandatory and the other is prohibited), separate the two terms with
 * {@link WordBreakSpellChecker#SEPARATOR_TERM}
 * </p>
 * <p>
 * When suggestMode equals {@link SuggestMode#SUGGEST_WHEN_NOT_IN_INDEX}, each
 * suggestion will include at least one term not in the index.
 * </p>
 * <p>
 * When suggestMode equals {@link SuggestMode#SUGGEST_MORE_POPULAR}, each
 * suggestion will have the same, or better frequency than the most-popular
 * included term.
 * </p>
 * <p>
 * Evaluation stops for the whole call once {@code maxEvaluations} combinations
 * have been looked up in the index.
 * </p>
 *
 * @param terms the terms to try to combine, in order; combined terms are always
 *          created in the field of {@code terms[0]}
 * @param maxSuggestions the maximum number of suggestions to return; a value
 *          below 1 yields an empty array
 * @param ir the reader used to look up document frequencies
 * @param suggestMode controls which combinations qualify (see above); original
 *          term frequencies are only fetched when it is not
 *          {@link SuggestMode#SUGGEST_ALWAYS}
 * @return an array of words generated by combining original terms
 * @throws IOException If there is a low-level I/O error.
 */
public CombineSuggestion[] suggestWordCombinations(Term[] terms, int maxSuggestions, IndexReader ir, SuggestMode suggestMode) throws IOException {
    if (maxSuggestions < 1) {
        return new CombineSuggestion[0];
    }
    int[] origFreqs = null;
    if (suggestMode != SuggestMode.SUGGEST_ALWAYS) {
        origFreqs = new int[terms.length];
        for (int i = 0; i < terms.length; i++) {
            origFreqs[i] = ir.docFreq(terms[i]);
        }
    }
    int queueInitialCapacity = Math.min(maxSuggestions, 10);
    Comparator<CombineSuggestionWrapper> queueComparator = new CombinationsThenFreqComparator();
    Queue<CombineSuggestionWrapper> suggestions = new PriorityQueue<>(queueInitialCapacity, queueComparator);
    int thisTimeEvaluations = 0;
    evaluations:
    for (int i = 0; i < terms.length - 1; i++) {
        if (terms[i].equals(SEPARATOR_TERM)) {
            continue;
        }
        String leftTermText = terms[i].text();
        // Lengths are measured in code points, not chars:
        int leftTermLength = leftTermText.codePointCount(0, leftTermText.length());
        if (leftTermLength > maxCombineWordLength) {
            continue;
        }
        int maxFreq = 0;
        int minFreq = Integer.MAX_VALUE;
        if (origFreqs != null) {
            maxFreq = origFreqs[i];
            minFreq = origFreqs[i];
        }
        String combinedTermText = leftTermText;
        int combinedLength = leftTermLength;
        // Keep appending the following terms, up to maxChanges of them:
        for (int j = i + 1; j < terms.length && j - i <= maxChanges; j++) {
            if (terms[j].equals(SEPARATOR_TERM)) {
                break;
            }
            String rightTermText = terms[j].text();
            int rightTermLength = rightTermText.codePointCount(0, rightTermText.length());
            combinedTermText += rightTermText;
            combinedLength += rightTermLength;
            if (combinedLength > maxCombineWordLength) {
                break;
            }
            if (origFreqs != null) {
                maxFreq = Math.max(maxFreq, origFreqs[j]);
                minFreq = Math.min(minFreq, origFreqs[j]);
            }
            Term combinedTerm = new Term(terms[0].field(), combinedTermText);
            int combinedTermFreq = ir.docFreq(combinedTerm);
            boolean popularEnough = suggestMode != SuggestMode.SUGGEST_MORE_POPULAR || combinedTermFreq >= maxFreq;
            boolean hasMissingTerm = suggestMode != SuggestMode.SUGGEST_WHEN_NOT_IN_INDEX || minFreq == 0;
            if (popularEnough && hasMissingTerm && combinedTermFreq >= minSuggestionFrequency) {
                // Record the indexes i..j of the original terms that were joined:
                int[] origIndexes = new int[j - i + 1];
                for (int k = 0; k < origIndexes.length; k++) {
                    origIndexes[k] = i + k;
                }
                SuggestWord word = new SuggestWord();
                word.freq = combinedTermFreq;
                word.score = origIndexes.length - 1;
                word.string = combinedTerm.text();
                CombineSuggestionWrapper suggestion = new CombineSuggestionWrapper(new CombineSuggestion(word, origIndexes), (origIndexes.length - 1));
                suggestions.offer(suggestion);
                // Keep only the best maxSuggestions entries by evicting the queue's head:
                if (suggestions.size() > maxSuggestions) {
                    suggestions.poll();
                }
            }
            thisTimeEvaluations++;
            if (thisTimeEvaluations == maxEvaluations) {
                // The counter spans the whole call, so leave both loops.  Breaking only the inner
                // loop would let the outer loop continue and the counter run past the cap.
                break evaluations;
            }
        }
    }
    // The queue hands out its head first; fill the array from the back so the head ends up last:
    CombineSuggestion[] combineSuggestions = new CombineSuggestion[suggestions.size()];
    for (int i = suggestions.size() - 1; i >= 0; i--) {
        combineSuggestions[i] = suggestions.remove().combineSuggestion;
    }
    return combineSuggestions;
}
Also used : Term(org.apache.lucene.index.Term) PriorityQueue(java.util.PriorityQueue)

Example 23 with PriorityQueue

use of java.util.PriorityQueue in project lucene-solr by apache.

the class DirectSpellChecker method suggestSimilar.

/**
 * Provide spelling corrections based on several parameters.
 *
 * @param term The term to suggest spelling corrections for
 * @param numSug The maximum number of spelling corrections; if it is zero or negative an empty collection is returned
 * @param ir The index reader to fetch the candidate spelling corrections from
 * @param docfreq The document frequency threshold: a candidate is only included if its document frequency is strictly greater than this value
 * @param editDistance The maximum edit distance candidates are allowed to have
 * @param accuracy The minimum accuracy a suggested spelling correction needs to have in order to be included
 * @param spare a chars scratch
 * @return a collection of spelling corrections; when candidates were enumerated this is a
 *         {@link java.util.PriorityQueue} ordered by <code>ScoreTerm</code>'s natural order
 *         (note that iterating a priority queue does not visit elements in sorted order)
 * @throws IOException If I/O related errors occur
 */
protected Collection<ScoreTerm> suggestSimilar(Term term, int numSug, IndexReader ir, int docfreq, int editDistance, float accuracy, final CharsRefBuilder spare) throws IOException {
    if (numSug <= 0) {
        // Nothing can be returned.  Without this guard the competitiveness check below would
        // call peek() on an empty queue and fail with a NullPointerException.
        return Collections.emptyList();
    }
    AttributeSource atts = new AttributeSource();
    MaxNonCompetitiveBoostAttribute maxBoostAtt = atts.addAttribute(MaxNonCompetitiveBoostAttribute.class);
    Terms terms = MultiFields.getTerms(ir, term.field());
    if (terms == null) {
        return Collections.emptyList();
    }
    FuzzyTermsEnum e = new FuzzyTermsEnum(terms, atts, term, editDistance, Math.max(minPrefix, editDistance - 1), true);
    final PriorityQueue<ScoreTerm> stQueue = new PriorityQueue<>();
    BytesRef queryTerm = new BytesRef(term.text());
    BytesRef candidateTerm;
    // Scratch entry that is filled in for each accepted candidate; evicted entries are recycled:
    ScoreTerm st = new ScoreTerm();
    BoostAttribute boostAtt = e.attributes().addAttribute(BoostAttribute.class);
    while ((candidateTerm = e.next()) != null) {
        // For FuzzyQuery, boost is the score:
        float score = boostAtt.getBoost();
        // ignore uncompetitive hits
        if (stQueue.size() >= numSug && score <= stQueue.peek().boost) {
            continue;
        }
        // ignore exact match of the same term
        if (queryTerm.bytesEquals(candidateTerm)) {
            continue;
        }
        int df = e.docFreq();
        // check docFreq if required
        if (df <= docfreq) {
            continue;
        }
        final String termAsString;
        if (distance == INTERNAL_LEVENSHTEIN) {
            // delay creating strings until the end
            termAsString = null;
        } else {
            // A custom distance is configured: re-score the candidate with it
            spare.copyUTF8Bytes(candidateTerm);
            termAsString = spare.toString();
            score = distance.getDistance(term.text(), termAsString);
        }
        if (score < accuracy) {
            continue;
        }
        // add new entry in PQ
        st.term = BytesRef.deepCopyOf(candidateTerm);
        st.boost = score;
        st.docfreq = df;
        st.termAsString = termAsString;
        st.score = score;
        stQueue.offer(st);
        // possibly drop entries from queue; the dropped entry becomes the next scratch object
        st = (stQueue.size() > numSug) ? stQueue.poll() : new ScoreTerm();
        // Tell the enum which boost a candidate must now exceed to be competitive:
        maxBoostAtt.setMaxNonCompetitiveBoost((stQueue.size() >= numSug) ? stQueue.peek().boost : Float.NEGATIVE_INFINITY);
    }
    return stQueue;
}
Also used : AttributeSource(org.apache.lucene.util.AttributeSource) FuzzyTermsEnum(org.apache.lucene.search.FuzzyTermsEnum) Terms(org.apache.lucene.index.Terms) BoostAttribute(org.apache.lucene.search.BoostAttribute) MaxNonCompetitiveBoostAttribute(org.apache.lucene.search.MaxNonCompetitiveBoostAttribute) PriorityQueue(java.util.PriorityQueue) BytesRef(org.apache.lucene.util.BytesRef)

Example 24 with PriorityQueue

use of java.util.PriorityQueue in project lucene-solr by apache.

the class FieldHighlighter method highlightOffsetsEnums.

// algorithm: treat sentence snippets as miniature documents
// we can intersect these with the postings lists via BreakIterator.preceding(offset),
// score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
/**
 * Builds the best-scoring passages from the given term offsets.  The offsets enums are
 * merged through a priority queue; each polled hit either extends the passage in progress
 * or finishes it and starts a new one at the boundaries reported by the break iterator.
 *
 * @param offsetsEnums the per-term offsets to highlight; each is advanced to its first
 *          position and assigned a weight here
 * @return at most {@code maxPassages} passages, sorted by ascending start offset
 * @throws IllegalArgumentException if a hit reports a start offset of -1, i.e. the field
 *           was indexed without offsets
 * @throws IOException declared for the offsets enum calls made while iterating positions
 */
protected Passage[] highlightOffsetsEnums(List<OffsetsEnum> offsetsEnums) throws IOException {
    PassageScorer scorer = passageScorer;
    BreakIterator breakIterator = this.breakIterator;
    final int contentLength = breakIterator.getText().getEndIndex();
    PriorityQueue<OffsetsEnum> offsetsEnumQueue = new PriorityQueue<>(offsetsEnums.size() + 1);
    for (OffsetsEnum off : offsetsEnums) {
        off.setWeight(scorer.weight(contentLength, off.freq()));
        // go to first position
        off.nextPosition();
        offsetsEnumQueue.add(off);
    }
    // a sentinel for termination
    offsetsEnumQueue.add(new OffsetsEnum(null, EMPTY));
    // Lowest score first (ties: lowest start offset first), so the head is the passage to evict:
    PriorityQueue<Passage> passageQueue = new PriorityQueue<>(Math.min(64, maxPassages + 1), (left, right) -> {
        if (left.getScore() < right.getScore()) {
            return -1;
        } else if (left.getScore() > right.getScore()) {
            return 1;
        } else {
            // Integer.compare instead of subtraction, which can overflow
            return Integer.compare(left.getStartOffset(), right.getStartOffset());
        }
    });
    // the current passage in-progress.  Will either get reset or added to queue.
    Passage passage = new Passage();
    OffsetsEnum off;
    while ((off = offsetsEnumQueue.poll()) != null) {
        int start = off.startOffset();
        if (start == -1) {
            throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
        }
        int end = off.endOffset();
        // The sentinel must sort after every real hit:
        assert EMPTY.startOffset() == Integer.MAX_VALUE;
        // This hit starts inside the content but ends beyond it.  Skip it as if we never
        // saw this term, it won't cause a passage to be added to passageQueue or anything.
        if (start < contentLength && end > contentLength) {
            continue;
        }
        // See if this term should be part of a new passage.
        if (start >= passage.getEndOffset()) {
            if (passage.getStartOffset() >= 0) {
                // true if this passage has terms; otherwise couldn't find any (yet)
                // finalize passage
                passage.setScore(passage.getScore() * scorer.norm(passage.getStartOffset()));
                // new sentence: first add 'passage' to queue
                if (passageQueue.size() == maxPassages && passage.getScore() < passageQueue.peek().getScore()) {
                    // can't compete, just reset it
                    passage.reset();
                } else {
                    passageQueue.offer(passage);
                    if (passageQueue.size() > maxPassages) {
                        // recycle the evicted passage as the next in-progress one
                        passage = passageQueue.poll();
                        passage.reset();
                    } else {
                        passage = new Passage();
                    }
                }
            }
            // if we exceed limit, we are done
            if (start >= contentLength) {
                break;
            }
            // advance breakIterator
            passage.setStartOffset(Math.max(breakIterator.preceding(start + 1), 0));
            passage.setEndOffset(Math.min(breakIterator.following(start), contentLength));
        }
        // Add this term to the passage.
        int tf = 0;
        while (true) {
            tf++;
            // a reference; safe to refer to
            BytesRef term = off.getTerm();
            assert term != null;
            passage.addMatch(start, end, term);
            // see if there are multiple occurrences of this term in this passage. If so, add them.
            if (!off.hasMorePositions()) {
                // No more in the entire text. Already removed from pq; move on
                break;
            }
            off.nextPosition();
            start = off.startOffset();
            end = off.endOffset();
            if (start >= passage.getEndOffset() || end > contentLength) {
                // it's beyond this passage
                offsetsEnumQueue.offer(off);
                break;
            }
        }
        passage.setScore(passage.getScore() + off.getWeight() * scorer.tf(tf, passage.getEndOffset() - passage.getStartOffset()));
    }
    Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
    for (Passage p : passages) {
        p.sort();
    }
    // sort in ascending order
    Arrays.sort(passages, (left, right) -> Integer.compare(left.getStartOffset(), right.getStartOffset()));
    return passages;
}
Also used : PriorityQueue(java.util.PriorityQueue) BytesRef(org.apache.lucene.util.BytesRef) BreakIterator(java.text.BreakIterator)

Example 25 with PriorityQueue

use of java.util.PriorityQueue in project intellij-community by JetBrains.

the class LinearBekGraphBuilder method getFragment.

/**
 * Tries to build a merge fragment for the given parent and its two children by following
 * the down edges that start at {@code rightChild}.
 *
 * @param leftChild node index of the left child
 * @param rightChild node index of the right child
 * @param parent node index of the parent
 * @return the fragment, or {@code null} when an edge leads somewhere that is not accepted,
 *         when the block reaches {@code MAX_BLOCK_SIZE}, or when no tails were collected
 */
@Nullable
private MergeFragment getFragment(int leftChild, int rightChild, int parent) {
    MergeFragment fragment = new MergeFragment(parent, leftChild, rightChild);
    int leftLi = myGraphLayout.getLayoutIndex(leftChild);
    int rightLi = myGraphLayout.getLayoutIndex(rightChild);
    int rowsCount = 1;
    int blockSize = 1;
    PriorityQueue<GraphEdge> pending = new PriorityQueue<>(MAX_BLOCK_SIZE, new GraphEdgeComparator());
    pending.addAll(myLinearBekGraph.getAdjacentEdges(rightChild, EdgeFilter.NORMAL_DOWN));
    // computed lazily, at most once per call
    @Nullable Set<Integer> magicSet = null;
    while (!pending.isEmpty()) {
        GraphEdge edge = pending.poll();
        Integer downNode = edge.getDownNodeIndex();
        Integer upNode = edge.getUpNodeIndex();
        // can not happen
        assert upNode != null;
        if (downNode == null) {
            // allow very long edges down
            fragment.addTail(upNode);
            continue;
        }
        if (downNode == leftChild) {
            // found first child
            fragment.addTail(upNode);
            fragment.setMergeWithOldCommit(true);
        } else if (downNode == rightChild + rowsCount) {
            // all is fine, continuing
            rowsCount++;
            blockSize++;
            pending.addAll(myLinearBekGraph.getAdjacentEdges(downNode, EdgeFilter.NORMAL_DOWN));
            fragment.addBody(upNode);
        } else if (downNode > rightChild + rowsCount && downNode < leftChild) {
            // skipped some rows but still above the left child: the block grows up to this node
            rowsCount = downNode - rightChild + 1;
            blockSize++;
            pending.addAll(myLinearBekGraph.getAdjacentEdges(downNode, EdgeFilter.NORMAL_DOWN));
            fragment.addBody(upNode);
        } else if (downNode > leftChild) {
            // the edge ends below the left child: accept it as a tail edge or give up
            int li = myGraphLayout.getLayoutIndex(downNode);
            if (leftLi > rightLi && !fragment.isMergeWithOldCommit()) {
                if (downNode > leftChild + MAGIC_SET_SIZE) {
                    return null;
                }
                if (magicSet == null) {
                    magicSet = calculateMagicSet(leftChild);
                }
                if (!magicSet.contains(downNode)) {
                    return null;
                }
                fragment.addTailEdge(upNode, downNode);
            } else if ((li > leftLi && li < rightLi) || (li == leftLi)) {
                fragment.addTailEdge(upNode, downNode);
            } else if (li >= rightLi) {
                return null;
            } else if (downNode > leftChild + MAGIC_SET_SIZE) {
                if (!fragment.hasTailEdge(upNode) && !fragment.isBody(upNode)) {
                    return null;
                }
            } else {
                if (magicSet == null) {
                    magicSet = calculateMagicSet(leftChild);
                }
                if (!magicSet.contains(downNode)) {
                    return null;
                }
                fragment.addTailEdge(upNode, downNode);
            }
        }
        if (blockSize >= MAX_BLOCK_SIZE) {
            return null;
        }
    }
    // no tails: this can happen if we ran into initial import
    return fragment.getTails().isEmpty() ? null : fragment;
}
Also used : PriorityQueue(java.util.PriorityQueue) GraphEdge(com.intellij.vcs.log.graph.api.elements.GraphEdge) Nullable(org.jetbrains.annotations.Nullable)

Aggregations

PriorityQueue (java.util.PriorityQueue) 51, ArrayList (java.util.ArrayList) 16, List (java.util.List) 10, Map (java.util.Map) 9, HashMap (java.util.HashMap) 7, LinkedList (java.util.LinkedList) 5, File (java.io.File) 4, IOException (java.io.IOException) 4, Entry (java.util.Map.Entry) 4, Random (java.util.Random) 4, BytesRef (org.apache.lucene.util.BytesRef) 4, AbstractMapTable (com.ctriposs.sdb.table.AbstractMapTable) 3, ScoredObject (edu.stanford.nlp.util.ScoredObject) 3, Comparator (java.util.Comparator) 3, Set (java.util.Set) 3, FCMapTable (com.ctriposs.sdb.table.FCMapTable) 2, HashMapTable (com.ctriposs.sdb.table.HashMapTable) 2, IMapEntry (com.ctriposs.sdb.table.IMapEntry) 2, MMFMapTable (com.ctriposs.sdb.table.MMFMapTable) 2, Type (com.facebook.presto.spi.type.Type) 2