Search in sources :

Example 26 with BreakIterator

use of java.text.BreakIterator in project lucene-solr by apache.

the class TestWholeBreakIterator method testFirstPosition.

/**
 * The current position must be ignored: the initial position is always {@code first()}.
 *
 * <p>Compares a {@code WholeBreakIterator} against the JDK sentence iterator for
 * {@link Locale#ROOT} on the text {@code "000ab000"} via {@code assertSameBreaks}.
 * NOTE(review): the three int arguments are presumably offset, length and current
 * position of the text window - confirm against {@code assertSameBreaks}.
 */
public void testFirstPosition() throws Exception {
    final BreakIterator wholeIterator = new WholeBreakIterator();
    final BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.ROOT);
    assertSameBreaks("000ab000", 3, 2, 4, sentenceIterator, wholeIterator);
}
Also used : BreakIterator(java.text.BreakIterator)

Example 27 with BreakIterator

use of java.text.BreakIterator in project lucene-solr by apache.

the class BreakIteratorBoundaryScanner method get.

/**
 * Builds a boundary scanner backed by a {@link BreakIterator} chosen from the
 * per-field highlighting parameters.
 *
 * <p>The locale comes from {@code HighlightParams.BS_LANGUAGE} and
 * {@code HighlightParams.BS_COUNTRY}; when no language is given {@link Locale#ROOT}
 * is used. The iterator type comes from {@code HighlightParams.BS_TYPE}
 * (case-insensitive, defaulting to {@code "WORD"}) and must be one of
 * {@code character}, {@code word}, {@code line} or {@code sentence}.
 *
 * @param fieldName the field whose parameters are looked up
 * @param params    the request parameters
 * @return a scanner wrapping the selected break iterator
 * @throws SolrException with {@code BAD_REQUEST} if a country is given without a
 *                       language, or if the type is not one of the supported values
 */
@Override
protected BoundaryScanner get(String fieldName, SolrParams params) {
    // Resolve the locale first; a country on its own is rejected.
    final String lang = params.getFieldParam(fieldName, HighlightParams.BS_LANGUAGE);
    final String region = params.getFieldParam(fieldName, HighlightParams.BS_COUNTRY);
    if (lang == null && region != null) {
        throw new SolrException(ErrorCode.BAD_REQUEST, HighlightParams.BS_LANGUAGE + " parameter cannot be null when you specify " + HighlightParams.BS_COUNTRY);
    }
    final Locale locale;
    if (lang == null) {
        locale = Locale.ROOT;
    } else if (region == null) {
        locale = new Locale(lang);
    } else {
        locale = new Locale(lang, region);
    }

    // Pick the BreakIterator flavour; the type name is lower-cased locale-independently.
    final String type = params.getFieldParam(fieldName, HighlightParams.BS_TYPE, "WORD").toLowerCase(Locale.ROOT);
    final BreakIterator bi;
    switch (type) {
        case "character":
            bi = BreakIterator.getCharacterInstance(locale);
            break;
        case "word":
            bi = BreakIterator.getWordInstance(locale);
            break;
        case "line":
            bi = BreakIterator.getLineInstance(locale);
            break;
        case "sentence":
            bi = BreakIterator.getSentenceInstance(locale);
            break;
        default:
            throw new SolrException(ErrorCode.BAD_REQUEST, type + " is invalid for parameter " + HighlightParams.BS_TYPE);
    }
    return new org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner(bi);
}
Also used : Locale(java.util.Locale) SolrException(org.apache.solr.common.SolrException) BreakIterator(java.text.BreakIterator)

Example 28 with BreakIterator

use of java.text.BreakIterator in project lucene-solr by apache.

the class FieldHighlighter method highlightOffsetsEnums.

// Algorithm: treat sentence snippets (passages) as miniature documents.
// Term hits are mapped onto passages via BreakIterator.preceding(offset) / following(offset),
// and each passage is scored as norm(passageStartOffset) * sum(weight * tf(freq, passageLength)).
//
// Takes the per-term offset enumerations for the field (offsetsEnums) and returns at most
// maxPassages passages, ordered by ascending start offset.
// Throws IllegalArgumentException if a term reports a start offset of -1 (no offsets indexed);
// IOException is declared for the enumeration calls.
protected Passage[] highlightOffsetsEnums(List<OffsetsEnum> offsetsEnums) throws IOException {
    PassageScorer scorer = passageScorer;
    BreakIterator breakIterator = this.breakIterator;
    // end index of the text currently set on the break iterator; hits beyond it are not highlighted
    final int contentLength = breakIterator.getText().getEndIndex();
    // +1 leaves room for the sentinel added below
    PriorityQueue<OffsetsEnum> offsetsEnumQueue = new PriorityQueue<>(offsetsEnums.size() + 1);
    for (OffsetsEnum off : offsetsEnums) {
        off.setWeight(scorer.weight(contentLength, off.freq()));
        // go to first position
        off.nextPosition();
        offsetsEnumQueue.add(off);
    }
    // a sentinel for termination: its start offset is Integer.MAX_VALUE (asserted below), so when it is
    // polled it finalizes the in-progress passage and then trips the "start >= contentLength" exit
    // NOTE(review): relies on OffsetsEnum's natural ordering being by offset - confirm
    offsetsEnumQueue.add(new OffsetsEnum(null, EMPTY));
    // Queue head is the lowest-scoring passage (ties: smaller start offset first), so peek()/poll()
    // give the passage to beat / evict when the queue is full.
    PriorityQueue<Passage> passageQueue = new PriorityQueue<>(Math.min(64, maxPassages + 1), (left, right) -> {
        if (left.getScore() < right.getScore()) {
            return -1;
        } else if (left.getScore() > right.getScore()) {
            return 1;
        } else {
            return left.getStartOffset() - right.getStartOffset();
        }
    });
    // the current passage in-progress.  Will either get reset or added to queue.
    Passage passage = new Passage();
    OffsetsEnum off;
    while ((off = offsetsEnumQueue.poll()) != null) {
        int start = off.startOffset();
        if (start == -1) {
            throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
        }
        int end = off.endOffset();
        // This hit starts inside the content but ends past contentLength. Skip it as if we never
        // saw this term: it won't cause a passage to be added to passageQueue or anything.
        // The sentinel is not affected because its start is Integer.MAX_VALUE.
        assert EMPTY.startOffset() == Integer.MAX_VALUE;
        if (start < contentLength && end > contentLength) {
            continue;
        }
        // See if this term should be part of a new passage.
        if (start >= passage.getEndOffset()) {
            if (passage.getStartOffset() >= 0) {
                // true if this passage has terms; otherwise couldn't find any (yet)
                // finalize passage: apply the position-based norm to the accumulated score
                passage.setScore(passage.getScore() * scorer.norm(passage.getStartOffset()));
                // new sentence: first add 'passage' to queue
                if (passageQueue.size() == maxPassages && passage.getScore() < passageQueue.peek().getScore()) {
                    // can't compete, just reset it
                    passage.reset();
                } else {
                    passageQueue.offer(passage);
                    if (passageQueue.size() > maxPassages) {
                        // over capacity: evict the lowest-ranked passage and recycle the instance
                        passage = passageQueue.poll();
                        passage.reset();
                    } else {
                        passage = new Passage();
                    }
                }
            }
            // if we exceed limit, we are done
            if (start >= contentLength) {
                break;
            }
            // advance breakIterator: the passage spans from the boundary at or before 'start'
            // (clamped to 0, which also covers BreakIterator.DONE == -1) to the boundary after
            // 'start' (capped at contentLength)
            passage.setStartOffset(Math.max(breakIterator.preceding(start + 1), 0));
            passage.setEndOffset(Math.min(breakIterator.following(start), contentLength));
        }
        // Add this term to the passage.
        // tf counts how many occurrences of this term fall inside the current passage
        int tf = 0;
        while (true) {
            tf++;
            // a reference; safe to refer to
            BytesRef term = off.getTerm();
            assert term != null;
            passage.addMatch(start, end, term);
            // see if there are multiple occurrences of this term in this passage. If so, add them.
            if (!off.hasMorePositions()) {
                // No more in the entire text. Already removed from pq; move on
                break;
            }
            off.nextPosition();
            start = off.startOffset();
            end = off.endOffset();
            if (start >= passage.getEndOffset() || end > contentLength) {
                // it's beyond this passage (or past the content limit): put the enum back so the
                // outer loop handles this position later
                offsetsEnumQueue.offer(off);
                break;
            }
        }
        // accumulate this term's contribution; the norm is applied once, when the passage is finalized
        passage.setScore(passage.getScore() + off.getWeight() * scorer.tf(tf, passage.getEndOffset() - passage.getStartOffset()));
    }
    Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
    for (Passage p : passages) {
        // NOTE(review): presumably orders the matches inside each passage - confirm in Passage.sort()
        p.sort();
    }
    // sort in ascending order of start offset
    Arrays.sort(passages, (left, right) -> left.getStartOffset() - right.getStartOffset());
    return passages;
}
Also used : PriorityQueue(java.util.PriorityQueue) BytesRef(org.apache.lucene.util.BytesRef) BreakIterator(java.text.BreakIterator)

Example 29 with BreakIterator

use of java.text.BreakIterator in project WordPress-Android by wordpress-mobile.

the class ReaderPost method extractTitle.

/*
 * Builds a fallback title from a post's excerpt - used when the post has no title.
 *
 * Returns null for a null/empty excerpt, the trimmed excerpt itself when it is shorter
 * than maxLen, and otherwise the leading whole words of the excerpt (up to and including
 * the word that reaches maxLen characters), trimmed and followed by "...".
 */
private static String extractTitle(final String excerpt, int maxLen) {
    if (TextUtils.isEmpty(excerpt)) {
        return null;
    }
    if (excerpt.length() < maxLen) {
        return excerpt.trim();
    }

    final BreakIterator words = BreakIterator.getWordInstance();
    words.setText(excerpt);

    final StringBuilder title = new StringBuilder();
    int consumed = 0;
    int from = words.first();
    // Walk word boundaries, copying one segment at a time until maxLen characters are taken.
    for (int to = words.next(); to != BreakIterator.DONE; to = words.next()) {
        title.append(excerpt, from, to);
        consumed += to - from;
        if (consumed >= maxLen) {
            break;
        }
        from = to;
    }

    return consumed == 0 ? null : title.toString().trim() + "...";
}
Also used : BreakIterator(java.text.BreakIterator)

Example 30 with BreakIterator

use of java.text.BreakIterator in project WordPress-Android by wordpress-mobile.

the class PostUtils method makeExcerpt.

/*
 * Produces a short excerpt from a post description.
 *
 * The description is first passed through HtmlUtils.fastStripHtml. Returns null for a
 * null/empty description, trimEx(text) when the stripped text is shorter than
 * MAX_EXCERPT_LEN, and otherwise the leading whole words (up to and including the word
 * that reaches MAX_EXCERPT_LEN characters) passed through trimEx and followed by "...".
 */
private static String makeExcerpt(String description) {
    if (TextUtils.isEmpty(description)) {
        return null;
    }

    final String text = HtmlUtils.fastStripHtml(description);
    if (text.length() < MAX_EXCERPT_LEN) {
        return trimEx(text);
    }

    final BreakIterator words = BreakIterator.getWordInstance();
    words.setText(text);

    final StringBuilder excerpt = new StringBuilder();
    int consumed = 0;
    int from = words.first();
    // Walk word boundaries, copying one segment at a time until the length limit is reached.
    for (int to = words.next(); to != BreakIterator.DONE; to = words.next()) {
        excerpt.append(text, from, to);
        consumed += to - from;
        if (consumed >= MAX_EXCERPT_LEN) {
            break;
        }
        from = to;
    }

    return consumed == 0 ? null : trimEx(excerpt.toString()) + "...";
}
Also used : BreakIterator(java.text.BreakIterator)

Aggregations

BreakIterator (java.text.BreakIterator): 59; ArrayList (java.util.ArrayList): 10; Locale (java.util.Locale): 6; IntPair (edu.illinois.cs.cogcomp.core.datastructures.IntPair): 3; BytesRef (org.apache.lucene.util.BytesRef): 3; Snippet (org.apache.lucene.search.highlight.Snippet): 2; Intent (android.content.Intent): 1; TagElement (com.google.devtools.j2objc.ast.TagElement): 1; Pair (edu.illinois.cs.cogcomp.core.datastructures.Pair): 1; TextAnnotation (edu.illinois.cs.cogcomp.core.datastructures.textannotation.TextAnnotation): 1; IOException (java.io.IOException): 1; Iterator (java.util.Iterator): 1; PriorityQueue (java.util.PriorityQueue): 1; JComponent (javax.swing.JComponent): 1; Text (org.apache.hadoop.io.Text): 1; Analyzer (org.apache.lucene.analysis.Analyzer): 1; IndexSearcher (org.apache.lucene.search.IndexSearcher): 1; Encoder (org.apache.lucene.search.highlight.Encoder): 1; CustomSeparatorBreakIterator (org.apache.lucene.search.postingshighlight.CustomSeparatorBreakIterator): 1; CustomPassageFormatter (org.apache.lucene.search.uhighlight.CustomPassageFormatter): 1