use of java.text.BreakIterator in project lucene-solr by apache.
the class TestWholeBreakIterator method testFirstPosition.
/**
 * Verifies that the iterator's current position is ignored: the initial position is always
 * first().
 */
public void testFirstPosition() throws Exception {
  final BreakIterator sentenceIterator = BreakIterator.getSentenceInstance(Locale.ROOT);
  final BreakIterator wholeIterator = new WholeBreakIterator();
  assertSameBreaks("000ab000", 3, 2, 4, sentenceIterator, wholeIterator);
}
use of java.text.BreakIterator in project lucene-solr by apache.
the class BreakIteratorBoundaryScanner method get.
/**
 * Creates a BreakIterator-backed boundary scanner for the given field.
 *
 * <p>The locale is built from the per-field {@code BS_LANGUAGE} / {@code BS_COUNTRY} params
 * (falling back to {@link Locale#ROOT} when no language is given), and the iterator flavour is
 * chosen by {@code BS_TYPE} (case-insensitive; defaults to "WORD").
 *
 * @throws SolrException with BAD_REQUEST if a country is given without a language, or if the
 *     type is not one of character, word, line or sentence
 */
@Override
protected BoundaryScanner get(String fieldName, SolrParams params) {
  // Resolve the locale first; a country without a language is rejected.
  final String lang = params.getFieldParam(fieldName, HighlightParams.BS_LANGUAGE);
  final String region = params.getFieldParam(fieldName, HighlightParams.BS_COUNTRY);
  if (lang == null && region != null) {
    throw new SolrException(ErrorCode.BAD_REQUEST, HighlightParams.BS_LANGUAGE + " parameter cannot be null when you specify " + HighlightParams.BS_COUNTRY);
  }

  final Locale locale;
  if (lang == null) {
    locale = Locale.ROOT;
  } else if (region == null) {
    locale = new Locale(lang);
  } else {
    locale = new Locale(lang, region);
  }

  // Pick the BreakIterator flavour requested by the (lower-cased) type parameter.
  final String type = params.getFieldParam(fieldName, HighlightParams.BS_TYPE, "WORD").toLowerCase(Locale.ROOT);
  final BreakIterator iterator;
  switch (type) {
    case "character":
      iterator = BreakIterator.getCharacterInstance(locale);
      break;
    case "word":
      iterator = BreakIterator.getWordInstance(locale);
      break;
    case "line":
      iterator = BreakIterator.getLineInstance(locale);
      break;
    case "sentence":
      iterator = BreakIterator.getSentenceInstance(locale);
      break;
    default:
      throw new SolrException(ErrorCode.BAD_REQUEST, type + " is invalid for parameter " + HighlightParams.BS_TYPE);
  }
  return new org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner(iterator);
}
use of java.text.BreakIterator in project lucene-solr by apache.
the class FieldHighlighter method highlightOffsetsEnums.
/**
 * Builds scored passages from the given offsets enums.
 *
 * <p>Algorithm: treat sentence snippets as miniature documents. We can intersect these with the
 * postings lists via BreakIterator.preceding(offset), and score each sentence as
 * norm(sentenceStartOffset) * sum(weight * tf(freq)).
 *
 * <p>Each enum is weighted via the passage scorer, advanced to its first position and placed in
 * a priority queue together with a sentinel entry. Hits are then polled one at a time; a hit
 * that starts at or after the end of the in-progress passage finalizes that passage and opens a
 * new one whose bounds come from the break iterator (clamped to [0, contentLength]).
 *
 * @param offsetsEnums the enums to highlight; each one has setWeight() and nextPosition() called
 *     on it before being queued
 * @return the retained passages (the queue is trimmed whenever it grows past maxPassages),
 *     each sorted via Passage.sort(), ordered by ascending start offset
 * @throws IllegalArgumentException if an enum reports a start offset of -1
 * @throws IOException declared by the signature; presumably propagated from the OffsetsEnum
 *     calls -- confirm against OffsetsEnum
 */
protected Passage[] highlightOffsetsEnums(List<OffsetsEnum> offsetsEnums) throws IOException {
PassageScorer scorer = passageScorer;
BreakIterator breakIterator = this.breakIterator;
// the end index of the break iterator's text serves as the content limit for all offsets below
final int contentLength = breakIterator.getText().getEndIndex();
// +1 leaves room for the sentinel added after the loop
// NOTE(review): ordering relies on OffsetsEnum's natural ordering (presumably by offset) -- confirm
PriorityQueue<OffsetsEnum> offsetsEnumQueue = new PriorityQueue<>(offsetsEnums.size() + 1);
for (OffsetsEnum off : offsetsEnums) {
off.setWeight(scorer.weight(contentLength, off.freq()));
// go to first position
off.nextPosition();
offsetsEnumQueue.add(off);
}
// a sentinel for termination
offsetsEnumQueue.add(new OffsetsEnum(null, EMPTY));
// orders passages by ascending score, ties broken by ascending start offset, so the head of
// the queue is the weakest passage kept so far
PriorityQueue<Passage> passageQueue = new PriorityQueue<>(Math.min(64, maxPassages + 1), (left, right) -> {
if (left.getScore() < right.getScore()) {
return -1;
} else if (left.getScore() > right.getScore()) {
return 1;
} else {
return left.getStartOffset() - right.getStartOffset();
}
});
// the current passage in-progress. Will either get reset or added to queue.
Passage passage = new Passage();
OffsetsEnum off;
while ((off = offsetsEnumQueue.poll()) != null) {
int start = off.startOffset();
if (start == -1) {
throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
}
int end = off.endOffset();
// A hit that starts inside the content but ends past the content limit is skipped as if we
// never saw this term: it won't cause a passage to be added to passageQueue or anything,
// and the enum is not re-queued. Other queued hits are still processed.
// The sentinel is expected to report Integer.MAX_VALUE as its start offset (asserted here).
assert EMPTY.startOffset() == Integer.MAX_VALUE;
if (start < contentLength && end > contentLength) {
continue;
}
// See if this term should be part of a new passage.
if (start >= passage.getEndOffset()) {
if (passage.getStartOffset() >= 0) {
// true if this passage has terms; otherwise couldn't find any (yet)
// finalize passage
passage.setScore(passage.getScore() * scorer.norm(passage.getStartOffset()));
// new sentence: first add 'passage' to queue
if (passageQueue.size() == maxPassages && passage.getScore() < passageQueue.peek().getScore()) {
// can't compete, just reset it
passage.reset();
} else {
passageQueue.offer(passage);
if (passageQueue.size() > maxPassages) {
// over capacity: evict the head of the queue and recycle it as the next in-progress passage
passage = passageQueue.poll();
passage.reset();
} else {
passage = new Passage();
}
}
}
// if we exceed limit, we are done
if (start >= contentLength) {
break;
}
// advance breakIterator
// bounds are clamped to [0, contentLength]; Math.max(..., 0) also covers a negative
// result from preceding() (presumably BreakIterator.DONE -- confirm)
passage.setStartOffset(Math.max(breakIterator.preceding(start + 1), 0));
passage.setEndOffset(Math.min(breakIterator.following(start), contentLength));
}
// Add this term to the passage.
// tf counts how many positions of this enum were added to the current passage
int tf = 0;
while (true) {
tf++;
// a reference; safe to refer to
BytesRef term = off.getTerm();
assert term != null;
passage.addMatch(start, end, term);
// see if there are multiple occurrences of this term in this passage. If so, add them.
if (!off.hasMorePositions()) {
// No more in the entire text. Already removed from pq; move on
break;
}
off.nextPosition();
start = off.startOffset();
end = off.endOffset();
if (start >= passage.getEndOffset() || end > contentLength) {
// it's beyond this passage
// re-queue the enum so its new position is handled by a later iteration of the outer loop
offsetsEnumQueue.offer(off);
break;
}
}
// accumulate this enum's contribution: weight * tf(occurrences, passage length)
passage.setScore(passage.getScore() + off.getWeight() * scorer.tf(tf, passage.getEndOffset() - passage.getStartOffset()));
}
Passage[] passages = passageQueue.toArray(new Passage[passageQueue.size()]);
for (Passage p : passages) {
p.sort();
}
// sort in ascending order
Arrays.sort(passages, (left, right) -> left.getStartOffset() - right.getStartOffset());
return passages;
}
use of java.text.BreakIterator in project WordPress-Android by wordpress-mobile.
the class ReaderPost method extractTitle.
/*
 * Derives a title from a post's excerpt - used when the post has no title.
 *
 * Returns null for a null/empty excerpt and the trimmed excerpt itself when it is shorter
 * than maxLen. Otherwise word-boundary segments are appended until at least maxLen
 * characters have been collected, and the trimmed result is returned with "..." appended.
 */
private static String extractTitle(final String excerpt, int maxLen) {
    if (TextUtils.isEmpty(excerpt)) {
        return null;
    }
    if (excerpt.length() < maxLen) {
        return excerpt.trim();
    }

    final BreakIterator boundaries = BreakIterator.getWordInstance();
    boundaries.setText(excerpt);

    // the builder's length doubles as the running count of collected characters
    final StringBuilder title = new StringBuilder();
    int from = boundaries.first();
    for (int to = boundaries.next(); to != BreakIterator.DONE; from = to, to = boundaries.next()) {
        title.append(excerpt, from, to);
        if (title.length() >= maxLen) {
            break;
        }
    }

    if (title.length() == 0) {
        return null;
    }
    return title.toString().trim() + "...";
}
use of java.text.BreakIterator in project WordPress-Android by wordpress-mobile.
the class PostUtils method makeExcerpt.
/*
 * Builds an excerpt from a post description.
 *
 * Returns null for a null/empty description. The description is stripped of HTML first;
 * if the stripped text is shorter than MAX_EXCERPT_LEN it is returned via trimEx().
 * Otherwise word-boundary segments are appended until at least MAX_EXCERPT_LEN characters
 * have been collected, and the trimEx()'d result is returned with "..." appended.
 */
private static String makeExcerpt(String description) {
    if (TextUtils.isEmpty(description)) {
        return null;
    }

    final String plainText = HtmlUtils.fastStripHtml(description);
    if (plainText.length() < MAX_EXCERPT_LEN) {
        return trimEx(plainText);
    }

    final BreakIterator boundaries = BreakIterator.getWordInstance();
    boundaries.setText(plainText);

    // the builder's length doubles as the running count of collected characters
    final StringBuilder excerpt = new StringBuilder();
    int segmentStart = boundaries.first();
    int segmentEnd = boundaries.next();
    while (segmentEnd != BreakIterator.DONE) {
        excerpt.append(plainText, segmentStart, segmentEnd);
        if (excerpt.length() >= MAX_EXCERPT_LEN) {
            break;
        }
        segmentStart = segmentEnd;
        segmentEnd = boundaries.next();
    }

    if (excerpt.length() == 0) {
        return null;
    }
    return trimEx(excerpt.toString()) + "...";
}
Aggregations