Use of org.apache.lucene.index.TermVectorOffsetInfo in the Apache Jackrabbit project.
From class AbstractExcerpt, method createTermPositionVector:
/**
 * Tokenizes the given text with the index's text analyzer and records the
 * start/end offsets of every token in a {@link TermPositionVector}.
 * <p>
 * Only offsets are collected: {@code getTermPositions(int)} of the returned
 * vector always returns {@code null}. The returned vector reports the empty
 * string as its field name.
 *
 * @param text the text.
 * @return a <code>TermPositionVector</code> for the given text.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap =
            new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        try {
            while (ts.incrementToken()) {
                OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
                TermAttribute term = ts.getAttribute(TermAttribute.class);
                String termText = term.term();
                TermVectorOffsetInfo[] info = termMap.get(termText);
                if (info == null) {
                    info = new TermVectorOffsetInfo[1];
                } else {
                    // grow by one slot; per-term frequencies are expected to be small
                    info = Arrays.copyOf(info, info.length + 1);
                }
                info[info.length - 1] = new TermVectorOffsetInfo(
                        offset.startOffset(), offset.endOffset());
                termMap.put(termText, info);
            }
            ts.end();
        } finally {
            // always release the token stream, even when tokenization fails
            ts.close();
        }
    } catch (IOException e) {
        // should never happen, we are reading from a string
    }
    return new TermPositionVector() {

        // snapshot of the term texts; TreeMap key order makes this array
        // sorted, which indexOf() relies on for binary search
        private String[] terms =
                termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            // positions are not tracked, only offsets
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            // valid because 'terms' is sorted (see field initializer)
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                // honor 'start' as required by the TermFreqVector contract
                // (previously ignored; callers in this project pass start == 0)
                res[i] = indexOf(terms[start + i]);
            }
            return res;
        }
    };
}
Use of org.apache.lucene.index.TermVectorOffsetInfo in the Apache Jackrabbit project.
From class DefaultHighlighter, method mergeFragments:
/**
 * Merges the given highlight offsets into an excerpt containing at most
 * <code>maxFragments</code> fragments, each decorated with the given
 * start/end markup and with highlighted terms wrapped in
 * <code>hlStart</code>/<code>hlEnd</code>.
 * <p>
 * The offsets are assumed to be sorted by start offset (callers sort them
 * with a <code>TermVectorOffsetInfoSorter</code> before invoking this
 * method — see <code>doHighlight</code>).
 *
 * @param offsets       start/end offsets of the terms to highlight; may be
 *                      <code>null</code> or empty.
 * @param text          the original text the offsets refer to.
 * @param excerptStart  markup prepended to the whole excerpt.
 * @param excerptEnd    markup appended to the whole excerpt.
 * @param fragmentStart markup prepended to each fragment.
 * @param fragmentEnd   markup appended to each fragment.
 * @param hlStart       markup prepended to each highlighted term.
 * @param hlEnd         markup appended to each highlighted term.
 * @param maxFragments  maximum number of fragments in the excerpt.
 * @param surround      number of context characters around a highlight.
 * @return the excerpt string.
 * @throws IOException if an error occurs while reading the text.
 */
protected String mergeFragments(TermVectorOffsetInfo[] offsets, String text, String excerptStart, String excerptEnd, String fragmentStart, String fragmentEnd, String hlStart, String hlEnd, int maxFragments, int surround) throws IOException {
    if (offsets == null || offsets.length == 0) {
        // nothing to highlight
        return createDefaultExcerpt(text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, surround * 2);
    }
    // Math.min(10, offsets.length); // 10 terms is plenty?
    int lastOffset = offsets.length;
    List<FragmentInfo> fragmentInfoList = new ArrayList<FragmentInfo>();
    // group offsets into candidate fragments; offsets whose end lies beyond
    // the text length cannot be resolved against 'text' and are skipped
    if (offsets[0].getEndOffset() <= text.length()) {
        FragmentInfo fi = new FragmentInfo(offsets[0], surround * 2);
        for (int i = 1; i < lastOffset; i++) {
            if (offsets[i].getEndOffset() > text.length()) {
                break;
            }
            // when add() returns false the offset did not fit the current
            // fragment -> close it and start a new one at this offset
            if (fi.add(offsets[i])) {
                continue;
            }
            fragmentInfoList.add(fi);
            fi = new FragmentInfo(offsets[i], surround * 2);
        }
        fragmentInfoList.add(fi);
    }
    if (fragmentInfoList.isEmpty()) {
        // nothing to highlight
        return createDefaultExcerpt(text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, surround * 2);
    }
    // sort with score
    Collections.sort(fragmentInfoList, new FragmentInfoScoreSorter());
    // extract best fragments
    List<FragmentInfo> bestFragmentsList = new ArrayList<FragmentInfo>();
    for (int i = 0; i < Math.min(fragmentInfoList.size(), maxFragments); i++) {
        bestFragmentsList.add(fragmentInfoList.get(i));
    }
    // re-sort with positions
    Collections.sort(bestFragmentsList, new FragmentInfoPositionSorter());
    // merge #maxFragments fragments
    // the text is consumed strictly left to right through 'reader';
    // 'pos' tracks how many characters have been read/skipped so far
    StringReader reader = new StringReader(text);
    StringBuffer sb = new StringBuffer(excerptStart);
    int pos = 0;
    char[] cbuf;
    int skip;
    int nextStart;
    int skippedChars;
    int firstWhitespace;
    for (int i = 0; i < bestFragmentsList.size(); i++) {
        FragmentInfo fi = bestFragmentsList.get(i);
        fi.trim();
        nextStart = fi.getStartOffset();
        skip = nextStart - pos;
        if (skip > surround * 2) {
            skip -= surround;
            if (i > 0) {
                // end last fragment: append up to 'surround' trailing context
                // characters, then cut back to the last whitespace so the
                // fragment does not end in the middle of a word
                cbuf = new char[surround];
                reader.read(cbuf, 0, surround);
                // find last whitespace
                skippedChars = 1;
                for (; skippedChars < surround + 1; skippedChars++) {
                    if (Character.isWhitespace(cbuf[surround - skippedChars])) {
                        break;
                    }
                }
                pos += surround;
                if (skippedChars > surround) {
                    // no whitespace found in the context window
                    skippedChars = surround;
                }
                sb.append(escape(new String(cbuf, 0, surround - skippedChars)));
                sb.append(fragmentEnd);
            }
        }
        if (skip >= surround) {
            if (i > 0) {
                skip -= surround;
            }
            // skip
            reader.skip((long) skip);
            pos += skip;
        }
        // start fragment
        cbuf = new char[nextStart - pos];
        // skippedChars starts at the last index of the leading context buffer
        skippedChars = Math.max(cbuf.length - 1, 0);
        firstWhitespace = skippedChars;
        reader.read(cbuf, 0, nextStart - pos);
        pos += (nextStart - pos);
        sb.append(fragmentStart);
        // find last period followed by whitespace
        if (cbuf.length > 0) {
            // scan backwards for a sentence boundary ('.' immediately before
            // a whitespace); remember the last whitespace seen as a fallback
            for (; skippedChars >= 0; skippedChars--) {
                if (Character.isWhitespace(cbuf[skippedChars])) {
                    firstWhitespace = skippedChars;
                    if (skippedChars - 1 >= 0 && cbuf[skippedChars - 1] == '.') {
                        skippedChars++;
                        break;
                    }
                }
            }
        }
        boolean sentenceStart = true;
        if (skippedChars == -1) {
            // no sentence boundary found in the leading context
            if (pos == cbuf.length) {
                // this fragment is the start of the text -> skip none
                skippedChars = 0;
            } else {
                // fall back to the last whitespace and mark as mid-sentence
                sentenceStart = false;
                skippedChars = firstWhitespace + 1;
            }
        }
        if (!sentenceStart) {
            // leading ellipsis marks a fragment starting mid-sentence
            sb.append("... ");
        }
        sb.append(escape(new String(cbuf, skippedChars, cbuf.length - skippedChars)));
        // iterate terms
        for (Iterator<TermVectorOffsetInfo> iter = fi.iterator(); iter.hasNext(); ) {
            TermVectorOffsetInfo ti = iter.next();
            nextStart = ti.getStartOffset();
            if (nextStart - pos > 0) {
                // plain text between the previous highlight and this one
                cbuf = new char[nextStart - pos];
                int charsRead = reader.read(cbuf, 0, nextStart - pos);
                pos += (nextStart - pos);
                sb.append(escape(new String(cbuf, 0, charsRead)));
            }
            sb.append(hlStart);
            nextStart = ti.getEndOffset();
            // print term
            cbuf = new char[nextStart - pos];
            reader.read(cbuf, 0, nextStart - pos);
            pos += (nextStart - pos);
            sb.append(escape(new String(cbuf)));
            sb.append(hlEnd);
        }
    }
    if (pos != 0) {
        // end fragment: append trailing context after the last highlight
        // NOTE(review): lastOffset == offsets.length, so this condition is
        // never true; it appears to be a leftover from when lastOffset was
        // capped (see the commented-out Math.min above) — verify
        if (offsets.length > lastOffset) {
            surround = Math.min(offsets[lastOffset].getStartOffset() - pos, surround);
        }
        cbuf = new char[surround];
        skip = reader.read(cbuf, 0, surround);
        // peek one character ahead to detect the end of the text
        boolean EOF = reader.read() == -1;
        if (skip >= 0) {
            if (!EOF) {
                // cut the trailing context back to the last whitespace
                skippedChars = 1;
                for (; skippedChars < surround + 1; skippedChars++) {
                    if (Character.isWhitespace(cbuf[surround - skippedChars])) {
                        break;
                    }
                }
                if (skippedChars > surround) {
                    skippedChars = surround;
                }
            } else {
                // at the end of the text: keep everything that was read
                skippedChars = 0;
            }
            sb.append(escape(new String(cbuf, 0, EOF ? skip : (surround - skippedChars))));
            if (!EOF) {
                char lastChar = sb.charAt(sb.length() - 1);
                if (lastChar != '.' && lastChar != '!' && lastChar != '?') {
                    // trailing ellipsis when the excerpt ends mid-sentence
                    sb.append(" ...");
                }
            }
        }
        sb.append(fragmentEnd);
    }
    sb.append(excerptEnd);
    return sb.toString();
}
Use of org.apache.lucene.index.TermVectorOffsetInfo in the Apache Jackrabbit project.
From class DefaultHighlighter, method doHighlight:
/**
 * Collects the offsets of all query-term hits in the term vector, merging
 * the per-term hits of a multi-term entry (phrase) into single intervals,
 * then delegates the rendering to
 * {@link #mergeFragments}.
 *
 * @see #highlight(TermPositionVector, Set, String, String, String, String, String, String, String, int, int)
 */
protected String doHighlight(TermPositionVector tvec, Set<Term[]> queryTerms, String text, String excerptStart, String excerptEnd, String fragmentStart, String fragmentEnd, String hlStart, String hlEnd, int maxFragments, int surround) throws IOException {
    List<TermVectorOffsetInfo> termOffsetInfo = new ArrayList<TermVectorOffsetInfo>();
    Iterator<Term[]> it = queryTerms.iterator();
    while (it.hasNext()) {
        // each Term[] entry represents one query clause; entries with more
        // than one term are treated as a phrase whose terms must be adjacent
        Term[] qt = it.next();
        if (qt == null) {
            continue;
        }
        final int qtLen = qt.length;
        if (qtLen == 0) {
            continue;
        }
        String[] qtText = new String[qtLen];
        for (int i = 0; i < qtLen; i++) {
            qtText[i] = qt[i].text();
        }
        // map each term's vector index to its offsets in the text
        // NOTE(review): indexesOf() returns -1 for terms not in the vector;
        // several missing terms would collide on the key -1 here — verify
        int[] tvecindexes = tvec.indexesOf(qtText, 0, qtText.length);
        Map<Integer, TermVectorOffsetInfo[]> localTermOffsetInfo = new HashMap<Integer, TermVectorOffsetInfo[]>();
        for (int tvecindex : tvecindexes) {
            TermVectorOffsetInfo[] termoffsets = tvec.getOffsets(tvecindex);
            if (termoffsets == null || termoffsets.length == 0) {
                continue;
            }
            localTermOffsetInfo.put(tvecindex, termoffsets);
        }
        // if the first one is there
        if (tvecindexes.length > 0 && tvecindexes[0] >= 0) {
            // we have to build one interval TermVectorOffsetInfo for each
            // hit;
            List<TermVectorOffsetInfo> intervalTermOffsetInfo = new ArrayList<TermVectorOffsetInfo>();
            // pick all the first key's hist as interval start
            TermVectorOffsetInfo[] firstKeyTermOffsets = localTermOffsetInfo.get(tvecindexes[0]);
            Arrays.sort(firstKeyTermOffsets, new TermVectorOffsetInfoSorter());
            intervalTermOffsetInfo.addAll(Arrays.asList(firstKeyTermOffsets));
            // dropped from the list
            // for every following phrase term, try to extend each candidate
            // interval; candidates that cannot be extended are removed
            for (int i = 1; i < tvecindexes.length; i++) {
                final Integer key = tvecindexes[i];
                TermVectorOffsetInfo[] termoffsets = localTermOffsetInfo.get(key);
                if (termoffsets == null) {
                    continue;
                }
                Arrays.sort(termoffsets, new TermVectorOffsetInfoSorter());
                Iterator<TermVectorOffsetInfo> intervalIterator = intervalTermOffsetInfo.iterator();
                // 'index' walks 'termoffsets' across ALL candidate intervals
                // (both are sorted by offset)
                int index = 0;
                while (intervalIterator.hasNext()) {
                    TermVectorOffsetInfo intervalOI = intervalIterator.next();
                    if (index >= termoffsets.length) {
                        intervalIterator.remove();
                        continue;
                    }
                    boolean matchSearch = true;
                    boolean matchFound = false;
                    while (matchSearch) {
                        TermVectorOffsetInfo localOI = termoffsets[index];
                        // check interval match
                        // CJK languages will have the tokens from the PhraseQuery glued together (see LUCENE-2458)
                        int diff = localOI.getStartOffset() - intervalOI.getEndOffset();
                        // after upgrading to lucene 3.1
                        if (diff == 1 || diff == 0) {
                            // adjacent in the text -> extend the interval
                            // (mutates the offset object held by the vector)
                            intervalOI.setEndOffset(localOI.getEndOffset());
                            matchSearch = false;
                            matchFound = true;
                        }
                        index++;
                        if (index >= termoffsets.length) {
                            matchSearch = false;
                        }
                    }
                    if (!matchFound) {
                        // NOTE(review): rewinding 'index' by one after a failed
                        // match looks fragile — confirm the intended pairing of
                        // term offsets to intervals before changing this
                        index--;
                        intervalIterator.remove();
                    }
                }
            }
            termOffsetInfo.addAll(intervalTermOffsetInfo);
        }
    }
    TermVectorOffsetInfo[] offsets = termOffsetInfo.toArray(new TermVectorOffsetInfo[termOffsetInfo.size()]);
    // sort offsets
    if (offsets != null && offsets.length > 1) {
        Arrays.sort(offsets, new TermVectorOffsetInfoSorter());
    }
    return mergeFragments(offsets, text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, hlStart, hlEnd, maxFragments, surround);
}
Use of org.apache.lucene.index.TermVectorOffsetInfo in the Apache Jackrabbit project.
From class WeightedHighlighter, method mergeFragments:
/**
 * Merges the given highlight offsets into an excerpt, selecting the best
 * fragments via a priority queue instead of the score/position sorting used
 * by <code>DefaultHighlighter</code>. Overlapping fragments (sharing an
 * offset-info instance) are dropped, then each remaining fragment is
 * rendered directly from <code>text</code> via
 * <code>startFragment</code>/<code>endFragment</code>.
 *
 * @param offsets       start/end offsets of the terms to highlight; may be
 *                      <code>null</code> or empty.
 * @param text          the original text the offsets refer to.
 * @param excerptStart  markup prepended to the whole excerpt.
 * @param excerptEnd    markup appended to the whole excerpt.
 * @param fragmentStart markup prepended to each fragment.
 * @param fragmentEnd   markup appended to each fragment.
 * @param hlStart       markup prepended to each highlighted term.
 * @param hlEnd         markup appended to each highlighted term.
 * @param maxFragments  maximum number of fragments in the excerpt.
 * @param surround      number of context characters around a highlight.
 * @return the excerpt string.
 * @throws IOException if an error occurs while creating the excerpt.
 */
@Override
protected String mergeFragments(TermVectorOffsetInfo[] offsets, String text, String excerptStart, String excerptEnd, String fragmentStart, String fragmentEnd, String hlStart, String hlEnd, int maxFragments, int surround) throws IOException {
    if (offsets == null || offsets.length == 0) {
        // nothing to highlight
        return createDefaultExcerpt(text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, surround * 2);
    }
    // keeps at most maxFragments candidates, overflowing the least valuable
    PriorityQueue<FragmentInfo> bestFragments = new FragmentInfoPriorityQueue(maxFragments);
    // build one candidate fragment starting at every offset; offsets whose
    // end lies beyond the text length cannot be resolved and are skipped
    for (int i = 0; i < offsets.length; i++) {
        if (offsets[i].getEndOffset() <= text.length()) {
            FragmentInfo fi = new FragmentInfo(offsets[i], surround * 2);
            // absorb following offsets while they still fit this fragment
            for (int j = i + 1; j < offsets.length; j++) {
                if (offsets[j].getEndOffset() > text.length()) {
                    break;
                }
                if (!fi.add(offsets[j], text)) {
                    break;
                }
            }
            bestFragments.insertWithOverflow(fi);
        }
    }
    if (bestFragments.size() == 0) {
        return createDefaultExcerpt(text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, surround * 2);
    }
    // retrieve fragment infos from queue and fill into list, least
    // fragment comes out first
    List<FragmentInfo> infos = new LinkedList<FragmentInfo>();
    while (bestFragments.size() > 0) {
        FragmentInfo fi = (FragmentInfo) bestFragments.pop();
        // prepend so the list ends up ordered best-first
        infos.add(0, fi);
    }
    // identity map: two fragments overlap when they contain the very same
    // TermVectorOffsetInfo instance
    Map<TermVectorOffsetInfo, Object> offsetInfos = new IdentityHashMap<TermVectorOffsetInfo, Object>();
    // remove overlapping fragment infos
    Iterator<FragmentInfo> it = infos.iterator();
    while (it.hasNext()) {
        FragmentInfo fi = it.next();
        boolean overlap = false;
        Iterator<TermVectorOffsetInfo> fit = fi.iterator();
        while (fit.hasNext() && !overlap) {
            TermVectorOffsetInfo oi = fit.next();
            if (offsetInfos.containsKey(oi)) {
                overlap = true;
            }
        }
        if (overlap) {
            it.remove();
        } else {
            // claim this fragment's offsets so later (worse) fragments
            // containing any of them are dropped
            Iterator<TermVectorOffsetInfo> oit = fi.iterator();
            while (oit.hasNext()) {
                offsetInfos.put(oit.next(), null);
            }
        }
    }
    // create excerpts
    StringBuffer sb = new StringBuffer(excerptStart);
    it = infos.iterator();
    while (it.hasNext()) {
        FragmentInfo fi = it.next();
        sb.append(fragmentStart);
        // leftmost position the leading context may start at
        int limit = Math.max(0, fi.getStartOffset() / 2 + fi.getEndOffset() / 2 - surround);
        int len = startFragment(sb, text, fi.getStartOffset(), limit);
        TermVectorOffsetInfo lastOffsetInfo = null;
        Iterator<TermVectorOffsetInfo> fIt = fi.iterator();
        while (fIt.hasNext()) {
            TermVectorOffsetInfo oi = fIt.next();
            if (lastOffsetInfo != null) {
                // fill in text between terms
                sb.append(escape(text.substring(lastOffsetInfo.getEndOffset(), oi.getStartOffset())));
            }
            sb.append(hlStart);
            sb.append(escape(text.substring(oi.getStartOffset(), oi.getEndOffset())));
            sb.append(hlEnd);
            lastOffsetInfo = oi;
        }
        // rightmost position the trailing context may extend to
        limit = Math.min(text.length(), fi.getStartOffset() - len + (surround * 2));
        endFragment(sb, text, fi.getEndOffset(), limit);
        sb.append(fragmentEnd);
    }
    sb.append(excerptEnd);
    return sb.toString();
}
Aggregations