Example 1 with TermVectorOffsetInfo

Use of org.apache.lucene.index.TermVectorOffsetInfo in project jackrabbit by apache.

Class AbstractExcerpt, method createTermPositionVector:

/**
     * @param text the text.
     * @return a <code>TermPositionVector</code> for the given text.
     */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            TermAttribute term = ts.getAttribute(TermAttribute.class);
            String termText = term.term();
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                info = new TermVectorOffsetInfo[1];
            } else {
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
        ts.close();
    } catch (IOException e) {
    // should never happen, we are reading from a string
    }
    return new TermPositionVector() {

        private String[] terms = (String[]) termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
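            // positions are never computed here; the highlighter only
            // needs the character offsets exposed via getOffsets below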
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] terms, int start, int len) {
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(terms[i]);
            }
            return res;
        }
    };
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) Reader(java.io.Reader) StringReader(java.io.StringReader) IndexReader(org.apache.lucene.index.IndexReader) IOException(java.io.IOException) TreeMap(java.util.TreeMap) TermVectorOffsetInfo(org.apache.lucene.index.TermVectorOffsetInfo) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) TermAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute) TermPositionVector(org.apache.lucene.index.TermPositionVector)
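The vector built above keeps its terms in sorted order and records only character offsets; getTermPositions deliberately returns null. A minimal sketch of how such a vector might be queried, using only the methods shown above; the class and method names (TermVectorDump, dumpOffsets) are illustrative and not part of jackrabbit:

import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;

public class TermVectorDump {

    /**
     * Prints the character ranges at which each query term occurs,
     * using only the API the anonymous TermPositionVector above
     * actually implements (indexesOf and getOffsets).
     */
    static void dumpOffsets(TermPositionVector tpv, String[] queryTerms) {
        int[] indexes = tpv.indexesOf(queryTerms, 0, queryTerms.length);
        for (int i = 0; i < indexes.length; i++) {
            if (indexes[i] < 0) {
                // term not present in the analyzed text
                continue;
            }
            for (TermVectorOffsetInfo oi : tpv.getOffsets(indexes[i])) {
                System.out.println(queryTerms[i] + " -> ["
                        + oi.getStartOffset() + ", " + oi.getEndOffset() + ")");
            }
        }
    }
}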

Example 2 with TermVectorOffsetInfo

Use of org.apache.lucene.index.TermVectorOffsetInfo in project jackrabbit by apache.

Class DefaultHighlighter, method mergeFragments:

protected String mergeFragments(TermVectorOffsetInfo[] offsets, String text, String excerptStart, String excerptEnd, String fragmentStart, String fragmentEnd, String hlStart, String hlEnd, int maxFragments, int surround) throws IOException {
    if (offsets == null || offsets.length == 0) {
        // nothing to highlight
        return createDefaultExcerpt(text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, surround * 2);
    }
    // Math.min(10, offsets.length); // 10 terms is plenty?
    int lastOffset = offsets.length;
    List<FragmentInfo> fragmentInfoList = new ArrayList<FragmentInfo>();
    if (offsets[0].getEndOffset() <= text.length()) {
        FragmentInfo fi = new FragmentInfo(offsets[0], surround * 2);
        for (int i = 1; i < lastOffset; i++) {
            if (offsets[i].getEndOffset() > text.length()) {
                break;
            }
            if (fi.add(offsets[i])) {
                continue;
            }
            fragmentInfoList.add(fi);
            fi = new FragmentInfo(offsets[i], surround * 2);
        }
        fragmentInfoList.add(fi);
    }
    if (fragmentInfoList.isEmpty()) {
        // nothing to highlight
        return createDefaultExcerpt(text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, surround * 2);
    }
    // sort with score
    Collections.sort(fragmentInfoList, new FragmentInfoScoreSorter());
    // extract best fragments
    List<FragmentInfo> bestFragmentsList = new ArrayList<FragmentInfo>();
    for (int i = 0; i < Math.min(fragmentInfoList.size(), maxFragments); i++) {
        bestFragmentsList.add(fragmentInfoList.get(i));
    }
    // re-sort with positions
    Collections.sort(bestFragmentsList, new FragmentInfoPositionSorter());
    // merge #maxFragments fragments
    StringReader reader = new StringReader(text);
    StringBuffer sb = new StringBuffer(excerptStart);
    int pos = 0;
    char[] cbuf;
    int skip;
    int nextStart;
    int skippedChars;
    int firstWhitespace;
    for (int i = 0; i < bestFragmentsList.size(); i++) {
        FragmentInfo fi = bestFragmentsList.get(i);
        fi.trim();
        nextStart = fi.getStartOffset();
        skip = nextStart - pos;
        if (skip > surround * 2) {
            skip -= surround;
            if (i > 0) {
                // end last fragment
                cbuf = new char[surround];
                reader.read(cbuf, 0, surround);
                // find last whitespace
                skippedChars = 1;
                for (; skippedChars < surround + 1; skippedChars++) {
                    if (Character.isWhitespace(cbuf[surround - skippedChars])) {
                        break;
                    }
                }
                pos += surround;
                if (skippedChars > surround) {
                    skippedChars = surround;
                }
                sb.append(escape(new String(cbuf, 0, surround - skippedChars)));
                sb.append(fragmentEnd);
            }
        }
        if (skip >= surround) {
            if (i > 0) {
                skip -= surround;
            }
            // skip
            reader.skip((long) skip);
            pos += skip;
        }
        // start fragment
        cbuf = new char[nextStart - pos];
        skippedChars = Math.max(cbuf.length - 1, 0);
        firstWhitespace = skippedChars;
        reader.read(cbuf, 0, nextStart - pos);
        pos += (nextStart - pos);
        sb.append(fragmentStart);
        // find last period followed by whitespace
        if (cbuf.length > 0) {
            for (; skippedChars >= 0; skippedChars--) {
                if (Character.isWhitespace(cbuf[skippedChars])) {
                    firstWhitespace = skippedChars;
                    if (skippedChars - 1 >= 0 && cbuf[skippedChars - 1] == '.') {
                        skippedChars++;
                        break;
                    }
                }
            }
        }
        boolean sentenceStart = true;
        if (skippedChars == -1) {
            if (pos == cbuf.length) {
                // this fragment is the start of the text -> skip none
                skippedChars = 0;
            } else {
                sentenceStart = false;
                skippedChars = firstWhitespace + 1;
            }
        }
        if (!sentenceStart) {
            sb.append("... ");
        }
        sb.append(escape(new String(cbuf, skippedChars, cbuf.length - skippedChars)));
        // iterate terms
        for (Iterator<TermVectorOffsetInfo> iter = fi.iterator(); iter.hasNext(); ) {
            TermVectorOffsetInfo ti = iter.next();
            nextStart = ti.getStartOffset();
            if (nextStart - pos > 0) {
                cbuf = new char[nextStart - pos];
                int charsRead = reader.read(cbuf, 0, nextStart - pos);
                pos += (nextStart - pos);
                sb.append(escape(new String(cbuf, 0, charsRead)));
            }
            sb.append(hlStart);
            nextStart = ti.getEndOffset();
            // print term
            cbuf = new char[nextStart - pos];
            reader.read(cbuf, 0, nextStart - pos);
            pos += (nextStart - pos);
            sb.append(escape(new String(cbuf)));
            sb.append(hlEnd);
        }
    }
    if (pos != 0) {
        // end fragment
        if (offsets.length > lastOffset) {
            surround = Math.min(offsets[lastOffset].getStartOffset() - pos, surround);
        }
        cbuf = new char[surround];
        skip = reader.read(cbuf, 0, surround);
        boolean EOF = reader.read() == -1;
        if (skip >= 0) {
            if (!EOF) {
                skippedChars = 1;
                for (; skippedChars < surround + 1; skippedChars++) {
                    if (Character.isWhitespace(cbuf[surround - skippedChars])) {
                        break;
                    }
                }
                if (skippedChars > surround) {
                    skippedChars = surround;
                }
            } else {
                skippedChars = 0;
            }
            sb.append(escape(new String(cbuf, 0, EOF ? skip : (surround - skippedChars))));
            if (!EOF) {
                char lastChar = sb.charAt(sb.length() - 1);
                if (lastChar != '.' && lastChar != '!' && lastChar != '?') {
                    sb.append(" ...");
                }
            }
        }
        sb.append(fragmentEnd);
    }
    sb.append(excerptEnd);
    return sb.toString();
}
Also used : ArrayList(java.util.ArrayList) TermVectorOffsetInfo(org.apache.lucene.index.TermVectorOffsetInfo) StringReader(java.io.StringReader)
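mergeFragments assumes the offsets it receives are ordered by start offset (doHighlight in Example 3 sorts them with a TermVectorOffsetInfoSorter before the call) and skips any offset that ends beyond the text. A small sketch of that preparation step, assuming a simple comparator in place of TermVectorOffsetInfoSorter, whose source is not shown here:

import java.util.Arrays;
import java.util.Comparator;

import org.apache.lucene.index.TermVectorOffsetInfo;

public class OffsetOrdering {

    // orders match offsets by start offset, breaking ties on the end
    // offset; mergeFragments walks the text strictly left to right,
    // so it depends on this ordering
    static final Comparator<TermVectorOffsetInfo> BY_START =
            new Comparator<TermVectorOffsetInfo>() {
                public int compare(TermVectorOffsetInfo a, TermVectorOffsetInfo b) {
                    int cmp = a.getStartOffset() - b.getStartOffset();
                    return cmp != 0 ? cmp : a.getEndOffset() - b.getEndOffset();
                }
            };

    static TermVectorOffsetInfo[] sortedCopy(TermVectorOffsetInfo[] offsets) {
        TermVectorOffsetInfo[] copy = offsets.clone();
        Arrays.sort(copy, BY_START);
        return copy;
    }
}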

Example 3 with TermVectorOffsetInfo

Use of org.apache.lucene.index.TermVectorOffsetInfo in project jackrabbit by apache.

Class DefaultHighlighter, method doHighlight:

/**
     * @see #highlight(TermPositionVector, Set, String, String, String, String, String, String, String, int, int)
     */
protected String doHighlight(TermPositionVector tvec, Set<Term[]> queryTerms, String text, String excerptStart, String excerptEnd, String fragmentStart, String fragmentEnd, String hlStart, String hlEnd, int maxFragments, int surround) throws IOException {
    List<TermVectorOffsetInfo> termOffsetInfo = new ArrayList<TermVectorOffsetInfo>();
    Iterator<Term[]> it = queryTerms.iterator();
    while (it.hasNext()) {
        Term[] qt = it.next();
        if (qt == null) {
            continue;
        }
        final int qtLen = qt.length;
        if (qtLen == 0) {
            continue;
        }
        String[] qtText = new String[qtLen];
        for (int i = 0; i < qtLen; i++) {
            qtText[i] = qt[i].text();
        }
        int[] tvecindexes = tvec.indexesOf(qtText, 0, qtText.length);
        Map<Integer, TermVectorOffsetInfo[]> localTermOffsetInfo = new HashMap<Integer, TermVectorOffsetInfo[]>();
        for (int tvecindex : tvecindexes) {
            TermVectorOffsetInfo[] termoffsets = tvec.getOffsets(tvecindex);
            if (termoffsets == null || termoffsets.length == 0) {
                continue;
            }
            localTermOffsetInfo.put(tvecindex, termoffsets);
        }
        // proceed only if the first query term was actually found in the term vector
        if (tvecindexes.length > 0 && tvecindexes[0] >= 0) {
            // we have to build one interval TermVectorOffsetInfo for each hit
            List<TermVectorOffsetInfo> intervalTermOffsetInfo = new ArrayList<TermVectorOffsetInfo>();
            // pick all the first key's hits as interval starts
            TermVectorOffsetInfo[] firstKeyTermOffsets = localTermOffsetInfo.get(tvecindexes[0]);
            Arrays.sort(firstKeyTermOffsets, new TermVectorOffsetInfoSorter());
            intervalTermOffsetInfo.addAll(Arrays.asList(firstKeyTermOffsets));
            // try to extend each interval with the offsets of the next
            // query term; intervals that cannot be extended are
            // dropped from the list
            for (int i = 1; i < tvecindexes.length; i++) {
                final Integer key = tvecindexes[i];
                TermVectorOffsetInfo[] termoffsets = localTermOffsetInfo.get(key);
                if (termoffsets == null) {
                    continue;
                }
                Arrays.sort(termoffsets, new TermVectorOffsetInfoSorter());
                Iterator<TermVectorOffsetInfo> intervalIterator = intervalTermOffsetInfo.iterator();
                int index = 0;
                while (intervalIterator.hasNext()) {
                    TermVectorOffsetInfo intervalOI = intervalIterator.next();
                    if (index >= termoffsets.length) {
                        intervalIterator.remove();
                        continue;
                    }
                    boolean matchSearch = true;
                    boolean matchFound = false;
                    while (matchSearch) {
                        TermVectorOffsetInfo localOI = termoffsets[index];
                        // check interval match
                        // CJK languages will have the tokens from the PhraseQuery glued together (see LUCENE-2458)
                        int diff = localOI.getStartOffset() - intervalOI.getEndOffset();
                        // a diff of 0 (glued tokens) is accepted in addition
                        // to 1 after upgrading to lucene 3.1
                        if (diff == 1 || diff == 0) {
                            intervalOI.setEndOffset(localOI.getEndOffset());
                            matchSearch = false;
                            matchFound = true;
                        }
                        index++;
                        if (index >= termoffsets.length) {
                            matchSearch = false;
                        }
                    }
                    if (!matchFound) {
                        index--;
                        intervalIterator.remove();
                    }
                }
            }
            termOffsetInfo.addAll(intervalTermOffsetInfo);
        }
    }
    TermVectorOffsetInfo[] offsets = termOffsetInfo.toArray(new TermVectorOffsetInfo[termOffsetInfo.size()]);
    // sort offsets
    if (offsets != null && offsets.length > 1) {
        Arrays.sort(offsets, new TermVectorOffsetInfoSorter());
    }
    return mergeFragments(offsets, text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, hlStart, hlEnd, maxFragments, surround);
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Term(org.apache.lucene.index.Term) TermVectorOffsetInfo(org.apache.lucene.index.TermVectorOffsetInfo)
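The loop above merges offsets of consecutive phrase terms into single highlight intervals: an interval survives only if the next term starts at most one character after it ends (a diff of 0 covers CJK tokens glued together, see LUCENE-2458). A simplified, self-contained sketch of that idea; it uses plain lists and a nested scan instead of the sorted arrays and FragmentInfo bookkeeping of the real code, and the class name is illustrative:

import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.index.TermVectorOffsetInfo;

public class PhraseIntervalSketch {

    /**
     * Extends each interval with an offset of the next phrase term that
     * starts directly after it (diff of 0 or 1); intervals that cannot
     * be extended are dropped, mirroring doHighlight above.
     */
    static List<TermVectorOffsetInfo> extend(List<TermVectorOffsetInfo> intervals,
                                             TermVectorOffsetInfo[] nextTermOffsets) {
        List<TermVectorOffsetInfo> extended = new ArrayList<TermVectorOffsetInfo>();
        for (TermVectorOffsetInfo interval : intervals) {
            for (TermVectorOffsetInfo next : nextTermOffsets) {
                int diff = next.getStartOffset() - interval.getEndOffset();
                if (diff == 0 || diff == 1) {
                    interval.setEndOffset(next.getEndOffset());
                    extended.add(interval);
                    break;
                }
            }
        }
        return extended;
    }
}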

Example 4 with TermVectorOffsetInfo

Use of org.apache.lucene.index.TermVectorOffsetInfo in project jackrabbit by apache.

Class WeightedHighlighter, method mergeFragments:

@Override
protected String mergeFragments(TermVectorOffsetInfo[] offsets, String text, String excerptStart, String excerptEnd, String fragmentStart, String fragmentEnd, String hlStart, String hlEnd, int maxFragments, int surround) throws IOException {
    if (offsets == null || offsets.length == 0) {
        // nothing to highlight
        return createDefaultExcerpt(text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, surround * 2);
    }
    PriorityQueue<FragmentInfo> bestFragments = new FragmentInfoPriorityQueue(maxFragments);
    for (int i = 0; i < offsets.length; i++) {
        if (offsets[i].getEndOffset() <= text.length()) {
            FragmentInfo fi = new FragmentInfo(offsets[i], surround * 2);
            for (int j = i + 1; j < offsets.length; j++) {
                if (offsets[j].getEndOffset() > text.length()) {
                    break;
                }
                if (!fi.add(offsets[j], text)) {
                    break;
                }
            }
            bestFragments.insertWithOverflow(fi);
        }
    }
    if (bestFragments.size() == 0) {
        return createDefaultExcerpt(text, excerptStart, excerptEnd, fragmentStart, fragmentEnd, surround * 2);
    }
    // retrieve fragment infos from queue and fill into list, least
    // fragment comes out first
    List<FragmentInfo> infos = new LinkedList<FragmentInfo>();
    while (bestFragments.size() > 0) {
        FragmentInfo fi = (FragmentInfo) bestFragments.pop();
        infos.add(0, fi);
    }
    Map<TermVectorOffsetInfo, Object> offsetInfos = new IdentityHashMap<TermVectorOffsetInfo, Object>();
    // remove overlapping fragment infos
    Iterator<FragmentInfo> it = infos.iterator();
    while (it.hasNext()) {
        FragmentInfo fi = it.next();
        boolean overlap = false;
        Iterator<TermVectorOffsetInfo> fit = fi.iterator();
        while (fit.hasNext() && !overlap) {
            TermVectorOffsetInfo oi = fit.next();
            if (offsetInfos.containsKey(oi)) {
                overlap = true;
            }
        }
        if (overlap) {
            it.remove();
        } else {
            Iterator<TermVectorOffsetInfo> oit = fi.iterator();
            while (oit.hasNext()) {
                offsetInfos.put(oit.next(), null);
            }
        }
    }
    // create excerpts
    StringBuffer sb = new StringBuffer(excerptStart);
    it = infos.iterator();
    while (it.hasNext()) {
        FragmentInfo fi = it.next();
        sb.append(fragmentStart);
        int limit = Math.max(0, fi.getStartOffset() / 2 + fi.getEndOffset() / 2 - surround);
        int len = startFragment(sb, text, fi.getStartOffset(), limit);
        TermVectorOffsetInfo lastOffsetInfo = null;
        Iterator<TermVectorOffsetInfo> fIt = fi.iterator();
        while (fIt.hasNext()) {
            TermVectorOffsetInfo oi = fIt.next();
            if (lastOffsetInfo != null) {
                // fill in text between terms
                sb.append(escape(text.substring(lastOffsetInfo.getEndOffset(), oi.getStartOffset())));
            }
            sb.append(hlStart);
            sb.append(escape(text.substring(oi.getStartOffset(), oi.getEndOffset())));
            sb.append(hlEnd);
            lastOffsetInfo = oi;
        }
        limit = Math.min(text.length(), fi.getStartOffset() - len + (surround * 2));
        endFragment(sb, text, fi.getEndOffset(), limit);
        sb.append(fragmentEnd);
    }
    sb.append(excerptEnd);
    return sb.toString();
}
Also used : IdentityHashMap(java.util.IdentityHashMap) LinkedList(java.util.LinkedList) TermVectorOffsetInfo(org.apache.lucene.index.TermVectorOffsetInfo)
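Before building the excerpt, the method above discards any fragment that shares a TermVectorOffsetInfo instance with a fragment kept earlier in the list, using an IdentityHashMap as an identity-based set. A compact sketch of that filtering pattern; since the FragmentInfo internals are not shown here, the sketch represents each fragment as a plain list of offsets, and the class name is illustrative:

import java.util.ArrayList;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.index.TermVectorOffsetInfo;

public class OverlapFilterSketch {

    /**
     * Keeps a fragment only if none of its offsets was already claimed
     * by a fragment kept earlier; offsets are compared by identity,
     * like in WeightedHighlighter above.
     */
    static List<List<TermVectorOffsetInfo>> dropOverlaps(List<List<TermVectorOffsetInfo>> fragments) {
        Map<TermVectorOffsetInfo, Object> seen =
                new IdentityHashMap<TermVectorOffsetInfo, Object>();
        List<List<TermVectorOffsetInfo>> kept = new ArrayList<List<TermVectorOffsetInfo>>();
        for (List<TermVectorOffsetInfo> fragment : fragments) {
            boolean overlap = false;
            for (TermVectorOffsetInfo oi : fragment) {
                if (seen.containsKey(oi)) {
                    overlap = true;
                    break;
                }
            }
            if (!overlap) {
                for (TermVectorOffsetInfo oi : fragment) {
                    seen.put(oi, null);
                }
                kept.add(fragment);
            }
        }
        return kept;
    }
}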

Aggregations

TermVectorOffsetInfo (org.apache.lucene.index.TermVectorOffsetInfo) 4
StringReader (java.io.StringReader) 2
ArrayList (java.util.ArrayList) 2
IOException (java.io.IOException) 1
Reader (java.io.Reader) 1
HashMap (java.util.HashMap) 1
IdentityHashMap (java.util.IdentityHashMap) 1
LinkedList (java.util.LinkedList) 1
TreeMap (java.util.TreeMap) 1
TokenStream (org.apache.lucene.analysis.TokenStream) 1
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute) 1
TermAttribute (org.apache.lucene.analysis.tokenattributes.TermAttribute) 1
IndexReader (org.apache.lucene.index.IndexReader) 1
Term (org.apache.lucene.index.Term) 1
TermPositionVector (org.apache.lucene.index.TermPositionVector) 1