Search in sources :

Example 1 with TermPositionVector

use of org.apache.lucene.index.TermPositionVector in project jackrabbit by apache.

the class AbstractExcerpt method createTermPositionVector.

/**
 * Tokenizes the given text with the text analyzer of the index and exposes
 * the collected term offsets as an in-memory <code>TermPositionVector</code>.
 * <p>
 * Only character offsets are recorded per term; term positions are not
 * collected, hence <code>getTermPositions(int)</code> of the returned vector
 * always yields <code>null</code>. The vector reports the empty string as
 * its field name, which is also the field name handed to the analyzer.
 *
 * @param text the text.
 * @return a <code>TermPositionVector</code> for the given text.
 */
private TermPositionVector createTermPositionVector(String text) {
    // term -> TermVectorOffsetInfo[]
    // a sorted map is used so that its key order can back the binary
    // search in indexOf() of the returned vector
    final SortedMap<String, TermVectorOffsetInfo[]> termMap = new TreeMap<String, TermVectorOffsetInfo[]>();
    Reader r = new StringReader(text);
    TokenStream ts = index.getTextAnalyzer().tokenStream("", r);
    try {
        while (ts.incrementToken()) {
            OffsetAttribute offset = ts.getAttribute(OffsetAttribute.class);
            TermAttribute term = ts.getAttribute(TermAttribute.class);
            String termText = term.term();
            TermVectorOffsetInfo[] info = termMap.get(termText);
            if (info == null) {
                // first occurrence of this term
                info = new TermVectorOffsetInfo[1];
            } else {
                // grow by one slot, keeping the offsets seen so far
                TermVectorOffsetInfo[] tmp = info;
                info = new TermVectorOffsetInfo[tmp.length + 1];
                System.arraycopy(tmp, 0, info, 0, tmp.length);
            }
            info[info.length - 1] = new TermVectorOffsetInfo(offset.startOffset(), offset.endOffset());
            termMap.put(termText, info);
        }
        ts.end();
    } catch (IOException e) {
        // should never happen, we are reading from a string
    } finally {
        // always release the token stream, even if tokenizing failed
        try {
            ts.close();
        } catch (IOException e) {
            // ignore, nothing sensible can be done if closing fails
        }
    }
    return new TermPositionVector() {

        // distinct terms in sorted order; the array index is the term index
        private final String[] terms = termMap.keySet().toArray(new String[termMap.size()]);

        public int[] getTermPositions(int index) {
            // positions are not collected, only offsets
            return null;
        }

        public TermVectorOffsetInfo[] getOffsets(int index) {
            TermVectorOffsetInfo[] info = TermVectorOffsetInfo.EMPTY_OFFSET_INFO;
            if (index >= 0 && index < terms.length) {
                info = termMap.get(terms[index]);
            }
            return info;
        }

        public String getField() {
            return "";
        }

        public int size() {
            return terms.length;
        }

        public String[] getTerms() {
            return terms;
        }

        public int[] getTermFrequencies() {
            // the frequency of a term is the number of offsets recorded for it
            int[] freqs = new int[terms.length];
            for (int i = 0; i < terms.length; i++) {
                freqs[i] = termMap.get(terms[i]).length;
            }
            return freqs;
        }

        public int indexOf(String term) {
            // terms is sorted (taken from the sorted map), so binary search
            // applies; any "not found" insertion point is mapped to -1
            int res = Arrays.binarySearch(terms, term);
            return res >= 0 ? res : -1;
        }

        public int[] indexesOf(String[] searchTerms, int start, int len) {
            // look up len terms beginning at searchTerms[start]
            int[] res = new int[len];
            for (int i = 0; i < len; i++) {
                res[i] = indexOf(searchTerms[start + i]);
            }
            return res;
        }
    };
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) Reader(java.io.Reader) StringReader(java.io.StringReader) IndexReader(org.apache.lucene.index.IndexReader) IOException(java.io.IOException) TreeMap(java.util.TreeMap) TermVectorOffsetInfo(org.apache.lucene.index.TermVectorOffsetInfo) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) TermAttribute(org.apache.lucene.analysis.tokenattributes.TermAttribute) TermPositionVector(org.apache.lucene.index.TermPositionVector)

Example 2 with TermPositionVector

use of org.apache.lucene.index.TermPositionVector in project jackrabbit by apache.

the class AbstractExcerpt method getExcerpt.

/**
 * {@inheritDoc}
 * <p>
 * Looks up the index document for the node by its UUID term. Returns
 * <code>null</code> when the node is not found in the index or when the
 * fulltext field has no <code>TermPositionVector</code>. When the document
 * has no fulltext fieldables at all, the call is delegated to a freshly
 * initialized <code>SimpleExcerptProvider</code>.
 */
public String getExcerpt(NodeId id, int maxFragments, int maxFragmentSize) throws IOException {
    final IndexReader reader = index.getIndexReader();
    try {
        checkRewritten(reader);

        // resolve the lucene document that belongs to the node
        final TermDocs tDocs = reader.termDocs(TermFactory.createUUIDTerm(id.toString()));
        int docNumber;
        Document doc;
        try {
            if (!tDocs.next()) {
                // node not found in index
                return null;
            }
            docNumber = tDocs.doc();
            doc = reader.document(docNumber);
        } finally {
            tDocs.close();
        }

        final Fieldable[] fields = doc.getFieldables(FieldNames.FULLTEXT);
        if (fields.length == 0) {
            log.debug("Fulltext field not stored, using {}", SimpleExcerptProvider.class.getName());
            SimpleExcerptProvider fallback = new SimpleExcerptProvider();
            fallback.init(query, index);
            return fallback.getExcerpt(id, maxFragments, maxFragmentSize);
        }

        // join all non-empty fulltext values, separated by a single space
        StringBuilder text = new StringBuilder();
        String separator = "";
        for (Fieldable field : fields) {
            String value = field.stringValue();
            if (value.length() != 0) {
                text.append(separator).append(value);
                separator = " ";
            }
        }

        TermFreqVector tfv = reader.getTermFreqVector(docNumber, FieldNames.FULLTEXT);
        if (!(tfv instanceof TermPositionVector)) {
            log.debug("No TermPositionVector on Fulltext field.");
            return null;
        }
        return createExcerpt((TermPositionVector) tfv, text.toString(), maxFragments, maxFragmentSize);
    } finally {
        Util.closeOrRelease(reader);
    }
}
Also used : TermFreqVector(org.apache.lucene.index.TermFreqVector) Fieldable(org.apache.lucene.document.Fieldable) TermDocs(org.apache.lucene.index.TermDocs) IndexReader(org.apache.lucene.index.IndexReader) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) TermPositionVector(org.apache.lucene.index.TermPositionVector)

Aggregations

IndexReader (org.apache.lucene.index.IndexReader)2 TermPositionVector (org.apache.lucene.index.TermPositionVector)2 IOException (java.io.IOException)1 Reader (java.io.Reader)1 StringReader (java.io.StringReader)1 TreeMap (java.util.TreeMap)1 TokenStream (org.apache.lucene.analysis.TokenStream)1 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)1 TermAttribute (org.apache.lucene.analysis.tokenattributes.TermAttribute)1 Document (org.apache.lucene.document.Document)1 Fieldable (org.apache.lucene.document.Fieldable)1 Term (org.apache.lucene.index.Term)1 TermDocs (org.apache.lucene.index.TermDocs)1 TermFreqVector (org.apache.lucene.index.TermFreqVector)1 TermVectorOffsetInfo (org.apache.lucene.index.TermVectorOffsetInfo)1