Search in sources :

Example 1 with TermPositions

use of org.apache.lucene.index.TermPositions in project greplin-lucene-utils by Cue.

the class PhraseFilter method getDocIdSet.

/**
 * Computes the documents of {@code reader} that match this filter's phrase.
 * <p>
 * Each sub-reader returned by {@code IndexReaders.gatherSubReaders} is
 * processed on its own: the phrase terms are placed in a sorted set of
 * {@code TermWithFrequency} (presumably rarest term first - confirm against
 * that class's ordering), the first term seeds a match list from the
 * documents accepted by the {@code Intersection}, and every following term
 * narrows that list via {@code PhraseFilterMatchList.intersect}.
 * <p>
 * The per-sub-reader doc ids are then shifted by the summed {@code maxDoc()}
 * of the preceding sub-readers and returned as
 * <ul>
 *   <li>a {@code FixedBitSet} when there are more matches than
 *       {@code reader.maxDoc() / 32},</li>
 *   <li>{@code DocIdSets.EMPTY} when nothing matched,</li>
 *   <li>a {@code SortedIntArrayDocIdSet} otherwise.</li>
 * </ul>
 *
 * @param reader the reader to filter.
 * @return the set of matching document ids.
 * @throws IOException if reading from the index fails.
 */
@Override
public DocIdSet getDocIdSet(final IndexReader reader) throws IOException {
    final List<IndexReader> segments = IndexReaders.gatherSubReaders(reader);
    final int segmentCount = segments.size();
    final PhraseFilterMatchList[] perSegment = new PhraseFilterMatchList[segmentCount];
    int totalMatches = 0;

    for (int segmentIndex = 0; segmentIndex < segmentCount; segmentIndex++) {
        final IndexReader segment = segments.get(segmentIndex);

        // Record every phrase term with its frequency in this segment and
        // its index within the phrase.
        final SortedSet<TermWithFrequency> candidates = Sets.newTreeSet();
        int termIndex = 0;
        for (Term phraseTerm : this.terms) {
            candidates.add(new TermWithFrequency(phraseTerm, segment.docFreq(phraseTerm), termIndex));
            termIndex++;
        }

        PhraseFilterMatchList found = null;
        final TermPositions positions = segment.termPositions();
        try {
            for (TermWithFrequency candidate : candidates) {
                if (candidate.docFreq == 0) {
                    // A term that is absent from this segment ends the scan.
                    break;
                }
                positions.seek(candidate.term);
                if (found != null) {
                    // Later terms only narrow the matches gathered so far.
                    found.intersect(positions, candidate.offset);
                } else {
                    // First term: seed the match list with every document the
                    // intersection accepts.
                    // NOTE(review): the provider is handed the top-level reader
                    // while positions come from the sub-reader - confirm intended.
                    final Intersection accepted = this.intersectionProvider.get(reader);
                    found = new PhraseFilterMatchList(candidate.docFreq);
                    while (accepted.advanceToNextIntersection(positions)) {
                        final int occurrences = positions.freq();
                        final PhraseFilterIntList phraseStarts = new PhraseFilterIntList(occurrences);
                        for (int seen = 0; seen < occurrences; seen++) {
                            // Store the position shifted back by the term's
                            // index in the phrase.
                            phraseStarts.add(positions.nextPosition() - candidate.offset);
                        }
                        found.add(positions.doc(), phraseStarts);
                    }
                }
                if (found.getCount() == 0) {
                    break;
                }
            }
        } finally {
            positions.close();
        }

        if (found != null) {
            perSegment[segmentIndex] = found;
            totalMatches += found.getCount();
        }
    }

    // 2^5 = 32; presumably the break-even point between one bit per document
    // and one 32-bit int per match.
    final int bitsPerIntPowerLogTwo = 5;
    final int docLimit = reader.maxDoc();

    if (totalMatches > (docLimit >> bitsPerIntPowerLogTwo)) {
        // Dense result: mark each match in a bit set.
        final FixedBitSet bits = new FixedBitSet(docLimit);
        int docBase = 0;
        for (int segmentIndex = 0; segmentIndex < perSegment.length; segmentIndex++) {
            final PhraseFilterMatchList hits = perSegment[segmentIndex];
            if (hits != null) {
                final int hitCount = hits.getCount();
                final int[] localIds = hits.getDocIds();
                for (int hit = 0; hit < hitCount; hit++) {
                    bits.set(localIds[hit] + docBase);
                }
            }
            docBase += segments.get(segmentIndex).maxDoc();
        }
        return bits;
    }

    if (totalMatches == 0) {
        return DocIdSets.EMPTY;
    }

    // Sparse result: concatenate the shifted ids into one int array.
    final int[] globalIds = new int[totalMatches];
    int cursor = 0;
    int docBase = 0;
    for (int segmentIndex = 0; segmentIndex < perSegment.length; segmentIndex++) {
        final PhraseFilterMatchList hits = perSegment[segmentIndex];
        if (hits != null) {
            final int hitCount = hits.getCount();
            final int[] localIds = hits.getDocIds();
            for (int hit = 0; hit < hitCount; hit++) {
                globalIds[cursor++] = localIds[hit] + docBase;
            }
        }
        docBase += segments.get(segmentIndex).maxDoc();
    }
    return new SortedIntArrayDocIdSet(globalIds);
}
Also used : Intersection(com.greplin.lucene.util.Intersection) Term(org.apache.lucene.index.Term) FixedBitSet(org.apache.lucene.util.FixedBitSet) IndexReader(org.apache.lucene.index.IndexReader) TermPositions(org.apache.lucene.index.TermPositions)

Example 2 with TermPositions

use of org.apache.lucene.index.TermPositions in project greplin-lucene-utils by Cue.

the class FilteredMultiReader method termPositions.

/**
 * Opens a {@link TermPositions} enumeration and positions it on {@code term}.
 * <p>
 * If seeking fails, the enumeration opened here is closed before the
 * exception propagates, so a failed call does not leak it.
 *
 * @param term the term to seek to.
 * @return an enumeration positioned on {@code term}; the caller must close it.
 * @throws IOException if opening or seeking the enumeration fails.
 */
@Override
public TermPositions termPositions(final Term term) throws IOException {
    final TermPositions result = termPositions();
    boolean positioned = false;
    try {
        result.seek(term);
        positioned = true;
    } finally {
        if (!positioned) {
            // Ownership never reached the caller, so release it here.
            result.close();
        }
    }
    return result;
}
Also used : HackMultiTermPositions(org.apache.lucene.index.HackMultiTermPositions) TermPositions(org.apache.lucene.index.TermPositions)

Example 3 with TermPositions

use of org.apache.lucene.index.TermPositions in project zm-mailbox by Zimbra.

the class LuceneViewer method dumpTerms.

/**
 * Writes every term of the index that passes {@code wantThisTerm}, followed
 * by one line per document containing the term with its frequency and
 * positions.
 * <p>
 * When filters are active ({@code hasFilters()}), the document numbers seen
 * for each term are collected and handed to {@code computeDocsIntersection}.
 * Both the term enumeration and each per-term positions enumeration are
 * closed even when reading or writing fails.
 *
 * @throws IOException if reading from the index fails.
 */
private void dumpTerms() throws IOException {
    outputBanner("Terms (in Term.compareTo() order)");
    TermEnum terms = mIndexReader.terms();
    try {
        int order = 0;
        while (terms.next()) {
            order++;
            Term term = terms.term();
            String field = term.field();
            String text = term.text();
            if (!wantThisTerm(field, text)) {
                continue;
            }
            outputLn(order + " " + field + ": " + text);
            /*
             * for each term, print the
             * <document, frequency, <position>* > tuples for a term.
             *
             * document:  document in which the Term appears
             * frequency: number of times the Term appears in the document
             * position:  position for each appearance in the document
             *
             * e.g. doc.add(new Field("field", "one two three two four five", Field.Store.YES, Field.Index.ANALYZED));
             *      then the tuple for Term("field", "two") in this document would be like:
             *      88, 2, <2, 4>
             *      where
             *      88 is the document number
             *      2  is the number of times this term appears in the document
             *      <2, 4> are the positions for each appearance in the document
             */
            // by TermPositions
            outputLn("    document, frequency, <position>*");
            // keep track of docs that appear in all terms that are filtered in.
            Set<Integer> docNums = null;
            if (hasFilters()) {
                docNums = new HashSet<Integer>();
            }
            TermPositions termPos = mIndexReader.termPositions(term);
            try {
                while (termPos.next()) {
                    int docNum = termPos.doc();
                    int freq = termPos.freq();
                    if (docNums != null) {
                        docNums.add(docNum);
                    }
                    output("    " + docNum + ", " + freq + ", <");
                    for (int f = 0; f < freq; f++) {
                        int positionInDoc = termPos.nextPosition();
                        // Separate positions with a single space, none before the first.
                        if (f > 0) {
                            output(" ");
                        }
                        output(positionInDoc + "");
                    }
                    outputLn(">");
                }
            } finally {
                // Release the per-term enumeration even if dumping it failed.
                termPos.close();
            }
            if (docNums != null) {
                computeDocsIntersection(docNums);
            }
            outputLn();
            // Progress message every 1000 terms (counting skipped ones too).
            if (order % 1000 == 0) {
                mConsole.debug("Dumped " + order + " terms");
            }
        }
    } finally {
        terms.close();
    }
}
Also used : Term(org.apache.lucene.index.Term) TermEnum(org.apache.lucene.index.TermEnum) TermPositions(org.apache.lucene.index.TermPositions)

Example 4 with TermPositions

use of org.apache.lucene.index.TermPositions in project jackrabbit by apache.

the class SharedFieldCache method getValueIndex.

/**
 * Creates a <code>ValueIndex</code> for a <code>field</code> and a term
 * <code>prefix</code>. The term prefix acts as the property name for the
 * shared <code>field</code>.
 * <p>
 * This method is an adapted version of: <code>FieldCacheImpl.getStringIndex()</code>
 * <p>
 * A previously stored index for the same reader, field and prefix is
 * returned as is; otherwise a new one is built, stored and returned. The
 * term and document enumerations opened while building are always closed,
 * including when opening or reading one of them fails.
 *
 * @param reader     the <code>IndexReader</code>.
 * @param field      name of the shared field.
 * @param prefix     the property name, will be used as term prefix.
 * @return a ValueIndex that contains the field values and order
 *         information.
 * @throws IOException if an error occurs while reading from the index.
 */
public ValueIndex getValueIndex(IndexReader reader, String field, String prefix) throws IOException {
    if (reader instanceof ReadOnlyIndexReader) {
        reader = ((ReadOnlyIndexReader) reader).getBase();
    }
    field = field.intern();
    ValueIndex ret = lookup(reader, field, prefix);
    if (ret != null) {
        return ret;
    }
    final int maxDocs = reader.maxDoc();
    Comparable<?>[] retArray = new Comparable<?>[maxDocs];
    // doc number -> term position of the single value stored so far; only
    // maintained while every document has at most one value.
    Map<Integer, Integer> positions = new HashMap<Integer, Integer>();
    boolean usingSimpleComparable = true;
    int setValues = 0;
    if (maxDocs > 0) {
        IndexFormatVersion version = IndexFormatVersion.getVersion(reader);
        boolean hasPayloads = version.isAtLeast(IndexFormatVersion.V3);
        TermDocs termDocs;
        byte[] payload = null;
        if (hasPayloads) {
            termDocs = reader.termPositions();
            payload = new byte[1];
        } else {
            termDocs = reader.termDocs();
        }
        try {
            // Opened inside the outer try so termDocs is released if this fails.
            TermEnum termEnum = reader.terms(new Term(field, prefix));
            try {
                if (termEnum.term() == null) {
                    throw new RuntimeException("no terms in field " + field);
                }
                do {
                    Term term = termEnum.term();
                    // Identity comparison on purpose: field was interned above
                    // (assumes Term.field() returns interned strings - confirm).
                    if (term.field() != field || !term.text().startsWith(prefix)) {
                        break;
                    }
                    final String value = termValueAsString(term, prefix);
                    termDocs.seek(term);
                    while (termDocs.next()) {
                        int termPosition = 0;
                        int type = PropertyType.UNDEFINED;
                        if (hasPayloads) {
                            TermPositions termPos = (TermPositions) termDocs;
                            termPosition = termPos.nextPosition();
                            if (termPos.isPayloadAvailable()) {
                                payload = termPos.getPayload(payload, 0);
                                type = PropertyMetaData.fromByteArray(payload).getPropertyType();
                            }
                        }
                        setValues++;
                        Comparable<?> v = getValue(value, type);
                        int doc = termDocs.doc();
                        Comparable<?> ca = retArray[doc];
                        if (ca == null) {
                            if (usingSimpleComparable) {
                                // put simple value on the queue
                                positions.put(doc, termPosition);
                                retArray[doc] = v;
                            } else {
                                retArray[doc] = new ComparableArray(v, termPosition);
                            }
                        } else {
                            if (ca instanceof ComparableArray) {
                                ((ComparableArray) ca).insert(v, termPosition);
                            } else {
                                // First document with a second value: wrap every
                                // simple value stored so far into a ComparableArray,
                                // using its recorded term position.
                                for (Map.Entry<Integer, Integer> entry : positions.entrySet()) {
                                    int pos = entry.getKey();
                                    retArray[pos] = new ComparableArray(retArray[pos], entry.getValue());
                                }
                                positions = null;
                                usingSimpleComparable = false;
                                ComparableArray caNew = (ComparableArray) retArray[doc];
                                retArray[doc] = caNew.insert(v, termPosition);
                            }
                        }
                    }
                } while (termEnum.next());
            } finally {
                termEnum.close();
            }
        } finally {
            // Runs even if closing termEnum threw.
            termDocs.close();
        }
    }
    ValueIndex value = new ValueIndex(retArray, setValues);
    store(reader, field, prefix, value);
    return value;
}
Also used : HashMap(java.util.HashMap) WeakHashMap(java.util.WeakHashMap) TermDocs(org.apache.lucene.index.TermDocs) Term(org.apache.lucene.index.Term) TermEnum(org.apache.lucene.index.TermEnum) TermPositions(org.apache.lucene.index.TermPositions)

Aggregations

TermPositions (org.apache.lucene.index.TermPositions)4 Term (org.apache.lucene.index.Term)3 TermEnum (org.apache.lucene.index.TermEnum)2 Intersection (com.greplin.lucene.util.Intersection)1 HashMap (java.util.HashMap)1 WeakHashMap (java.util.WeakHashMap)1 HackMultiTermPositions (org.apache.lucene.index.HackMultiTermPositions)1 IndexReader (org.apache.lucene.index.IndexReader)1 TermDocs (org.apache.lucene.index.TermDocs)1 FixedBitSet (org.apache.lucene.util.FixedBitSet)1