Search in sources :

Example 61 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.

In class OrdinalsBuilder, the method buildFromTerms:

/**
     * Iterates every term of the supplied {@link TermsEnum}, assigning each
     * term a fresh ordinal and associating that ordinal with all of the term's
     * documents. The caller must fully exhaust the returned
     * {@link BytesRefIterator}; the first value it yields corresponds to
     * ordinal <tt>1</tt>, the second to <tt>2</tt>, and so on.
     * <p>
     * If the {@link TermsEnum} holds prefix-coded numeric values, wrap it with
     * {@link #wrapNumeric32Bit(TermsEnum)} or
     * {@link #wrapNumeric64Bit(TermsEnum)} as appropriate for its precision;
     * otherwise the returned {@link BytesRefIterator} will also yield
     * partial-precision terms instead of only full-precision ones.
     * </p>
     */
public BytesRefIterator buildFromTerms(final TermsEnum termsEnum) throws IOException {
    return new BytesRefIterator() {

        // Reused across terms so the codec can recycle the postings enum.
        private PostingsEnum postings = null;

        @Override
        public BytesRef next() throws IOException {
            final BytesRef term = termsEnum.next();
            if (term == null) {
                return null;
            }
            // Freqs/positions are not needed; NONE requests the cheapest view.
            postings = termsEnum.postings(postings, PostingsEnum.NONE);
            nextOrdinal();
            for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
                addDoc(doc);
            }
            return term;
        }
    };
}
Also used : BytesRefIterator(org.apache.lucene.util.BytesRefIterator) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef)

Example 62 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

In class TermVectorsWriter, the method addAllDocVectors:

/**
 * Safe (but, slowish) default method to write every
 * vector field in the document.
 * <p>
 * Callback protocol: {@code startDocument} once, then per field
 * {@code startField}, per term {@code startTerm} / {@code addPosition}* /
 * {@code finishTerm}, then {@code finishField}, finally
 * {@code finishDocument}.
 */
protected final void addAllDocVectors(Fields vectors, MergeState mergeState) throws IOException {
    if (vectors == null) {
        // Document has no term vectors: still emit an (empty) document.
        startDocument(0);
        finishDocument();
        return;
    }
    int numFields = vectors.size();
    if (numFields == -1) {
        // count manually! TODO: Maybe enforce that Fields.size() returns something valid?
        numFields = 0;
        for (final Iterator<String> it = vectors.iterator(); it.hasNext(); ) {
            it.next();
            numFields++;
        }
    }
    startDocument(numFields);
    String lastFieldName = null;
    // Both enums are reused across fields/terms where the codec allows it.
    TermsEnum termsEnum = null;
    PostingsEnum docsAndPositionsEnum = null;
    int fieldCount = 0;
    for (String fieldName : vectors) {
        fieldCount++;
        final FieldInfo fieldInfo = mergeState.mergeFieldInfos.fieldInfo(fieldName);
        // Fields must arrive in strictly increasing name order.
        assert lastFieldName == null || fieldName.compareTo(lastFieldName) > 0 : "lastFieldName=" + lastFieldName + " fieldName=" + fieldName;
        lastFieldName = fieldName;
        final Terms terms = vectors.terms(fieldName);
        if (terms == null) {
            // FieldsEnum shouldn't lie...
            continue;
        }
        final boolean hasPositions = terms.hasPositions();
        final boolean hasOffsets = terms.hasOffsets();
        final boolean hasPayloads = terms.hasPayloads();
        // Payloads are only possible when positions were recorded.
        assert !hasPayloads || hasPositions;
        int numTerms = (int) terms.size();
        if (numTerms == -1) {
            // count manually. It is stupid, but needed, as Terms.size() is not a mandatory statistics function
            numTerms = 0;
            termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
                numTerms++;
            }
        }
        startField(fieldInfo, numTerms, hasPositions, hasOffsets, hasPayloads);
        // Fresh iterator: the counting pass above (if taken) exhausted the old one.
        termsEnum = terms.iterator();
        int termCount = 0;
        while (termsEnum.next() != null) {
            termCount++;
            // A term vector covers a single implicit document, so totalTermFreq
            // equals the within-document freq (the assert below checks this).
            final int freq = (int) termsEnum.totalTermFreq();
            startTerm(termsEnum.term(), freq);
            if (hasPositions || hasOffsets) {
                docsAndPositionsEnum = termsEnum.postings(docsAndPositionsEnum, PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);
                assert docsAndPositionsEnum != null;
                final int docID = docsAndPositionsEnum.nextDoc();
                assert docID != DocIdSetIterator.NO_MORE_DOCS;
                assert docsAndPositionsEnum.freq() == freq;
                for (int posUpto = 0; posUpto < freq; posUpto++) {
                    final int pos = docsAndPositionsEnum.nextPosition();
                    final int startOffset = docsAndPositionsEnum.startOffset();
                    final int endOffset = docsAndPositionsEnum.endOffset();
                    final BytesRef payload = docsAndPositionsEnum.getPayload();
                    // Position is only constrained (>= 0) when positions were indexed;
                    // in the offsets-only case its value is unchecked here.
                    assert !hasPositions || pos >= 0;
                    addPosition(pos, startOffset, endOffset, payload);
                }
            }
            finishTerm();
        }
        assert termCount == numTerms;
        finishField();
    }
    assert fieldCount == numFields;
    finishDocument();
}
Also used : Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) FieldInfo(org.apache.lucene.index.FieldInfo) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 63 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

In class SimpleTextFieldsWriter, the method write:

/**
 * Writes every field, term, doc, position, offset and payload found in
 * {@code fields} in SimpleText form.
 * <p>
 * FIELD and TERM markers are emitted lazily, so fields without terms and
 * terms without docs produce no output at all. Postings flags are derived
 * from the per-field index options before iterating.
 */
public void write(FieldInfos fieldInfos, Fields fields) throws IOException {
    // for each field
    for (String field : fields) {
        Terms terms = fields.terms(field);
        if (terms == null) {
            // Annoyingly, this can happen!
            continue;
        }
        FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
        boolean wroteField = false;
        boolean hasPositions = terms.hasPositions();
        boolean hasFreqs = terms.hasFreqs();
        boolean hasPayloads = fieldInfo.hasPayloads();
        boolean hasOffsets = terms.hasOffsets();
        // Request only what the field actually indexed. Note: freqs are not
        // explicitly requested in the positions branch; payloads/offsets are
        // only meaningful on top of positions.
        int flags = 0;
        if (hasPositions) {
            flags = PostingsEnum.POSITIONS;
            if (hasPayloads) {
                flags = flags | PostingsEnum.PAYLOADS;
            }
            if (hasOffsets) {
                flags = flags | PostingsEnum.OFFSETS;
            }
        } else {
            if (hasFreqs) {
                flags = flags | PostingsEnum.FREQS;
            }
        }
        TermsEnum termsEnum = terms.iterator();
        // Reused across terms so the codec can recycle it.
        PostingsEnum postingsEnum = null;
        // for each term in field
        while (true) {
            BytesRef term = termsEnum.next();
            if (term == null) {
                break;
            }
            postingsEnum = termsEnum.postings(postingsEnum, flags);
            assert postingsEnum != null : "termsEnum=" + termsEnum + " hasPos=" + hasPositions + " flags=" + flags;
            boolean wroteTerm = false;
            // for each doc in field+term
            while (true) {
                int doc = postingsEnum.nextDoc();
                if (doc == PostingsEnum.NO_MORE_DOCS) {
                    break;
                }
                if (!wroteTerm) {
                    if (!wroteField) {
                        // we lazily do this, in case the field had
                        // no terms
                        write(FIELD);
                        write(field);
                        newline();
                        wroteField = true;
                    }
                    // we lazily do this, in case the term had
                    // zero docs
                    write(TERM);
                    write(term);
                    newline();
                    wroteTerm = true;
                }
                write(DOC);
                write(Integer.toString(doc));
                newline();
                if (hasFreqs) {
                    int freq = postingsEnum.freq();
                    write(FREQ);
                    write(Integer.toString(freq));
                    newline();
                    if (hasPositions) {
                        // for assert:
                        int lastStartOffset = 0;
                        // for each pos in field+term+doc
                        for (int i = 0; i < freq; i++) {
                            int position = postingsEnum.nextPosition();
                            write(POS);
                            write(Integer.toString(position));
                            newline();
                            if (hasOffsets) {
                                int startOffset = postingsEnum.startOffset();
                                int endOffset = postingsEnum.endOffset();
                                // Offsets must be well-formed and non-decreasing.
                                assert endOffset >= startOffset;
                                assert startOffset >= lastStartOffset : "startOffset=" + startOffset + " lastStartOffset=" + lastStartOffset;
                                lastStartOffset = startOffset;
                                write(START_OFFSET);
                                write(Integer.toString(startOffset));
                                newline();
                                write(END_OFFSET);
                                write(Integer.toString(endOffset));
                                newline();
                            }
                            BytesRef payload = postingsEnum.getPayload();
                            // Empty payloads are skipped, not written.
                            if (payload != null && payload.length > 0) {
                                assert payload.length != 0;
                                write(PAYLOAD);
                                write(payload);
                                newline();
                            }
                        }
                    }
                }
            }
        }
    }
}
Also used : Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) FieldInfo(org.apache.lucene.index.FieldInfo) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 64 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

In class FieldOffsetStrategy, the method createOffsetsEnumsFromReader:

/**
 * Builds the list of {@link OffsetsEnum}s for {@code doc}: one per query
 * term present in the document (positioned via the field's offsets-enabled
 * postings), plus any enums derived from the automata.
 *
 * @throws IllegalArgumentException if the field was indexed without offsets
 */
protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader leafReader, int doc) throws IOException {
    final Terms termsIndex = leafReader.terms(field);
    if (termsIndex == null) {
        return Collections.emptyList();
    }
    // For strict positions, get a Map of term to Spans:
    //    note: ScriptPhraseHelper.NONE does the right thing for these method calls
    final Map<BytesRef, Spans> termToSpans = phraseHelper.getTermToSpans(leafReader, doc);
    // Usually simply wraps terms in a List; but if willRewrite() then can be expanded
    final List<BytesRef> sourceTerms = phraseHelper.expandTermsIfRewrite(terms, termToSpans);
    final List<OffsetsEnum> results = new ArrayList<>(sourceTerms.size() + automata.length);
    if (!sourceTerms.isEmpty()) {
        // Terms.iterator() does not return null.
        final TermsEnum termsEnum = termsIndex.iterator();
        for (BytesRef term : sourceTerms) {
            if (!termsEnum.seekExact(term)) {
                continue; // term absent from this segment
            }
            PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
            if (postings == null) {
                // no offsets or positions available
                throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
            }
            if (postings.advance(doc) != doc) {
                continue; // term does not occur in this document
            }
            // Now positioned on doc (though possibly exhausted); let the phrase
            // helper filter out occurrences that are not part of a strict phrase.
            postings = phraseHelper.filterPostings(term, postings, termToSpans.get(term));
            if (postings != null) {
                results.add(new OffsetsEnum(term, postings));
            }
        }
    }
    // Handle automata
    if (automata.length > 0) {
        results.addAll(createAutomataOffsetsFromTerms(termsIndex, doc));
    }
    return results;
}
Also used : Terms(org.apache.lucene.index.Terms) ArrayList(java.util.ArrayList) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) Spans(org.apache.lucene.search.spans.Spans) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 65 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

In class TokenStreamOffsetStrategy, the method getOffsetsEnums:

@Override
public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
    // Re-analyze the raw content and expose automaton matches through a
    // synthetic PostingsEnum instead of consulting the index.
    final PostingsEnum postings = new TokenStreamPostingsEnum(tokenStream(content), automata);
    postings.advance(docId);
    return Collections.singletonList(new OffsetsEnum(null, postings));
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) PostingsEnum(org.apache.lucene.index.PostingsEnum)

Aggregations

PostingsEnum (org.apache.lucene.index.PostingsEnum)80 BytesRef (org.apache.lucene.util.BytesRef)59 TermsEnum (org.apache.lucene.index.TermsEnum)56 Terms (org.apache.lucene.index.Terms)47 Fields (org.apache.lucene.index.Fields)18 LeafReader (org.apache.lucene.index.LeafReader)17 Term (org.apache.lucene.index.Term)17 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)15 Document (org.apache.lucene.document.Document)13 ArrayList (java.util.ArrayList)12 Bits (org.apache.lucene.util.Bits)11 IndexReader (org.apache.lucene.index.IndexReader)10 TextField (org.apache.lucene.document.TextField)9 Directory (org.apache.lucene.store.Directory)9 IOException (java.io.IOException)8 DirectoryReader (org.apache.lucene.index.DirectoryReader)7 IndexWriter (org.apache.lucene.index.IndexWriter)6 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)6 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)5 XContentBuilder (org.elasticsearch.common.xcontent.XContentBuilder)5