Search in sources :

Example 46 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project pyramid by cheng-li.

This snippet is from the class ESIndex, method getTermVectorWithException.

/**
 * Builds a position-to-term map for one field of one document, using the
 * document's term vector (positions enabled, offsets and statistics disabled).
 *
 * @param field the field whose term vector is read
 * @param id    the id of the document to fetch
 * @return a map from token position to term text; empty when the field has no terms
 * @throws IOException if the term vector cannot be read
 */
private Map<Integer, String> getTermVectorWithException(String field, String id) throws IOException {
    TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false)
            .setPositions(true)
            .setFieldStatistics(false)
            .setTermStatistics(false)
            .setSelectedFields(field)
            .execute()
            .actionGet();
    Map<Integer, String> positionToTerm = new HashMap<>();
    Terms terms = response.getFields().terms(field);
    // A document with an empty field yields no Terms instance.
    if (terms == null) {
        return positionToTerm;
    }
    TermsEnum termsEnum = terms.iterator();
    PostingsEnum postings = null;
    BytesRef termBytes;
    while ((termBytes = termsEnum.next()) != null) {
        String term = termBytes.utf8ToString();
        postings = termsEnum.postings(postings, PostingsEnum.ALL);
        // there can only be one doc since we are getting with id. get the doc and the position
        postings.nextDoc();
        int termFreq = postings.freq();
        for (int occurrence = 0; occurrence < termFreq; occurrence++) {
            positionToTerm.put(postings.nextPosition(), term);
        }
    }
    return positionToTerm;
}
Also used : TermVectorsResponse(org.elasticsearch.action.termvectors.TermVectorsResponse) HashMap(java.util.HashMap) Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 47 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project pyramid by cheng-li.

This snippet is from the class ESIndex, method getTermStats.

/**
 * Collects per-term statistics (tf, df, tf-idf) from a single document's
 * term vector for the given field.
 * <p>
 * Note: df is from one shard only!
 *
 * @param field the field whose term vector is inspected
 * @param id    the document id
 * @return term statistics from one doc; empty when the field has no terms
 * @throws IOException if the term vector cannot be read
 */
public Set<TermStat> getTermStats(String field, String id) throws IOException {
    StopWatch stopWatch = null;
    if (logger.isDebugEnabled()) {
        stopWatch = new StopWatch();
        stopWatch.start();
    }
    TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id).setOffsets(false).setPositions(false).setFieldStatistics(false).setTermStatistics(true).setSelectedFields(field).execute().actionGet();
    Terms terms = response.getFields().terms(field);
    Set<TermStat> set = new HashSet<>();
    // if the field is empty, terms==null
    if (terms == null) {
        return set;
    }
    // The similarity carries no per-term state; create it once rather than
    // once per term inside the loop.
    ClassicSimilarity defaultSimilarity = new ClassicSimilarity();
    TermsEnum iterator = terms.iterator();
    PostingsEnum postings = null;
    // Iterate until the enum is exhausted instead of counting up to
    // terms.size(): the Terms contract allows size() to return -1 (unknown),
    // which would silently skip every term.
    for (BytesRef termBytes = iterator.next(); termBytes != null; termBytes = iterator.next()) {
        String term = termBytes.utf8ToString();
        postings = iterator.postings(postings);
        // freq() is only defined once the enum is positioned on a document;
        // there is exactly one doc here since we fetched by id.
        postings.nextDoc();
        int tf = postings.freq();
        int df = iterator.docFreq();
        /**
         * from lucene
         */
        /**
         * tf is just tf, not square root of tf as in lucene
         */
        /**
         * Implemented as <code>log(numDocs/(docFreq+1)) + 1</code>.
         */
        float tfidf = tf * defaultSimilarity.idf(df, this.numDocs);
        TermStat termStat = new TermStat(term);
        termStat.setTf(tf).setDf(df).setTfidf(tfidf);
        set.add(termStat);
    }
    if (logger.isDebugEnabled()) {
        logger.debug("time spent on getNgramInfos for " + id + " = " + stopWatch);
    }
    return set;
}
Also used : TermVectorsResponse(org.elasticsearch.action.termvectors.TermVectorsResponse) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) StopWatch(org.apache.commons.lang3.time.StopWatch) HashSet(java.util.HashSet) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 48 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project crate by crate.

This snippet is from the class PrunePostingsMergePolicy, method wrapReader.

/**
 * Wraps a segment reader so that, for {@code idField}, only terms that still
 * have at least one live document are visible, and all other fields are
 * delegated unchanged. Used during merges to prune stale id postings.
 *
 * @param reader  the segment reader participating in the merge
 * @param idField name of the id field whose postings should be pruned
 * @return {@code reader} itself when the segment has no deletions, otherwise
 *         a filtering wrapper around it
 */
private static CodecReader wrapReader(CodecReader reader, String idField) {
    Bits liveDocs = reader.getLiveDocs();
    if (liveDocs == null) {
        // no deleted docs - we are good!
        return reader;
    }
    // When every document is deleted, accept() can bail out immediately
    // instead of probing postings term by term.
    final boolean fullyDeletedSegment = reader.numDocs() == 0;
    return new FilterCodecReader(reader) {

        @Override
        public FieldsProducer getPostingsReader() {
            FieldsProducer postingsReader = super.getPostingsReader();
            if (postingsReader == null) {
                return null;
            }
            // Delegate every operation to the wrapped producer; only
            // terms(idField) gets the filtering treatment below.
            return new FieldsProducer() {

                @Override
                public void close() throws IOException {
                    postingsReader.close();
                }

                @Override
                public void checkIntegrity() throws IOException {
                    postingsReader.checkIntegrity();
                }

                @Override
                public Iterator<String> iterator() {
                    return postingsReader.iterator();
                }

                @Override
                public Terms terms(String field) throws IOException {
                    Terms in = postingsReader.terms(field);
                    if (idField.equals(field) && in != null) {
                        return new FilterLeafReader.FilterTerms(in) {

                            @Override
                            public TermsEnum iterator() throws IOException {
                                TermsEnum iterator = super.iterator();
                                return new FilteredTermsEnum(iterator, false) {

                                    // Reused across accept() calls to avoid allocating
                                    // a fresh PostingsEnum per term.
                                    private PostingsEnum internal;

                                    @Override
                                    protected AcceptStatus accept(BytesRef term) throws IOException {
                                        if (fullyDeletedSegment) {
                                            // short-cut this if we don't match anything
                                            return AcceptStatus.END;
                                        }
                                        // Keep the term only if its (live-docs-filtered)
                                        // postings contain at least one document.
                                        internal = postings(internal, PostingsEnum.NONE);
                                        if (internal.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                                            return AcceptStatus.YES;
                                        }
                                        return AcceptStatus.NO;
                                    }

                                    @Override
                                    public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException {
                                        // Wrap the delegate's postings so iteration skips
                                        // deleted documents; reuse the wrapper when possible.
                                        if (reuse instanceof OnlyLiveDocsPostingsEnum) {
                                            OnlyLiveDocsPostingsEnum reuseInstance = (OnlyLiveDocsPostingsEnum) reuse;
                                            reuseInstance.reset(super.postings(reuseInstance.in, flags));
                                            return reuseInstance;
                                        }
                                        return new OnlyLiveDocsPostingsEnum(super.postings(null, flags), liveDocs);
                                    }

                                    @Override
                                    public ImpactsEnum impacts(int flags) throws IOException {
                                        // Impacts would bypass the live-docs filtering above.
                                        throw new UnsupportedOperationException();
                                    }
                                };
                            }
                        };
                    } else {
                        return in;
                    }
                }

                @Override
                public int size() {
                    return postingsReader.size();
                }

                @Override
                public long ramBytesUsed() {
                    return postingsReader.ramBytesUsed();
                }
            };
        }

        @Override
        public CacheHelper getCoreCacheHelper() {
            // The wrapper changes the visible postings, so the underlying
            // reader's cache key must not be exposed.
            return null;
        }

        @Override
        public CacheHelper getReaderCacheHelper() {
            return null;
        }
    };
}
Also used : FieldsProducer(org.apache.lucene.codecs.FieldsProducer) FilteredTermsEnum(org.apache.lucene.index.FilteredTermsEnum) Terms(org.apache.lucene.index.Terms) TermsEnum(org.apache.lucene.index.TermsEnum) FilteredTermsEnum(org.apache.lucene.index.FilteredTermsEnum) Bits(org.apache.lucene.util.Bits) PostingsEnum(org.apache.lucene.index.PostingsEnum) FilterCodecReader(org.apache.lucene.index.FilterCodecReader) BytesRef(org.apache.lucene.util.BytesRef)

Example 49 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.

This snippet is from the class TermVectorsFilter, method selectBestTerms.

/**
 * Scores every term of the selected fields by tf-idf and retains only the
 * best-scoring ones (up to {@code maxNumTerms} per field) in
 * {@code scoreTerms}; the number of retained terms per field is recorded in
 * {@code sizes}.
 *
 * @throws IOException if the term vectors cannot be read
 */
public void selectBestTerms() throws IOException {
    // NOTE(review): docsEnum is never reassigned, so getTermFreq always
    // receives null here and PostingsEnum reuse never actually happens —
    // confirm whether getTermFreq was meant to return/refresh the enum.
    PostingsEnum docsEnum = null;
    for (String fieldName : fields) {
        if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
            continue;
        }
        Terms terms = fields.terms(fieldName);
        Terms topLevelTerms = topLevelFields.terms(fieldName);
        // if no terms found, take the retrieved term vector fields for stats
        if (topLevelTerms == null) {
            topLevelTerms = terms;
        }
        long numDocs = getDocCount(fieldName, topLevelTerms);
        // one queue per field name
        ScoreTermsQueue queue = new ScoreTermsQueue(Math.min(maxNumTerms, (int) terms.size()));
        // select terms with highest tf-idf
        TermsEnum termsEnum = terms.iterator();
        TermsEnum topLevelTermsEnum = topLevelTerms.iterator();
        while (termsEnum.next() != null) {
            BytesRef termBytesRef = termsEnum.term();
            // Every doc-level term is expected to exist at the index level.
            boolean foundTerm = topLevelTermsEnum.seekExact(termBytesRef);
            assert foundTerm : "Term: " + termBytesRef.utf8ToString() + " not found!";
            Term term = new Term(fieldName, termBytesRef);
            // remove noise words
            int freq = getTermFreq(termsEnum, docsEnum);
            if (isNoise(term.bytes().utf8ToString(), freq)) {
                continue;
            }
            // now call on docFreq
            long docFreq = getTermStatistics(topLevelTermsEnum, term).docFreq();
            if (!isAccepted(docFreq)) {
                continue;
            }
            // filter based on score
            float score = computeScore(docFreq, freq, numDocs);
            queue.addOrUpdate(new ScoreTerm(term.field(), term.bytes().utf8ToString(), score));
        }
        // retain the best terms for quick lookups
        ScoreTerm scoreTerm;
        int count = 0;
        while ((scoreTerm = queue.pop()) != null) {
            scoreTerms.put(new Term(scoreTerm.field, scoreTerm.word), scoreTerm);
            count++;
        }
        sizes.put(fieldName, count);
    }
}
Also used : Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 50 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.

This snippet is from the class TermVectorsWriter, method setFields.

/**
 * Serializes the term vectors of the selected fields into the response
 * output, honoring the requested flags (positions, offsets, payloads,
 * term/field statistics) and, when present, restricting output to the terms
 * retained by {@code termVectorsFilter}.
 *
 * @param termVectorsByField per-field term vectors of the document
 * @param selectedFields     fields to write, or null to write all fields
 * @param flags              which components to emit for each term
 * @param topLevelFields     index-level fields used as the statistics source
 * @param dfs                aggregated cross-shard statistics, may be null
 * @param termVectorsFilter  optional filter keeping only the best-scoring terms
 * @throws IOException if reading or writing the term vectors fails
 */
void setFields(Fields termVectorsByField, Set<String> selectedFields, EnumSet<Flag> flags, Fields topLevelFields, @Nullable AggregatedDfs dfs, @Nullable TermVectorsFilter termVectorsFilter) throws IOException {
    int numFieldsWritten = 0;
    // Reused PostingsEnum instances: one for positional data, one for docs-only.
    PostingsEnum docsAndPosEnum = null;
    PostingsEnum docsEnum = null;
    boolean hasScores = termVectorsFilter != null;
    for (String field : termVectorsByField) {
        if ((selectedFields != null) && (!selectedFields.contains(field))) {
            continue;
        }
        Terms fieldTermVector = termVectorsByField.terms(field);
        Terms topLevelTerms = topLevelFields.terms(field);
        // if no terms found, take the retrieved term vector fields for stats
        if (topLevelTerms == null) {
            topLevelTerms = EMPTY_TERMS;
        }
        TermsEnum topLevelIterator = topLevelTerms.iterator();
        // Each component is emitted only if both requested and actually stored.
        boolean positions = flags.contains(Flag.Positions) && fieldTermVector.hasPositions();
        boolean offsets = flags.contains(Flag.Offsets) && fieldTermVector.hasOffsets();
        boolean payloads = flags.contains(Flag.Payloads) && fieldTermVector.hasPayloads();
        long termsSize = fieldTermVector.size();
        if (hasScores) {
            // With filtering, only the retained terms for this field are written.
            termsSize = Math.min(termsSize, termVectorsFilter.size(field));
        }
        startField(field, termsSize, positions, offsets, payloads);
        if (flags.contains(Flag.FieldStatistics)) {
            if (dfs != null) {
                // Prefer aggregated cross-shard statistics when available.
                writeFieldStatistics(dfs.fieldStatistics().get(field));
            } else {
                writeFieldStatistics(topLevelTerms);
            }
        }
        TermsEnum iterator = fieldTermVector.iterator();
        final boolean useDocsAndPos = positions || offsets || payloads;
        while (iterator.next() != null) {
            // iterate all terms of the current field
            BytesRef termBytesRef = iterator.term();
            Term term = new Term(field, termBytesRef);
            // with filtering we only keep the best terms
            if (hasScores && !termVectorsFilter.hasScoreTerm(term)) {
                continue;
            }
            startTerm(termBytesRef);
            if (flags.contains(Flag.TermStatistics)) {
                // get the doc frequency
                if (dfs != null) {
                    final TermStatistics statistics = dfs.termStatistics().get(term);
                    // Missing aggregated stats are written as zeros.
                    writeTermStatistics(statistics == null ? new TermStatistics(termBytesRef, 0, 0) : statistics);
                } else {
                    boolean foundTerm = topLevelIterator.seekExact(termBytesRef);
                    if (foundTerm) {
                        writeTermStatistics(topLevelIterator);
                    } else {
                        writeTermStatistics(new TermStatistics(termBytesRef, 0, 0));
                    }
                }
            }
            if (useDocsAndPos) {
                // given we have pos or offsets
                docsAndPosEnum = writeTermWithDocsAndPos(iterator, docsAndPosEnum, positions, offsets, payloads);
            } else {
                // if we do not have the positions stored, we need to
                // get the frequency from a PostingsEnum.
                docsEnum = writeTermWithDocsOnly(iterator, docsEnum);
            }
            if (hasScores) {
                writeScoreTerm(termVectorsFilter.getScoreTerm(term));
            }
        }
        numFieldsWritten++;
    }
    response.setTermVectorsField(output);
    response.setHeader(writeHeader(numFieldsWritten, flags.contains(Flag.TermStatistics), flags.contains(Flag.FieldStatistics), hasScores));
}
Also used : Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) PostingsEnum(org.apache.lucene.index.PostingsEnum) TermStatistics(org.apache.lucene.search.TermStatistics) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Aggregations

PostingsEnum (org.apache.lucene.index.PostingsEnum)80 BytesRef (org.apache.lucene.util.BytesRef)59 TermsEnum (org.apache.lucene.index.TermsEnum)56 Terms (org.apache.lucene.index.Terms)47 Fields (org.apache.lucene.index.Fields)18 LeafReader (org.apache.lucene.index.LeafReader)17 Term (org.apache.lucene.index.Term)17 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)15 Document (org.apache.lucene.document.Document)13 ArrayList (java.util.ArrayList)12 Bits (org.apache.lucene.util.Bits)11 IndexReader (org.apache.lucene.index.IndexReader)10 TextField (org.apache.lucene.document.TextField)9 Directory (org.apache.lucene.store.Directory)9 IOException (java.io.IOException)8 DirectoryReader (org.apache.lucene.index.DirectoryReader)7 IndexWriter (org.apache.lucene.index.IndexWriter)6 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)6 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)5 XContentBuilder (org.elasticsearch.common.xcontent.XContentBuilder)5