Search in sources :

Example 81 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class TestMockAnalyzer method testChangeGaps.

/**
 * LUCENE-5324: checks that a {@code DelegatingAnalyzerWrapper} subclass can override the
 * position-increment gap and the offset gap of the analyzer it delegates to.
 *
 * <p>A document with two values of field {@code "f"} (both {@code "a"}) is indexed with term
 * vectors, positions and offsets. The second occurrence of the term is then expected at
 * position {@code 1 + positionGap} with an end offset of
 * {@code 1 + firstEndOffset + offsetGap}.
 *
 * @throws Exception if indexing or reading the term vectors fails
 */
public void testChangeGaps() throws Exception {
    // Gaps are drawn before the analyzer is created so the order of random() calls is fixed.
    final int positionGap = random().nextInt(1000);
    final int offsetGap = random().nextInt(1000);
    final Analyzer delegate = new MockAnalyzer(random());

    // Wrapper that forwards analysis to the delegate but reports the randomly chosen gaps.
    final Analyzer gapOverridingAnalyzer = new DelegatingAnalyzerWrapper(delegate.getReuseStrategy()) {

        @Override
        public int getOffsetGap(String fieldName) {
            return offsetGap;
        }

        @Override
        public int getPositionIncrementGap(String fieldName) {
            return positionGap;
        }

        @Override
        protected Analyzer getWrappedAnalyzer(String fieldName) {
            return delegate;
        }
    };

    final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), gapOverridingAnalyzer);

    // Tokenized field whose term vectors record both positions and offsets.
    final FieldType vectorType = new FieldType();
    vectorType.setIndexOptions(IndexOptions.DOCS);
    vectorType.setTokenized(true);
    vectorType.setStoreTermVectors(true);
    vectorType.setStoreTermVectorPositions(true);
    vectorType.setStoreTermVectorOffsets(true);

    // Two values for the same field, so that the gaps are applied between them.
    final Document twoValueDoc = new Document();
    twoValueDoc.add(new Field("f", "a", vectorType));
    twoValueDoc.add(new Field("f", "a", vectorType));
    writer.addDocument(twoValueDoc);

    final LeafReader reader = getOnlyLeafReader(writer.getReader());
    final Fields vectors = reader.getTermVectors(0);
    final Terms fieldTerms = vectors.terms("f");
    final TermsEnum termsEnum = fieldTerms.iterator();
    final BytesRef firstTerm = termsEnum.next();
    assertEquals(new BytesRef("a"), firstTerm);

    final PostingsEnum postings = termsEnum.postings(null, PostingsEnum.ALL);
    assertEquals(0, postings.nextDoc());
    assertEquals(2, postings.freq());

    // First occurrence: position 0, starting at offset 0.
    assertEquals(0, postings.nextPosition());
    assertEquals(0, postings.startOffset());
    final int firstEndOffset = postings.endOffset();

    // Second occurrence: shifted by the overridden gaps.
    assertEquals(1 + positionGap, postings.nextPosition());
    assertEquals(1 + firstEndOffset + offsetGap, postings.endOffset());

    // "a" is the only term in the vector.
    assertEquals(null, termsEnum.next());

    reader.close();
    writer.close();
    writer.w.getDirectory().close();
}
Also used : LeafReader(org.apache.lucene.index.LeafReader) Terms(org.apache.lucene.index.Terms) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) TermsEnum(org.apache.lucene.index.TermsEnum) Field(org.apache.lucene.document.Field) Fields(org.apache.lucene.index.Fields) PostingsEnum(org.apache.lucene.index.PostingsEnum) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef)

Example 82 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project Anserini by castorini.

the class PMIFeatureExtractor method extract.

/**
 * Computes a pointwise-mutual-information style feature for the current query.
 *
 * <p>For every unordered pair of distinct query tokens, the number of documents containing both
 * tokens is divided by the product of the two tokens' document frequencies. The natural
 * logarithms of those ratios are summed and divided by the number of pairs considered (pairs
 * that share no document still count towards that number). The value is cached and only
 * recomputed when the query text changes.
 *
 * @param doc not used by this extractor
 * @param terms not used by this extractor
 * @param context supplies the query text, query tokens, field name and index searcher
 * @return the averaged value, or {@code 0.0f} when there are no token pairs or the index could
 *     not be read
 */
@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
    // Recompute only for a new query: we need the doc freq of each distinct token
    // and also the co-occurrence count of each pair.
    if (!this.lastQueryProcessed.equals(context.getQueryText())) {
        this.lastQueryProcessed = context.getQueryText();
        this.lastComputedValue = 0.0f;
        Set<String> querySet = new HashSet<>(context.getQueryTokens());
        IndexReader reader = context.getIndexSearcher().getIndexReader();
        Map<String, Integer> docFreqs = new HashMap<>();
        List<String> queryTokens = new ArrayList<>(querySet);
        try {
            for (String token : querySet) {
                docFreqs.put(token, reader.docFreq(new Term(context.getField(), token)));
            }
            float sumPMI = 0.0f;
            float pairsComputed = 0.0f;
            for (int i = 0; i < queryTokens.size(); i++) {
                String firstToken = queryTokens.get(i);
                for (int j = i + 1; j < queryTokens.size(); j++) {
                    // Every pair counts towards the average, even if it contributes nothing.
                    pairsComputed++;
                    String secondToken = queryTokens.get(j);
                    // Fresh enums are fetched for each pair because counting consumes them.
                    PostingsEnum firstEnum = MultiFields.getTermDocsEnum(reader, context.getField(), new BytesRef(firstToken));
                    PostingsEnum secondEnum = MultiFields.getTermDocsEnum(reader, context.getField(), new BytesRef(secondToken));
                    int intersect;
                    if (firstEnum == null || secondEnum == null) {
                        intersect = 0;
                    } else {
                        intersect = countPostingIntersect(firstEnum, secondEnum);
                    }
                    if (intersect == 0) {
                        continue;
                    }
                    // We should never reach this point and have doc freq = 0 because then there
                    // would be no intersect between docIds; 1 is only a defensive fallback.
                    int firstDocFreq = docFreqs.getOrDefault(firstToken, 1);
                    int secondDocFreq = docFreqs.getOrDefault(secondToken, 1);
                    // Multiply as long: the product of two int doc freqs can exceed
                    // Integer.MAX_VALUE on large collections and would otherwise wrap around.
                    long docFreqProduct = (long) firstDocFreq * secondDocFreq;
                    float fraction = intersect / (float) docFreqProduct;
                    if (fraction <= 0) {
                        continue;
                    }
                    sumPMI += Math.log(fraction);
                }
            }
            // Now compute the average
            if (pairsComputed != 0) {
                this.lastComputedValue = sumPMI / pairsComputed;
            }
        } catch (IOException e) {
            // Best effort: an unreadable index yields a neutral feature value.
            this.lastComputedValue = 0.0f;
        }
    }
    return this.lastComputedValue;
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Term(org.apache.lucene.index.Term) IOException(java.io.IOException) IndexReader(org.apache.lucene.index.IndexReader) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet)

Example 83 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project SearchServices by Alfresco.

the class AlfrescoLukeRequestHandler method getFirstLiveDoc.

/**
 * Returns a stored document that contains one of the leading terms of {@code terms} and has
 * not been deleted. Any such document will do; the first one found is returned.
 *
 * <p>Only the first posting of each term is examined, and at most 1000 terms are tried, to
 * deal with the chance that the first bunch of terms are in deleted documents while keeping
 * the lookup cheap.
 *
 * @param terms the terms of the field to sample a document from
 * @param reader the segment reader that owns {@code terms}
 * @return a live document containing one of the examined terms, or {@code null} if the terms
 *     ran out or none of the examined terms led to a live document
 * @throws IOException if the index cannot be read
 */
private static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
    final int maxTermsToTry = 1000;
    final TermsEnum termsEnum = terms.iterator();
    // Loop-invariant. Per the Lucene API a set bit marks a live (not deleted) document and a
    // null result means the reader has no deletions.
    final Bits liveDocs = reader.getLiveDocs();
    PostingsEnum postingsEnum = null;
    for (int idx = 0; idx < maxTermsToTry; ++idx) {
        if (termsEnum.next() == null) {
            // Ran off the end of the terms without finding a live document.
            return null;
        }
        // The previous enum is passed back in so it can be reused.
        postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
        final int docId = postingsEnum.nextDoc();
        if (docId == DocIdSetIterator.NO_MORE_DOCS) {
            continue;
        }
        if (liveDocs == null || liveDocs.get(docId)) {
            return reader.document(docId);
        }
        // The first document for this term is deleted; move on to the next term.
    }
    return null;
}
Also used : Bits(org.apache.lucene.util.Bits) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 84 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project SearchServices by Alfresco.

the class SolrPathScorer method createPathScorer.

/**
 * Builds a {@link SolrPathScorer} for a path query against a single leaf reader.
 *
 * <p>If the query has no path steps, two self-axis steps are appended to it first. Every step
 * that carries term text is then given the cached position postings of that term in the
 * query's path field. The root term's postings (when a root term is set) and the steps are
 * wrapped in a {@code SolrContainerScorer}, which is handed to the resulting scorer.
 *
 * @param solrPathQuery the path query to score; may be modified by appending self-axis steps
 * @param context the leaf reader context the postings are read from
 * @param weight the weight passed on to the created scorers
 * @param dictionarySertvice not used by this method
 * @param repeat not used by this method
 * @return the scorer, or {@code null} if the postings lookup for a step's term returns {@code null}
 * @throws IOException if reading postings fails
 */
public static SolrPathScorer createPathScorer(SolrPathQuery solrPathQuery, LeafReaderContext context, Weight weight, DictionaryService dictionarySertvice, boolean repeat) throws IOException {
    // An empty path is replaced by two self-axis steps.
    if (solrPathQuery.getPathStructuredFieldPositions().size() == 0) {
        ArrayList<StructuredFieldPosition> selfSteps = new ArrayList<StructuredFieldPosition>(2);
        selfSteps.add(new SelfAxisStructuredFieldPosition());
        selfSteps.add(new SelfAxisStructuredFieldPosition());
        solrPathQuery.appendQuery(selfSteps);
    }

    // Attach cached term positions to every step that names a term.
    for (StructuredFieldPosition step : solrPathQuery.getPathStructuredFieldPositions()) {
        if (step.getTermText() == null) {
            continue;
        }
        Term stepTerm = new Term(solrPathQuery.getPathField(), step.getTermText());
        PostingsEnum stepPostings = context.reader().postings(stepTerm, PostingsEnum.POSITIONS);
        if (stepPostings == null) {
            return null;
        }
        step.setCachingTermPositions(new CachingTermPositions(stepPostings));
    }

    // Postings of the root term stay null when the query has no root term.
    PostingsEnum rootPostings = null;
    if (solrPathQuery.getPathRootTerm() != null) {
        rootPostings = context.reader().postings(solrPathQuery.getPathRootTerm(), PostingsEnum.POSITIONS);
    }

    SolrContainerScorer containerScorer = null;
    if (solrPathQuery.getPathStructuredFieldPositions().size() > 0) {
        StructuredFieldPosition[] steps = (StructuredFieldPosition[]) solrPathQuery.getPathStructuredFieldPositions().toArray(new StructuredFieldPosition[] {});
        containerScorer = new SolrContainerScorer(weight, rootPostings, steps);
    }
    return new SolrPathScorer(weight, containerScorer);
}
Also used : ArrayList(java.util.ArrayList) Term(org.apache.lucene.index.Term) PostingsEnum(org.apache.lucene.index.PostingsEnum)

Example 85 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project Anserini by castorini.

the class DocumentFieldContext method getPostings.

/**
 * Returns the positional postings of {@code term} in this context's field.
 *
 * <p>The result maps each document id that contains the term to the list of positions at
 * which the term occurs in that document. Successfully read results are cached per term and
 * returned directly on later calls.
 *
 * @param term the term text to look up
 * @return a map from document id to term positions; empty if the term has no postings. If an
 *     I/O error interrupts reading, whatever was collected so far is returned but not cached.
 */
public Map<Integer, List<Integer>> getPostings(String term) {
    if (postings.containsKey(term)) {
        return postings.get(term);
    }

    Map<Integer, List<Integer>> posting = new HashMap<>();
    try {
        Term t = new Term(fieldName, term);
        PostingsEnum postingsEnum = MultiTerms.getTermPostingsEnum(reader, fieldName, t.bytes(), PostingsEnum.POSITIONS);
        if (postingsEnum != null) {
            int docId;
            while ((docId = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                // nextPosition() is called once per occurrence, i.e. freq() times.
                int freq = postingsEnum.freq();
                List<Integer> positions = new ArrayList<>(freq);
                for (int i = 0; i < freq; i++) {
                    positions.add(postingsEnum.nextPosition());
                }
                posting.put(docId, positions);
            }
        }
    } catch (IOException e) {
        // Best effort: report the failure and hand back what was read, but do not cache it,
        // otherwise every later lookup of this term would return the truncated result.
        e.printStackTrace();
        return posting;
    }
    postings.put(term, posting);
    return posting;
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ArrayList(java.util.ArrayList) List(java.util.List) Term(org.apache.lucene.index.Term) IOException(java.io.IOException) PostingsEnum(org.apache.lucene.index.PostingsEnum)

Aggregations

PostingsEnum (org.apache.lucene.index.PostingsEnum): 89, TermsEnum (org.apache.lucene.index.TermsEnum): 62, BytesRef (org.apache.lucene.util.BytesRef): 62, Terms (org.apache.lucene.index.Terms): 51, Term (org.apache.lucene.index.Term): 23, Fields (org.apache.lucene.index.Fields): 18, ArrayList (java.util.ArrayList): 17, LeafReader (org.apache.lucene.index.LeafReader): 17, LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 16, Document (org.apache.lucene.document.Document): 13, IOException (java.io.IOException): 12, IndexReader (org.apache.lucene.index.IndexReader): 12, Bits (org.apache.lucene.util.Bits): 11, Directory (org.apache.lucene.store.Directory): 10, TextField (org.apache.lucene.document.TextField): 9, DirectoryReader (org.apache.lucene.index.DirectoryReader): 7, HashMap (java.util.HashMap): 6, List (java.util.List): 6, IndexWriter (org.apache.lucene.index.IndexWriter): 6, RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 6