Example 76 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class TestMockAnalyzer method testChangeGaps.

public void testChangeGaps() throws Exception {
    // LUCENE-5324: check that it is possible to change the wrapper's gaps
    final int positionGap = random().nextInt(1000);
    final int offsetGap = random().nextInt(1000);
    final Analyzer delegate = new MockAnalyzer(random());
    final Analyzer a = new DelegatingAnalyzerWrapper(delegate.getReuseStrategy()) {

        @Override
        protected Analyzer getWrappedAnalyzer(String fieldName) {
            return delegate;
        }

        @Override
        public int getPositionIncrementGap(String fieldName) {
            return positionGap;
        }

        @Override
        public int getOffsetGap(String fieldName) {
            return offsetGap;
        }
    };
    final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), a);
    final Document doc = new Document();
    final FieldType ft = new FieldType();
    ft.setIndexOptions(IndexOptions.DOCS);
    ft.setTokenized(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
    doc.add(new Field("f", "a", ft));
    doc.add(new Field("f", "a", ft));
    writer.addDocument(doc);
    final LeafReader reader = getOnlyLeafReader(writer.getReader());
    final Fields fields = reader.getTermVectors(0);
    final Terms terms = fields.terms("f");
    final TermsEnum te = terms.iterator();
    assertEquals(new BytesRef("a"), te.next());
    final PostingsEnum dpe = te.postings(null, PostingsEnum.ALL);
    assertEquals(0, dpe.nextDoc());
    assertEquals(2, dpe.freq());
    assertEquals(0, dpe.nextPosition());
    assertEquals(0, dpe.startOffset());
    final int endOffset = dpe.endOffset();
    assertEquals(1 + positionGap, dpe.nextPosition());
    assertEquals(1 + endOffset + offsetGap, dpe.endOffset());
    assertEquals(null, te.next());
    reader.close();
    writer.close();
    writer.w.getDirectory().close();
}
Also used : LeafReader(org.apache.lucene.index.LeafReader) Terms(org.apache.lucene.index.Terms) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) TermsEnum(org.apache.lucene.index.TermsEnum) Field(org.apache.lucene.document.Field) Fields(org.apache.lucene.index.Fields) PostingsEnum(org.apache.lucene.index.PostingsEnum) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef)
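
For reference, the access pattern the assertions above rely on (walking positions and offsets out of a term-vector PostingsEnum opened with PostingsEnum.ALL) looks roughly like the sketch below. It is an illustration only, not part of the test; dumpTermVector, the field name "f" and the docId parameter are placeholders.

// Illustrative sketch: print every position and offset of each term in one document's
// term vector for field "f" (assumes the field stores term vectors with positions and
// offsets, as configured in the test above).
static void dumpTermVector(LeafReader reader, int docId) throws IOException {
    Fields vectors = reader.getTermVectors(docId);
    Terms vector = vectors == null ? null : vectors.terms("f");
    if (vector == null) {
        // no term vector stored for this document/field
        return;
    }
    TermsEnum termsEnum = vector.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.ALL);
        if (postings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
            continue;
        }
        for (int i = 0; i < postings.freq(); i++) {
            int position = postings.nextPosition();
            System.out.println(term.utf8ToString() + " pos=" + position
                    + " offsets=[" + postings.startOffset() + "," + postings.endOffset() + "]");
        }
    }
}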

Example 77 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project Anserini by castorini.

the class PMIFeatureExtractor method extract.

@Override
public float extract(Document doc, Terms terms, RerankerContext context) {
    // Recompute only when the query changes: we need the doc freq of each query token
    // and also the doc freq of each pair
    if (!this.lastQueryProcessed.equals(context.getQueryText())) {
        this.lastQueryProcessed = context.getQueryText();
        this.lastComputedValue = 0.0f;
        Set<String> querySet = new HashSet<>(context.getQueryTokens());
        IndexReader reader = context.getIndexSearcher().getIndexReader();
        Map<String, Integer> docFreqs = new HashMap<>();
        List<String> queryTokens = new ArrayList<>(querySet);
        try {
            for (String token : querySet) {
                docFreqs.put(token, reader.docFreq(new Term(context.getField(), token)));
            }
            float sumPMI = 0.0f;
            float pairsComputed = 0.0f;
            for (int i = 0; i < queryTokens.size(); i++) {
                String firstToken = queryTokens.get(i);
                for (int j = i + 1; j < queryTokens.size(); j++) {
                    pairsComputed++;
                    String secondToken = queryTokens.get(j);
                    PostingsEnum firstEnum = MultiFields.getTermDocsEnum(reader, context.getField(), new BytesRef(firstToken));
                    PostingsEnum secondEnum = MultiFields.getTermDocsEnum(reader, context.getField(), new BytesRef(secondToken));
                    int intersect;
                    if (firstEnum == null || secondEnum == null) {
                        intersect = 0;
                    } else {
                        intersect = countPostingIntersect(firstEnum, secondEnum);
                    }
                    if (intersect == 0)
                        continue;
                    // We should never reach this point with a doc freq of 0, because then there
                    // would be no intersection between the docIds
                    int firstDocFreq = docFreqs.containsKey(firstToken) ? docFreqs.get(firstToken) : 1;
                    int secondDocFreq = docFreqs.containsKey(secondToken) ? docFreqs.get(secondToken) : 1;
                    float fraction = (intersect / (float) (firstDocFreq * secondDocFreq));
                    if (fraction <= 0) {
                        continue;
                    }
                    sumPMI += Math.log(fraction);
                }
            }
            // Now compute the average
            if (pairsComputed != 0) {
                this.lastComputedValue = sumPMI / pairsComputed;
            }
        } catch (IOException e) {
            this.lastComputedValue = 0.0f;
        }
    }
    return this.lastComputedValue;
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Term(org.apache.lucene.index.Term) IOException(java.io.IOException) IndexReader(org.apache.lucene.index.IndexReader) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) HashSet(java.util.HashSet)
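
The snippet above calls countPostingIntersect, which is not shown here. A plausible implementation, sketched under that assumption rather than taken from Anserini, is the standard two-pointer merge over the two sorted docID streams:

// Sketch of a docID-intersection count over two PostingsEnum instances; relies on
// nextDoc()/advance() returning docIDs in increasing order.
private int countPostingIntersect(PostingsEnum first, PostingsEnum second) throws IOException {
    int count = 0;
    int firstDoc = first.nextDoc();
    int secondDoc = second.nextDoc();
    while (firstDoc != DocIdSetIterator.NO_MORE_DOCS && secondDoc != DocIdSetIterator.NO_MORE_DOCS) {
        if (firstDoc == secondDoc) {
            count++;
            firstDoc = first.nextDoc();
            secondDoc = second.nextDoc();
        } else if (firstDoc < secondDoc) {
            firstDoc = first.advance(secondDoc);
        } else {
            secondDoc = second.advance(firstDoc);
        }
    }
    return count;
}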

Example 78 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project SearchServices by Alfresco.

the class AlfrescoLukeRequestHandler method getFirstLiveDoc.

// Just get a document with the term in it, the first one will do!
// Is there a better way to do this? Shouldn't actually be very costly
// to do it this way.
private static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
    PostingsEnum postingsEnum = null;
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    // Deal with the chance that the first bunch of terms are in deleted documents. Is there a better way?
    for (int idx = 0; idx < 1000 && postingsEnum == null; ++idx) {
        text = termsEnum.next();
        if (text == null) {
            // Ran off the end of the terms enum without finding any live docs with that field in them.
            return null;
        }
        postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
        final Bits liveDocs = reader.getLiveDocs();
        if (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            if (liveDocs != null && liveDocs.get(postingsEnum.docID())) {
                continue;
            }
            return reader.document(postingsEnum.docID());
        }
    }
    return null;
}
Also used : Bits(org.apache.lucene.util.Bits) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)
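
A caller only needs the Terms for a field from the same leaf reader. A minimal, hypothetical call site (sampleDocForField and the fieldName parameter are placeholders, not part of the handler) could look like this:

// Illustrative sketch: return some live document that contains the given field, or null.
static Document sampleDocForField(LeafReader reader, String fieldName) throws IOException {
    Terms terms = reader.terms(fieldName);
    // terms is null when no document in this segment has the field indexed
    return terms == null ? null : getFirstLiveDoc(terms, reader);
}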

Example 79 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project SearchServices by Alfresco.

the class SolrPathScorer method createPathScorer.

public static SolrPathScorer createPathScorer(SolrPathQuery solrPathQuery, LeafReaderContext context, Weight weight, DictionaryService dictionaryService, boolean repeat) throws IOException {
    if (solrPathQuery.getPathStructuredFieldPositions().size() == 0) {
        ArrayList<StructuredFieldPosition> answer = new ArrayList<StructuredFieldPosition>(2);
        answer.add(new SelfAxisStructuredFieldPosition());
        answer.add(new SelfAxisStructuredFieldPosition());
        solrPathQuery.appendQuery(answer);
    }
    for (StructuredFieldPosition sfp : solrPathQuery.getPathStructuredFieldPositions()) {
        if (sfp.getTermText() != null) {
            PostingsEnum p = context.reader().postings(new Term(solrPathQuery.getPathField(), sfp.getTermText()), PostingsEnum.POSITIONS);
            if (p == null)
                return null;
            CachingTermPositions ctp = new CachingTermPositions(p);
            sfp.setCachingTermPositions(ctp);
        }
    }
    SolrContainerScorer cs = null;
    PostingsEnum rootContainerPositions = null;
    if (solrPathQuery.getPathRootTerm() != null) {
        rootContainerPositions = context.reader().postings(solrPathQuery.getPathRootTerm(), PostingsEnum.POSITIONS);
    }
    if (solrPathQuery.getPathStructuredFieldPositions().size() > 0) {
        cs = new SolrContainerScorer(weight, rootContainerPositions, (StructuredFieldPosition[]) solrPathQuery.getPathStructuredFieldPositions().toArray(new StructuredFieldPosition[] {}));
    }
    return new SolrPathScorer(weight, cs);
}
Also used : ArrayList(java.util.ArrayList) Term(org.apache.lucene.index.Term) PostingsEnum(org.apache.lucene.index.PostingsEnum)
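
The CachingTermPositions wrapper is Alfresco-specific and not reproduced here. What the PostingsEnum.POSITIONS flag used above provides is per-document position iteration; a minimal sketch of consuming such an enum (printPositions, the field and the term text are placeholders) looks like this:

// Sketch: walk the positions of one term in a single segment using the same
// PostingsEnum.POSITIONS flag as createPathScorer above.
static void printPositions(LeafReaderContext context, String field, String termText) throws IOException {
    PostingsEnum postings = context.reader().postings(new Term(field, termText), PostingsEnum.POSITIONS);
    if (postings == null) {
        // the term does not occur in this segment
        return;
    }
    int doc;
    while ((doc = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        for (int i = 0; i < postings.freq(); i++) {
            System.out.println("doc=" + doc + " position=" + postings.nextPosition());
        }
    }
}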

Example 80 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project crate by crate.

the class ShardSplittingQuery method findSplitDocs.

private static void findSplitDocs(String idField, Predicate<BytesRef> includeInShard, LeafReader leafReader, IntConsumer consumer) throws IOException {
    Terms terms = leafReader.terms(idField);
    TermsEnum iterator = terms.iterator();
    BytesRef idTerm;
    PostingsEnum postingsEnum = null;
    while ((idTerm = iterator.next()) != null) {
        if (includeInShard.test(idTerm) == false) {
            postingsEnum = iterator.postings(postingsEnum);
            int doc;
            while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                consumer.accept(doc);
            }
        }
    }
}
Also used : Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)
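
The caller only supplies the predicate and an IntConsumer. For instance, collecting every matching doc into a FixedBitSet (an illustrative sketch with a hypothetical docsToDelete helper, not crate's actual call site) could look like this:

// Illustrative sketch: gather the docIDs of all documents whose id term does NOT belong
// in the target shard, so they can be marked as deleted afterwards.
static FixedBitSet docsToDelete(LeafReader leafReader, String idField, Predicate<BytesRef> includeInShard) throws IOException {
    FixedBitSet bitSet = new FixedBitSet(leafReader.maxDoc());
    findSplitDocs(idField, includeInShard, leafReader, bitSet::set);
    return bitSet;
}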

Aggregations

PostingsEnum (org.apache.lucene.index.PostingsEnum)80 BytesRef (org.apache.lucene.util.BytesRef)59 TermsEnum (org.apache.lucene.index.TermsEnum)56 Terms (org.apache.lucene.index.Terms)47 Fields (org.apache.lucene.index.Fields)18 LeafReader (org.apache.lucene.index.LeafReader)17 Term (org.apache.lucene.index.Term)17 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)15 Document (org.apache.lucene.document.Document)13 ArrayList (java.util.ArrayList)12 Bits (org.apache.lucene.util.Bits)11 IndexReader (org.apache.lucene.index.IndexReader)10 TextField (org.apache.lucene.document.TextField)9 Directory (org.apache.lucene.store.Directory)9 IOException (java.io.IOException)8 DirectoryReader (org.apache.lucene.index.DirectoryReader)7 IndexWriter (org.apache.lucene.index.IndexWriter)6 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)6 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)5 XContentBuilder (org.elasticsearch.common.xcontent.XContentBuilder)5