Search in sources:

Example 11 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project elasticsearch by elastic.

the class GetTermVectorsIT method compareTermVectors.

/**
 * Asserts that the term vectors stored under {@code fieldName} are the same in both
 * {@link Fields} instances: same terms in the same order, same document frequency and
 * total term frequency, and same positions and offsets for the first posting of each term.
 *
 * @param fieldName name of the field whose terms are compared
 * @param fields0   first set of fields; its term text is used in the assertion messages
 * @param fields1   second set of fields, compared against {@code fields0}
 * @throws IOException if one of the underlying enums fails while iterating
 */
private void compareTermVectors(String fieldName, Fields fields0, Fields fields1) throws IOException {
    final Terms leftTerms = fields0.terms(fieldName);
    final Terms rightTerms = fields1.terms(fieldName);
    assertThat(leftTerms, notNullValue());
    assertThat(rightTerms, notNullValue());
    assertThat(leftTerms.size(), equalTo(rightTerms.size()));

    final TermsEnum leftEnum = leftTerms.iterator();
    final TermsEnum rightEnum = rightTerms.iterator();
    int termIndex = 0;
    while (termIndex < leftTerms.size()) {
        final BytesRef leftRef = leftEnum.next();
        assertThat(leftRef, notNullValue());
        final BytesRef rightRef = rightEnum.next();
        assertThat(rightRef, notNullValue());

        // the term text itself must match
        final String leftText = leftRef.utf8ToString();
        final String rightText = rightRef.utf8ToString();
        assertThat("expected: " + leftText, leftText, equalTo(rightText));

        // every remaining assertion for this term reports the same reason
        final String reason = "term: " + leftText;

        // document frequency and total term frequency
        assertThat(reason, leftEnum.docFreq(), equalTo(rightEnum.docFreq()));
        assertThat(reason, leftEnum.totalTermFreq(), equalTo(rightEnum.totalTermFreq()));

        // first document and the term's frequency within it
        final PostingsEnum leftPostings = leftEnum.postings(null, PostingsEnum.ALL);
        final PostingsEnum rightPostings = rightEnum.postings(null, PostingsEnum.ALL);
        assertThat(reason, leftPostings.nextDoc(), equalTo(rightPostings.nextDoc()));
        assertThat(reason, leftPostings.freq(), equalTo(rightPostings.freq()));

        // one position plus start/end offset per occurrence
        for (int occurrence = 0; occurrence < leftPostings.freq(); occurrence++) {
            assertThat(reason, leftPostings.nextPosition(), equalTo(rightPostings.nextPosition()));
            assertThat(reason, leftPostings.startOffset(), equalTo(rightPostings.startOffset()));
            assertThat(reason, leftPostings.endOffset(), equalTo(rightPostings.endOffset()));
        }
        termIndex++;
    }

    // both enums must be exhausted once all terms have been compared
    assertThat(leftEnum.next(), nullValue());
    assertThat(rightEnum.next(), nullValue());
}
Also used : Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 12 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project textdb by TextDB.

the class DataReader method buildPayloadFromTermVector.

/**
 * Builds the list of {@link Span}s for one document from the term vectors of its TEXT
 * attributes. One span is created per term occurrence and carries the attribute name, the
 * character offsets, the analyzed term, the original text between those offsets and the
 * token position.
 *
 * @param fields field values of the document, indexed via {@code inputSchema.getIndex(attributeName)}
 * @param docID  id of the document whose term vectors are read from {@code luceneIndexReader}
 * @return the spans collected for all TEXT attributes; empty if none has a term vector
 * @throws IOException if reading the term vector fails
 */
private ArrayList<Span> buildPayloadFromTermVector(List<IField> fields, int docID) throws IOException {
    ArrayList<Span> payloadSpanList = new ArrayList<>();
    for (Attribute attr : inputSchema.getAttributes()) {
        String attributeName = attr.getAttributeName();
        AttributeType attributeType = attr.getAttributeType();
        // Only TEXT attributes contribute to the payload; every other type is skipped.
        if (attributeType != AttributeType.TEXT) {
            continue;
        }
        String fieldValue = fields.get(inputSchema.getIndex(attributeName)).getValue().toString();
        Terms termVector = luceneIndexReader.getTermVector(docID, attributeName);
        if (termVector == null) {
            continue;
        }
        TermsEnum termsEnum = termVector.iterator();
        PostingsEnum termPostings = null;
        // go through document terms
        while ((termsEnum.next()) != null) {
            termPostings = termsEnum.postings(termPostings, PostingsEnum.ALL);
            if (termPostings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                continue;
            }
            // The analyzed term is the same for every occurrence, so decode it once per
            // term instead of once per position.
            String analyzedTermStr = termsEnum.term().utf8ToString();
            // for each term, go through its postings
            for (int i = 0; i < termPostings.freq(); i++) {
                // nextPosition needs to be called first
                int tokenPosition = termPostings.nextPosition();
                int charStart = termPostings.startOffset();
                int charEnd = termPostings.endOffset();
                // NOTE(review): assumes the term vector was indexed with offsets; if it was
                // not, the offsets are presumably negative and substring would throw — confirm.
                String originalTermStr = fieldValue.substring(charStart, charEnd);
                Span span = new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition);
                payloadSpanList.add(span);
            }
        }
    }
    return payloadSpanList;
}
Also used : Attribute(edu.uci.ics.textdb.api.schema.Attribute) AttributeType(edu.uci.ics.textdb.api.schema.AttributeType) ArrayList(java.util.ArrayList) Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) Span(edu.uci.ics.textdb.api.span.Span) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 13 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class TestBlockPostingsFormat3 method assertTermsEnum.

/**
 * Walks both terms enums in lockstep and asserts that they agree on every term and on
 * the term stats. If {@code deep} is false this is a 'shallow' test that doesn't go down
 * to the postings enums; otherwise the postings are compared as well, including the
 * positional variants when {@code hasPositions} is true.
 */
public void assertTermsEnum(TermsEnum leftTermsEnum, TermsEnum rightTermsEnum, boolean deep, boolean hasPositions) throws Exception {
    // positional flag sets, checked in this order:
    // payloads + offsets, payloads only, offsets only, positions only
    final int[] positionalFlags = { PostingsEnum.ALL, PostingsEnum.PAYLOADS, PostingsEnum.OFFSETS, PostingsEnum.POSITIONS };

    // reused across terms so the enums can recycle their postings instances
    PostingsEnum leftPos = null;
    PostingsEnum rightPos = null;
    PostingsEnum leftDocsOnly = null;
    PostingsEnum rightDocsOnly = null;

    for (BytesRef current = leftTermsEnum.next(); current != null; current = leftTermsEnum.next()) {
        assertEquals(current, rightTermsEnum.next());
        assertTermStats(leftTermsEnum, rightTermsEnum);
        if (!deep) {
            continue;
        }

        if (hasPositions) {
            for (int flags : positionalFlags) {
                // sequential comparison
                leftPos = leftTermsEnum.postings(leftPos, flags);
                rightPos = rightTermsEnum.postings(rightPos, flags);
                assertDocsAndPositionsEnum(leftPos, rightPos);

                // skipping comparison on freshly pulled enums
                int docFreq = leftTermsEnum.docFreq();
                leftPos = leftTermsEnum.postings(leftPos, flags);
                rightPos = rightTermsEnum.postings(rightPos, flags);
                assertPositionsSkipping(docFreq, leftPos, rightPos);
            }
        }

        // sequential, with freqs:
        leftDocsOnly = leftTermsEnum.postings(leftDocsOnly);
        rightDocsOnly = rightTermsEnum.postings(rightDocsOnly);
        assertDocsEnum(leftDocsOnly, rightDocsOnly);

        // sequential, w/o freqs:
        leftDocsOnly = leftTermsEnum.postings(leftDocsOnly, PostingsEnum.NONE);
        rightDocsOnly = rightTermsEnum.postings(rightDocsOnly, PostingsEnum.NONE);
        assertDocsEnum(leftDocsOnly, rightDocsOnly);

        // skipping, with freqs:
        int docFreqWithFreqs = leftTermsEnum.docFreq();
        leftDocsOnly = leftTermsEnum.postings(leftDocsOnly);
        rightDocsOnly = rightTermsEnum.postings(rightDocsOnly);
        assertDocsSkipping(docFreqWithFreqs, leftDocsOnly, rightDocsOnly);

        // skipping, w/o freqs:
        int docFreqNoFreqs = leftTermsEnum.docFreq();
        leftDocsOnly = leftTermsEnum.postings(leftDocsOnly, PostingsEnum.NONE);
        rightDocsOnly = rightTermsEnum.postings(rightDocsOnly, PostingsEnum.NONE);
        assertDocsSkipping(docFreqNoFreqs, leftDocsOnly, rightDocsOnly);
    }

    // the right enum must be exhausted at the same point as the left one
    assertNull(rightTermsEnum.next());
}
Also used : PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef)

Example 14 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class TestMultiPhraseEnum method testOneDocument.

/**
 * Tests union on one document: a single document containing "foo bar" must yield one
 * union posting for doc 0 with frequency 2 and positions 0 and 1.
 */
public void testOneDocument() throws IOException {
    Directory directory = newDirectory();
    IndexWriterConfig config = newIndexWriterConfig();
    config.setMergePolicy(newLogMergePolicy());
    IndexWriter indexWriter = new IndexWriter(directory, config);

    Document fooBar = new Document();
    fooBar.add(new TextField("field", "foo bar", Field.Store.NO));
    indexWriter.addDocument(fooBar);

    DirectoryReader reader = DirectoryReader.open(indexWriter);
    indexWriter.close();

    // one postings enum per term, merged by the union under test
    LeafReader leaf = getOnlyLeafReader(reader);
    Term fooTerm = new Term("field", "foo");
    Term barTerm = new Term("field", "bar");
    PostingsEnum fooPostings = leaf.postings(fooTerm, PostingsEnum.POSITIONS);
    PostingsEnum barPostings = leaf.postings(barTerm, PostingsEnum.POSITIONS);
    PostingsEnum merged = new MultiPhraseQuery.UnionPostingsEnum(Arrays.asList(fooPostings, barPostings));

    // unpositioned before the first nextDoc()
    assertEquals(-1, merged.docID());

    // doc 0 holds both terms
    assertEquals(0, merged.nextDoc());
    assertEquals(2, merged.freq());
    assertEquals(0, merged.nextPosition());
    assertEquals(1, merged.nextPosition());

    assertEquals(DocIdSetIterator.NO_MORE_DOCS, merged.nextDoc());

    reader.close();
    directory.close();
}
Also used : IndexWriter(org.apache.lucene.index.IndexWriter) DirectoryReader(org.apache.lucene.index.DirectoryReader) TextField(org.apache.lucene.document.TextField) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) PostingsEnum(org.apache.lucene.index.PostingsEnum) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 15 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class TestMultiPhraseEnum method testSomeDocuments.

/**
 * Tests union on a few documents: four documents ("foo", empty, "foo bar", "bar") are
 * force-merged into one segment, and the union of the "foo" and "bar" postings must
 * visit docs 0, 2 and 3 with the expected frequencies and positions.
 */
public void testSomeDocuments() throws IOException {
    Directory directory = newDirectory();
    IndexWriterConfig config = newIndexWriterConfig();
    config.setMergePolicy(newLogMergePolicy());
    IndexWriter indexWriter = new IndexWriter(directory, config);

    // doc 0: "foo"
    Document fooOnly = new Document();
    fooOnly.add(new TextField("field", "foo", Field.Store.NO));
    indexWriter.addDocument(fooOnly);

    // doc 1: no fields at all
    indexWriter.addDocument(new Document());

    // doc 2: "foo bar"
    Document fooBar = new Document();
    fooBar.add(new TextField("field", "foo bar", Field.Store.NO));
    indexWriter.addDocument(fooBar);

    // doc 3: "bar"
    Document barOnly = new Document();
    barOnly.add(new TextField("field", "bar", Field.Store.NO));
    indexWriter.addDocument(barOnly);

    indexWriter.forceMerge(1);
    DirectoryReader reader = DirectoryReader.open(indexWriter);
    indexWriter.close();

    // one postings enum per term, merged by the union under test
    LeafReader leaf = getOnlyLeafReader(reader);
    Term fooTerm = new Term("field", "foo");
    Term barTerm = new Term("field", "bar");
    PostingsEnum fooPostings = leaf.postings(fooTerm, PostingsEnum.POSITIONS);
    PostingsEnum barPostings = leaf.postings(barTerm, PostingsEnum.POSITIONS);
    PostingsEnum merged = new MultiPhraseQuery.UnionPostingsEnum(Arrays.asList(fooPostings, barPostings));

    // unpositioned before the first nextDoc()
    assertEquals(-1, merged.docID());

    // doc 0: only "foo"
    assertEquals(0, merged.nextDoc());
    assertEquals(1, merged.freq());
    assertEquals(0, merged.nextPosition());

    // doc 1 is skipped; doc 2 holds both terms
    assertEquals(2, merged.nextDoc());
    assertEquals(2, merged.freq());
    assertEquals(0, merged.nextPosition());
    assertEquals(1, merged.nextPosition());

    // doc 3: only "bar"
    assertEquals(3, merged.nextDoc());
    assertEquals(1, merged.freq());
    assertEquals(0, merged.nextPosition());

    assertEquals(DocIdSetIterator.NO_MORE_DOCS, merged.nextDoc());

    reader.close();
    directory.close();
}
Also used : IndexWriter(org.apache.lucene.index.IndexWriter) DirectoryReader(org.apache.lucene.index.DirectoryReader) TextField(org.apache.lucene.document.TextField) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) PostingsEnum(org.apache.lucene.index.PostingsEnum) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Aggregations

PostingsEnum (org.apache.lucene.index.PostingsEnum) 73, BytesRef (org.apache.lucene.util.BytesRef) 55, TermsEnum (org.apache.lucene.index.TermsEnum) 50, Terms (org.apache.lucene.index.Terms) 42, Fields (org.apache.lucene.index.Fields) 18, LeafReader (org.apache.lucene.index.LeafReader) 17, Term (org.apache.lucene.index.Term) 16, LeafReaderContext (org.apache.lucene.index.LeafReaderContext) 15, Document (org.apache.lucene.document.Document) 13, ArrayList (java.util.ArrayList) 10, IndexReader (org.apache.lucene.index.IndexReader) 10, TextField (org.apache.lucene.document.TextField) 9, Directory (org.apache.lucene.store.Directory) 9, Bits (org.apache.lucene.util.Bits) 9, IOException (java.io.IOException) 8, DirectoryReader (org.apache.lucene.index.DirectoryReader) 7, IndexWriter (org.apache.lucene.index.IndexWriter) 6, RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter) 6, IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig) 5, XContentBuilder (org.elasticsearch.common.xcontent.XContentBuilder) 5