Example 41 with PostingsEnum

Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

From class TestMultiPhraseEnum, method testOneDocument.

/** Tests union on one document */
public void testOneDocument() throws IOException {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig();
    iwc.setMergePolicy(newLogMergePolicy());
    IndexWriter writer = new IndexWriter(dir, iwc);
    Document doc = new Document();
    doc.add(new TextField("field", "foo bar", Field.Store.NO));
    writer.addDocument(doc);
    DirectoryReader ir = DirectoryReader.open(writer);
    writer.close();
    PostingsEnum p1 = getOnlyLeafReader(ir).postings(new Term("field", "foo"), PostingsEnum.POSITIONS);
    PostingsEnum p2 = getOnlyLeafReader(ir).postings(new Term("field", "bar"), PostingsEnum.POSITIONS);
    PostingsEnum union = new MultiPhraseQuery.UnionPostingsEnum(Arrays.asList(p1, p2));
    assertEquals(-1, union.docID());
    assertEquals(0, union.nextDoc());
    assertEquals(2, union.freq());
    assertEquals(0, union.nextPosition());
    assertEquals(1, union.nextPosition());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, union.nextDoc());
    ir.close();
    dir.close();
}
Also used: IndexWriter (org.apache.lucene.index.IndexWriter), DirectoryReader (org.apache.lucene.index.DirectoryReader), TextField (org.apache.lucene.document.TextField), Term (org.apache.lucene.index.Term), Document (org.apache.lucene.document.Document), PostingsEnum (org.apache.lucene.index.PostingsEnum), Directory (org.apache.lucene.store.Directory), IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)
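
Before the next example, a minimal sketch of the iteration contract this test exercises, assuming a PostingsEnum obtained with at least the PostingsEnum.POSITIONS flag (the consume helper below is hypothetical, not part of the test class):

import java.io.IOException;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.search.DocIdSetIterator;

static void consume(PostingsEnum postings) throws IOException {
    // docID() is -1 before the first call to nextDoc()
    int doc;
    while ((doc = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        int freq = postings.freq();
        // nextPosition() may be called exactly freq() times per document
        for (int i = 0; i < freq; i++) {
            System.out.println("doc=" + doc + " position=" + postings.nextPosition());
        }
    }
}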

Example 42 with PostingsEnum

Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

From class TestMultiPhraseEnum, method testSomeDocuments.

/** Tests union on a few documents */
public void testSomeDocuments() throws IOException {
    Directory dir = newDirectory();
    IndexWriterConfig iwc = newIndexWriterConfig();
    iwc.setMergePolicy(newLogMergePolicy());
    IndexWriter writer = new IndexWriter(dir, iwc);
    Document doc = new Document();
    doc.add(new TextField("field", "foo", Field.Store.NO));
    writer.addDocument(doc);
    writer.addDocument(new Document());
    doc = new Document();
    doc.add(new TextField("field", "foo bar", Field.Store.NO));
    writer.addDocument(doc);
    doc = new Document();
    doc.add(new TextField("field", "bar", Field.Store.NO));
    writer.addDocument(doc);
    writer.forceMerge(1);
    DirectoryReader ir = DirectoryReader.open(writer);
    writer.close();
    PostingsEnum p1 = getOnlyLeafReader(ir).postings(new Term("field", "foo"), PostingsEnum.POSITIONS);
    PostingsEnum p2 = getOnlyLeafReader(ir).postings(new Term("field", "bar"), PostingsEnum.POSITIONS);
    PostingsEnum union = new MultiPhraseQuery.UnionPostingsEnum(Arrays.asList(p1, p2));
    assertEquals(-1, union.docID());
    assertEquals(0, union.nextDoc());
    assertEquals(1, union.freq());
    assertEquals(0, union.nextPosition());
    assertEquals(2, union.nextDoc());
    assertEquals(2, union.freq());
    assertEquals(0, union.nextPosition());
    assertEquals(1, union.nextPosition());
    assertEquals(3, union.nextDoc());
    assertEquals(1, union.freq());
    assertEquals(0, union.nextPosition());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, union.nextDoc());
    ir.close();
    dir.close();
}
Also used: IndexWriter (org.apache.lucene.index.IndexWriter), DirectoryReader (org.apache.lucene.index.DirectoryReader), TextField (org.apache.lucene.document.TextField), Term (org.apache.lucene.index.Term), Document (org.apache.lucene.document.Document), PostingsEnum (org.apache.lucene.index.PostingsEnum), Directory (org.apache.lucene.store.Directory), IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)
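
This variant calls forceMerge(1) so the index collapses to a single segment and getOnlyLeafReader(ir) is valid. On a multi-segment index the postings have to be pulled per leaf instead; a sketch under that assumption (walkLeaves is a hypothetical helper):

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;

static void walkLeaves(IndexReader reader, Term term) throws IOException {
    for (LeafReaderContext ctx : reader.leaves()) {
        PostingsEnum postings = ctx.reader().postings(term, PostingsEnum.POSITIONS);
        if (postings == null) {
            // the term does not occur in this segment
            continue;
        }
        // consume as in the sketch above; per-segment doc IDs map back to
        // top-level doc IDs by adding ctx.docBase
    }
}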

Example 43 with PostingsEnum

Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

From class TestCachingTokenFilter, method testCaching.

public void testCaching() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    AtomicInteger resetCount = new AtomicInteger(0);
    TokenStream stream = new TokenStream() {

        private int index = 0;

        private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

        private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        @Override
        public void reset() throws IOException {
            super.reset();
            resetCount.incrementAndGet();
        }

        @Override
        public boolean incrementToken() {
            if (index == tokens.length) {
                return false;
            } else {
                clearAttributes();
                termAtt.append(tokens[index++]);
                offsetAtt.setOffset(0, 0);
                return true;
            }
        }
    };
    stream = new CachingTokenFilter(stream);
    doc.add(new TextField("preanalyzed", stream));
    // 1) we consume all tokens twice before we add the doc to the index
    assertFalse(((CachingTokenFilter) stream).isCached());
    stream.reset();
    assertFalse(((CachingTokenFilter) stream).isCached());
    checkTokens(stream);
    stream.reset();
    checkTokens(stream);
    assertTrue(((CachingTokenFilter) stream).isCached());
    // 2) now add the document to the index and verify that all tokens are indexed
    //    don't reset the stream here; the DocumentsWriter should do that implicitly
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    PostingsEnum termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term1"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(1, termPositions.freq());
    assertEquals(0, termPositions.nextPosition());
    termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term2"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(2, termPositions.freq());
    assertEquals(1, termPositions.nextPosition());
    assertEquals(3, termPositions.nextPosition());
    termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term3"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(1, termPositions.freq());
    assertEquals(2, termPositions.nextPosition());
    reader.close();
    writer.close();
    // 3) reset stream and consume tokens again
    stream.reset();
    checkTokens(stream);
    assertEquals(1, resetCount.get());
    dir.close();
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), AtomicInteger (java.util.concurrent.atomic.AtomicInteger), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), IndexReader (org.apache.lucene.index.IndexReader), TextField (org.apache.lucene.document.TextField), Document (org.apache.lucene.document.Document), PostingsEnum (org.apache.lucene.index.PostingsEnum), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory)
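
The point of the test is that CachingTokenFilter propagates reset() to its input only once, when the cache is first filled; every later reset() replays the cached tokens. A minimal standalone sketch of that replay behavior, assuming a StandardAnalyzer (the replayTwice helper is hypothetical, not part of the test above):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

static void replayTwice() throws IOException {
    try (Analyzer analyzer = new StandardAnalyzer()) {
        TokenStream cached = new CachingTokenFilter(analyzer.tokenStream("field", "some text to analyze"));
        CharTermAttribute term = cached.addAttribute(CharTermAttribute.class);
        for (int pass = 0; pass < 2; pass++) {
            // the first reset() reaches the underlying analyzer; the second
            // only rewinds the cache, so the analyzer runs once in total
            cached.reset();
            while (cached.incrementToken()) {
                System.out.println("pass " + pass + ": " + term);
            }
            cached.end();
        }
        cached.close();
    }
}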

Example 44 with PostingsEnum

Use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

From class TestPositionIncrement, method testSetPosition.

public void testSetPosition() throws Exception {
    Analyzer analyzer = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new Tokenizer() {

                // TODO: use CannedTokenStream
                private final String[] TOKENS = { "1", "2", "3", "4", "5" };

                private final int[] INCREMENTS = { 1, 2, 1, 0, 1 };

                private int i = 0;

                PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

                CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

                OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

                @Override
                public boolean incrementToken() {
                    if (i == TOKENS.length)
                        return false;
                    clearAttributes();
                    termAtt.append(TOKENS[i]);
                    offsetAtt.setOffset(i, i);
                    posIncrAtt.setPositionIncrement(INCREMENTS[i]);
                    i++;
                    return true;
                }

                @Override
                public void reset() throws IOException {
                    super.reset();
                    this.i = 0;
                }
            });
        }
    };
    Directory store = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), store, analyzer);
    Document d = new Document();
    d.add(newTextField("field", "bogus", Field.Store.YES));
    writer.addDocument(d);
    IndexReader reader = writer.getReader();
    writer.close();
    IndexSearcher searcher = newSearcher(reader);
    PostingsEnum pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("1"));
    pos.nextDoc();
    // first token should be at position 0
    assertEquals(0, pos.nextPosition());
    pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("2"));
    pos.nextDoc();
    // second token should be at position 2
    assertEquals(2, pos.nextPosition());
    PhraseQuery q;
    ScoreDoc[] hits;
    q = new PhraseQuery("field", "1", "2");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // same as previous, using the builder with implicit positions
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"));
    builder.add(new Term("field", "2"));
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // same as previous, just specifying positions explicitly.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"), 0);
    builder.add(new Term("field", "2"), 1);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // specifying correct positions should find the phrase.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"), 0);
    builder.add(new Term("field", "2"), 2);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "2", "3");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "3", "4");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // the phrase query finds the match when the correct positions are specified.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "3"), 0);
    builder.add(new Term("field", "4"), 0);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    // a phrase query should fail for a term that does not exist,
    // even if another queried term exists at the same position.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "3"), 0);
    builder.add(new Term("field", "9"), 0);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // a multi-phrase query should succeed despite the non-existent term,
    // because another queried term exists at the same position.
    MultiPhraseQuery.Builder mqb = new MultiPhraseQuery.Builder();
    mqb.add(new Term[] { new Term("field", "3"), new Term("field", "9") }, 0);
    hits = searcher.search(mqb.build(), 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "2", "4");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "3", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "4", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "2", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    reader.close();
    store.close();
}
Also used: Analyzer (org.apache.lucene.analysis.Analyzer), MockPayloadAnalyzer (org.apache.lucene.analysis.MockPayloadAnalyzer), Document (org.apache.lucene.document.Document), PostingsEnum (org.apache.lucene.index.PostingsEnum), Tokenizer (org.apache.lucene.analysis.Tokenizer), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory), IOException (java.io.IOException), Term (org.apache.lucene.index.Term), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), IndexReader (org.apache.lucene.index.IndexReader), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)
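
The gap that makes the plain "1 2" phrase fail (the terms sit at positions 0 and 2) can also be bridged with slop instead of explicit positions; a minimal sketch using the same field and terms as the test (gapTolerantPhrase is a hypothetical helper name):

import org.apache.lucene.index.Term;
import org.apache.lucene.search.PhraseQuery;

static PhraseQuery gapTolerantPhrase() {
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"));
    builder.add(new Term("field", "2"));
    // slop 1 lets the two terms sit one extra position apart,
    // which covers the position increment of 2 before token "2"
    builder.setSlop(1);
    return builder.build();
}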

Example 45 with PostingsEnum

Use of org.apache.lucene.index.PostingsEnum in project textdb by TextDB.

From class DataReader, method buildPayloadFromTermVector.

private ArrayList<Span> buildPayloadFromTermVector(List<IField> fields, int docID) throws IOException {
    ArrayList<Span> payloadSpanList = new ArrayList<>();
    for (Attribute attr : inputSchema.getAttributes()) {
        String attributeName = attr.getName();
        AttributeType attributeType = attr.getType();
        // only TEXT fields contribute spans to the payload; skip the rest.
        if (attributeType != AttributeType.TEXT) {
            continue;
        }
        String fieldValue = fields.get(inputSchema.getIndex(attributeName)).getValue().toString();
        Terms termVector = luceneIndexReader.getTermVector(docID, attributeName);
        if (termVector == null) {
            continue;
        }
        TermsEnum termsEnum = termVector.iterator();
        PostingsEnum termPostings = null;
        // go through document terms
        while (termsEnum.next() != null) {
            termPostings = termsEnum.postings(termPostings, PostingsEnum.ALL);
            if (termPostings.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                continue;
            }
            // for each term, go through its postings
            for (int i = 0; i < termPostings.freq(); i++) {
                // nextPosition() must be called before startOffset()/endOffset()
                int tokenPosition = termPostings.nextPosition();
                int charStart = termPostings.startOffset();
                int charEnd = termPostings.endOffset();
                String analyzedTermStr = termsEnum.term().utf8ToString();
                String originalTermStr = fieldValue.substring(charStart, charEnd);
                Span span = new Span(attributeName, charStart, charEnd, analyzedTermStr, originalTermStr, tokenPosition);
                payloadSpanList.add(span);
            }
        }
    }
    return payloadSpanList;
}
Also used: Attribute (edu.uci.ics.texera.api.schema.Attribute), AttributeType (edu.uci.ics.texera.api.schema.AttributeType), ArrayList (java.util.ArrayList), Terms (org.apache.lucene.index.Terms), PostingsEnum (org.apache.lucene.index.PostingsEnum), Span (edu.uci.ics.texera.api.span.Span), TermsEnum (org.apache.lucene.index.TermsEnum)
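
This method reads positions and offsets from the term vector, so it only works if the TEXT fields were indexed with term vectors that store both; otherwise getTermVector returns null or the offsets are unavailable. A sketch of a matching field configuration (textFieldWithTermVector is a hypothetical helper name):

import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;

static Field textFieldWithTermVector(String name, String value) {
    FieldType type = new FieldType();
    type.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    type.setTokenized(true);
    // without these, the term vector is missing or lacks positions/offsets
    type.setStoreTermVectors(true);
    type.setStoreTermVectorPositions(true);
    type.setStoreTermVectorOffsets(true);
    type.freeze();
    return new Field(name, value, type);
}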

Aggregations

PostingsEnum (org.apache.lucene.index.PostingsEnum): 74 usages
BytesRef (org.apache.lucene.util.BytesRef): 55 usages
TermsEnum (org.apache.lucene.index.TermsEnum): 51 usages
Terms (org.apache.lucene.index.Terms): 43 usages
Fields (org.apache.lucene.index.Fields): 18 usages
LeafReader (org.apache.lucene.index.LeafReader): 17 usages
Term (org.apache.lucene.index.Term): 16 usages
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 15 usages
Document (org.apache.lucene.document.Document): 13 usages
ArrayList (java.util.ArrayList): 11 usages
IndexReader (org.apache.lucene.index.IndexReader): 10 usages
TextField (org.apache.lucene.document.TextField): 9 usages
Directory (org.apache.lucene.store.Directory): 9 usages
Bits (org.apache.lucene.util.Bits): 9 usages
IOException (java.io.IOException): 8 usages
DirectoryReader (org.apache.lucene.index.DirectoryReader): 7 usages
IndexWriter (org.apache.lucene.index.IndexWriter): 6 usages
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 6 usages
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 5 usages
XContentBuilder (org.elasticsearch.common.xcontent.XContentBuilder): 5 usages