
Example 16 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class TestCachingTokenFilter, method testCaching.

public void testCaching() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    AtomicInteger resetCount = new AtomicInteger(0);
    TokenStream stream = new TokenStream() {

        private int index = 0;

        private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

        private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        @Override
        public void reset() throws IOException {
            super.reset();
            resetCount.incrementAndGet();
        }

        @Override
        public boolean incrementToken() {
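            // 'tokens' is a field of the enclosing test class; from the assertions
            // below it holds { "term1", "term2", "term3", "term2" }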
            if (index == tokens.length) {
                return false;
            } else {
                clearAttributes();
                termAtt.append(tokens[index++]);
                offsetAtt.setOffset(0, 0);
                return true;
            }
        }
    };
    stream = new CachingTokenFilter(stream);
    doc.add(new TextField("preanalyzed", stream));
    // 1) we consume all tokens twice before we add the doc to the index
    assertFalse(((CachingTokenFilter) stream).isCached());
    stream.reset();
    assertFalse(((CachingTokenFilter) stream).isCached());
    checkTokens(stream);
    stream.reset();
    checkTokens(stream);
    assertTrue(((CachingTokenFilter) stream).isCached());
    // 2) now add the document to the index and verify if all tokens are indexed
    //    don't reset the stream here, the DocumentWriter should do that implicitly
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    PostingsEnum termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term1"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(1, termPositions.freq());
    assertEquals(0, termPositions.nextPosition());
    termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term2"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(2, termPositions.freq());
    assertEquals(1, termPositions.nextPosition());
    assertEquals(3, termPositions.nextPosition());
    termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term3"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(1, termPositions.freq());
    assertEquals(2, termPositions.nextPosition());
    reader.close();
    writer.close();
    // 3) reset stream and consume tokens again
    stream.reset();
    checkTokens(stream);
    assertEquals(1, resetCount.get());
    dir.close();
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), AtomicInteger (java.util.concurrent.atomic.AtomicInteger), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), IndexReader (org.apache.lucene.index.IndexReader), TextField (org.apache.lucene.document.TextField), Document (org.apache.lucene.document.Document), PostingsEnum (org.apache.lucene.index.PostingsEnum), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory)
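
The postings loop in this test is the canonical pattern the remaining examples repeat: fetch a positions-capable PostingsEnum for one term, advance documents with nextDoc(), then call nextPosition() exactly freq() times per document. A minimal sketch of that pattern using the same Lucene classes as the test (MultiFields, PostingsEnum, DocIdSetIterator, BytesRef); the method name and the field/term arguments are placeholders, not part of the original test:

static void walkPostings(IndexReader reader, String field, String term) throws IOException {
    // null when the field or term does not exist in the index
    PostingsEnum pe = MultiFields.getTermPositionsEnum(reader, field, new BytesRef(term));
    if (pe == null) {
        return;
    }
    while (pe.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        // freq() bounds how many times nextPosition() may be called for this doc
        int freq = pe.freq();
        for (int i = 0; i < freq; i++) {
            System.out.println("doc=" + pe.docID() + " pos=" + pe.nextPosition());
        }
    }
}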

Example 17 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class TestPositionIncrement, method testSetPosition.

public void testSetPosition() throws Exception {
    Analyzer analyzer = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new Tokenizer() {

                // TODO: use CannedTokenStream
                private final String[] TOKENS = { "1", "2", "3", "4", "5" };

                private final int[] INCREMENTS = { 1, 2, 1, 0, 1 };
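                // starting from a position of -1, these increments put the tokens
                // at absolute positions 0, 2, 3, 3, 4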

                private int i = 0;

                PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

                CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

                OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

                @Override
                public boolean incrementToken() {
                    if (i == TOKENS.length)
                        return false;
                    clearAttributes();
                    termAtt.append(TOKENS[i]);
                    offsetAtt.setOffset(i, i);
                    posIncrAtt.setPositionIncrement(INCREMENTS[i]);
                    i++;
                    return true;
                }

                @Override
                public void reset() throws IOException {
                    super.reset();
                    this.i = 0;
                }
            });
        }
    };
    Directory store = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), store, analyzer);
    Document d = new Document();
    d.add(newTextField("field", "bogus", Field.Store.YES));
    writer.addDocument(d);
    IndexReader reader = writer.getReader();
    writer.close();
    IndexSearcher searcher = newSearcher(reader);
    PostingsEnum pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("1"));
    pos.nextDoc();
    // first token should be at position 0
    assertEquals(0, pos.nextPosition());
    pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("2"));
    pos.nextDoc();
    // second token should be at position 2
    assertEquals(2, pos.nextPosition());
    PhraseQuery q;
    ScoreDoc[] hits;
    q = new PhraseQuery("field", "1", "2");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // same as previous, using the builder with implicit positions
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"));
    builder.add(new Term("field", "2"));
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // same as previous, just specify positions explicitly.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"), 0);
    builder.add(new Term("field", "2"), 1);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // specifying correct positions should find the phrase.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"), 0);
    builder.add(new Term("field", "2"), 2);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "2", "3");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "3", "4");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // the phrase query does find it once the correct (identical) positions are specified.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "3"), 0);
    builder.add(new Term("field", "4"), 0);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    // phrase query should fail for a non-existent term
    // even though another searched term exists at the same position.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "3"), 0);
    builder.add(new Term("field", "9"), 0);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // multi-phrase query should succeed for a non-existent term
    // because another of the searched terms exists at the same position.
    MultiPhraseQuery.Builder mqb = new MultiPhraseQuery.Builder();
    mqb.add(new Term[] { new Term("field", "3"), new Term("field", "9") }, 0);
    hits = searcher.search(mqb.build(), 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "2", "4");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "3", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "4", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "2", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    reader.close();
    store.close();
}
Also used: Analyzer (org.apache.lucene.analysis.Analyzer), MockPayloadAnalyzer (org.apache.lucene.analysis.MockPayloadAnalyzer), Document (org.apache.lucene.document.Document), PostingsEnum (org.apache.lucene.index.PostingsEnum), Tokenizer (org.apache.lucene.analysis.Tokenizer), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory), IOException (java.io.IOException), Term (org.apache.lucene.index.Term), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), IndexReader (org.apache.lucene.index.IndexReader), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)
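
What ties the INCREMENTS array to the assertions above is the position rule: each token's absolute position is the previous token's position plus its increment, and the position before the first token is -1. A hypothetical helper (not in the test) that makes the arithmetic explicit:

static int[] toPositions(int[] increments) {
    int[] positions = new int[increments.length];
    int pos = -1; // position before the first token
    for (int i = 0; i < increments.length; i++) {
        pos += increments[i];
        positions[i] = pos;
    }
    return positions;
}

// toPositions(new int[] { 1, 2, 1, 0, 1 }) -> { 0, 2, 3, 3, 4 },
// which is why "3" and "4" can be matched in the same position slot.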

Example 18 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class TestKeywordAnalyzer, method testMutipleDocument.

/*
  public void testPerFieldAnalyzer() throws Exception {
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer());
    analyzer.addAnalyzer("partnum", new KeywordAnalyzer());

    QueryParser queryParser = new QueryParser("description", analyzer);
    Query query = queryParser.parse("partnum:Q36 AND SPACE");

    ScoreDoc[] hits = searcher.search(query, null, 1000).scoreDocs;
    assertEquals("Q36 kept as-is",
              "+partnum:Q36 +space", query.toString("description"));
    assertEquals("doc found!", 1, hits.length);
  }
  */
public void testMutipleDocument() throws Exception {
    RAMDirectory dir = new RAMDirectory();
    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer));
    Document doc = new Document();
    doc.add(new TextField("partnum", "Q36", Field.Store.YES));
    writer.addDocument(doc);
    doc = new Document();
    doc.add(new TextField("partnum", "Q37", Field.Store.YES));
    writer.addDocument(doc);
    writer.close();
    IndexReader reader = DirectoryReader.open(dir);
    PostingsEnum td = TestUtil.docs(random(), reader, "partnum", new BytesRef("Q36"), null, 0);
    assertTrue(td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    td = TestUtil.docs(random(), reader, "partnum", new BytesRef("Q37"), null, 0);
    assertTrue(td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    reader.close();
    analyzer.close();
    dir.close();
}
Also used: IndexWriter (org.apache.lucene.index.IndexWriter), IndexReader (org.apache.lucene.index.IndexReader), TextField (org.apache.lucene.document.TextField), Analyzer (org.apache.lucene.analysis.Analyzer), Document (org.apache.lucene.document.Document), PostingsEnum (org.apache.lucene.index.PostingsEnum), RAMDirectory (org.apache.lucene.store.RAMDirectory), BytesRef (org.apache.lucene.util.BytesRef), IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)
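
The point of KeywordAnalyzer here is that the entire field value is emitted as a single token, so a part number like "Q36" reaches the index with its case and digits intact (the trailing 0 passed to TestUtil.docs requests docs only, no freqs or positions). A brief sketch of what that buys at query time; IndexSearcher, TermQuery, Term, and TopDocs are assumed here rather than taken from the example above:

IndexSearcher searcher = new IndexSearcher(reader);
// an exact TermQuery matches because "Q36" was indexed as one untokenized term
TopDocs hits = searcher.search(new TermQuery(new Term("partnum", "Q36")), 10);
// hits.totalHits == 1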

Example 19 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class TestTeeSinkTokenFilter, method testEndOffsetPositionWithTeeSinkTokenFilter.

// LUCENE-1448
// TODO: instead of testing it this way, we can test 
// with BaseTokenStreamTestCase now...
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
    Document doc = new Document();
    TokenStream tokenStream = analyzer.tokenStream("field", "abcd   ");
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
    TokenStream sink = tee.newSinkTokenStream();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);
    Field f1 = new Field("field", tee, ft);
    Field f2 = new Field("field", sink, ft);
    doc.add(f1);
    doc.add(f2);
    w.addDocument(doc);
    w.close();
    IndexReader r = DirectoryReader.open(dir);
    Terms vector = r.getTermVectors(0).terms("field");
    assertEquals(1, vector.size());
    TermsEnum termsEnum = vector.iterator();
    termsEnum.next();
    assertEquals(2, termsEnum.totalTermFreq());
    PostingsEnum positions = termsEnum.postings(null, PostingsEnum.ALL);
    assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(2, positions.freq());
    positions.nextPosition();
    assertEquals(0, positions.startOffset());
    assertEquals(4, positions.endOffset());
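    // the second occurrence comes from the sink field f2: offsets continue past
    // the first value's 7 chars ("abcd   ") plus the default offset gap of 1, hence 8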
    positions.nextPosition();
    assertEquals(8, positions.startOffset());
    assertEquals(12, positions.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
    r.close();
    dir.close();
    analyzer.close();
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), Terms (org.apache.lucene.index.Terms), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), Document (org.apache.lucene.document.Document), FieldType (org.apache.lucene.document.FieldType), TermsEnum (org.apache.lucene.index.TermsEnum), Field (org.apache.lucene.document.Field), TextField (org.apache.lucene.document.TextField), IndexWriter (org.apache.lucene.index.IndexWriter), IndexReader (org.apache.lucene.index.IndexReader), PostingsEnum (org.apache.lucene.index.PostingsEnum), Directory (org.apache.lucene.store.Directory)
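
TeeSinkTokenFilter exists so the same text is analyzed only once even when it feeds several fields: consuming the tee caches each token's attribute state, and every sink replays that state. A sketch of the general pattern; the field names are placeholders, and it assumes an open Analyzer and a Document as in the test:

TokenStream source = analyzer.tokenStream("body", "some text");
TeeSinkTokenFilter tee = new TeeSinkTokenFilter(source);
TokenStream sink1 = tee.newSinkTokenStream();
TokenStream sink2 = tee.newSinkTokenStream();
doc.add(new TextField("body", tee));    // consuming the tee feeds every sink
doc.add(new TextField("copy1", sink1)); // replayed without re-analyzing
doc.add(new TextField("copy2", sink2));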

Example 20 with PostingsEnum

use of org.apache.lucene.index.PostingsEnum in project lucene-solr by apache.

the class TestPerfTasksLogic, method testReadTokens.

/**
   * Test ReadTokensTask
   */
public void testReadTokens() throws Exception {
    // We will call ReadTokens on this many docs
    final int NUM_DOCS = 20;
    // Read tokens from first NUM_DOCS docs from Reuters and
    // then build index from the same docs
    String[] algLines1 = {
        "# ----- properties ",
        "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
        "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
        "docs.file=" + getReuters20LinesFile(),
        "# ----- alg ",
        "{ReadTokens}: " + NUM_DOCS,
        "ResetSystemErase",
        "CreateIndex",
        "{AddDoc}: " + NUM_DOCS,
        "CloseIndex"
    };
    // Run algo
    Benchmark benchmark = execBenchmark(algLines1);
    List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();
    // Count how many tokens all ReadTokens saw
    int totalTokenCount1 = 0;
    for (final TaskStats stat : stats) {
        if (stat.getTask().getName().equals("ReadTokens")) {
            totalTokenCount1 += stat.getCount();
        }
    }
    // Separately count how many tokens are actually in the index:
    IndexReader reader = DirectoryReader.open(benchmark.getRunData().getDirectory());
    assertEquals(NUM_DOCS, reader.numDocs());
    int totalTokenCount2 = 0;
    Fields fields = MultiFields.getFields(reader);
    for (String fieldName : fields) {
        if (fieldName.equals(DocMaker.ID_FIELD) || fieldName.equals(DocMaker.DATE_MSEC_FIELD) || fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
            continue;
        }
        Terms terms = fields.terms(fieldName);
        if (terms == null) {
            continue;
        }
        TermsEnum termsEnum = terms.iterator();
        PostingsEnum docs = null;
        while (termsEnum.next() != null) {
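            // passing the previous enum back in lets Lucene reuse it instead of
            // allocating a new one for every term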
            docs = TestUtil.docs(random(), termsEnum, docs, PostingsEnum.FREQS);
            while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                totalTokenCount2 += docs.freq();
            }
        }
    }
    reader.close();
    // Make sure they are the same
    assertEquals(totalTokenCount1, totalTokenCount2);
}
Also used: Fields (org.apache.lucene.index.Fields), MultiFields (org.apache.lucene.index.MultiFields), IndexReader (org.apache.lucene.index.IndexReader), Terms (org.apache.lucene.index.Terms), TaskStats (org.apache.lucene.benchmark.byTask.stats.TaskStats), PostingsEnum (org.apache.lucene.index.PostingsEnum), TermsEnum (org.apache.lucene.index.TermsEnum)

Aggregations

PostingsEnum (org.apache.lucene.index.PostingsEnum): 73
BytesRef (org.apache.lucene.util.BytesRef): 55
TermsEnum (org.apache.lucene.index.TermsEnum): 50
Terms (org.apache.lucene.index.Terms): 42
Fields (org.apache.lucene.index.Fields): 18
LeafReader (org.apache.lucene.index.LeafReader): 17
Term (org.apache.lucene.index.Term): 16
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 15
Document (org.apache.lucene.document.Document): 13
ArrayList (java.util.ArrayList): 10
IndexReader (org.apache.lucene.index.IndexReader): 10
TextField (org.apache.lucene.document.TextField): 9
Directory (org.apache.lucene.store.Directory): 9
Bits (org.apache.lucene.util.Bits): 9
IOException (java.io.IOException): 8
DirectoryReader (org.apache.lucene.index.DirectoryReader): 7
IndexWriter (org.apache.lucene.index.IndexWriter): 6
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 6
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 5
XContentBuilder (org.elasticsearch.common.xcontent.XContentBuilder): 5