Search in sources :

Example 46 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class TestRTGBase method getFirstMatch.

/**
 * Finds the first live document that contains the given term and asserts
 * that no second live document contains it.
 *
 * @param r reader to search
 * @param t term to look up (its field and bytes are used)
 * @return the doc id of the matching live document, or {@code -1} when the
 *         field has no terms, the term is absent, or no live document matches
 * @throws IOException if reading the index fails
 */
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
    final Terms fieldTerms = MultiFields.getTerms(r, t.field());
    if (fieldTerms == null) {
        return -1;
    }

    final BytesRef wanted = t.bytes();
    final TermsEnum te = fieldTerms.iterator();
    if (!te.seekExact(wanted)) {
        return -1;
    }

    // Doc ids only (PostingsEnum.NONE), filtered through the reader's live docs.
    final PostingsEnum liveMatches = BitsFilteredPostingsEnum.wrap(
        te.postings(null, PostingsEnum.NONE), MultiFields.getLiveDocs(r));

    final int firstDoc = liveMatches.nextDoc();
    if (firstDoc == DocIdSetIterator.NO_MORE_DOCS) {
        return -1;
    }

    // A match was found; there must not be a second one.
    final int secondDoc = liveMatches.nextDoc();
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, secondDoc);
    return firstDoc;
}
Also used : Terms(org.apache.lucene.index.Terms) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 47 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class TestPerfTasksLogic method testReadTokens.

/**
 * Test ReadTokensTask.
 *
 * Runs an algorithm that first applies ReadTokens to NUM_DOCS documents and
 * then indexes the same documents, and checks that the token count reported
 * by the ReadTokens task statistics equals the sum of term frequencies found
 * in the resulting index (ignoring the id, date-msec and time-sec fields).
 */
public void testReadTokens() throws Exception {
    // We will call ReadTokens on this many docs
    final int NUM_DOCS = 20;
    // Read tokens from first NUM_DOCS docs from Reuters and
    // then build index from the same docs
    String[] algLines1 = { "# ----- properties ", "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer", "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource", "docs.file=" + getReuters20LinesFile(), "# ----- alg ", "{ReadTokens}: " + NUM_DOCS, "ResetSystemErase", "CreateIndex", "{AddDoc}: " + NUM_DOCS, "CloseIndex" };
    // Run algo
    Benchmark benchmark = execBenchmark(algLines1);
    List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();
    // Count how many tokens all ReadTokens saw
    int totalTokenCount1 = 0;
    for (final TaskStats stat : stats) {
        if (stat.getTask().getName().equals("ReadTokens")) {
            totalTokenCount1 += stat.getCount();
        }
    }
    // Separately count how many tokens are actually in the index.
    // try-with-resources guarantees the reader is released even when an
    // assertion or an I/O error aborts the enumeration below.
    int totalTokenCount2 = 0;
    try (IndexReader reader = DirectoryReader.open(benchmark.getRunData().getDirectory())) {
        assertEquals(NUM_DOCS, reader.numDocs());
        Fields fields = MultiFields.getFields(reader);
        for (String fieldName : fields) {
            // These fields are skipped when counting tokens.
            if (fieldName.equals(DocMaker.ID_FIELD) || fieldName.equals(DocMaker.DATE_MSEC_FIELD) || fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
                continue;
            }
            Terms terms = fields.terms(fieldName);
            if (terms == null) {
                continue;
            }
            TermsEnum termsEnum = terms.iterator();
            // Passed back into TestUtil.docs on every term so it may be reused.
            PostingsEnum docs = null;
            while (termsEnum.next() != null) {
                docs = TestUtil.docs(random(), termsEnum, docs, PostingsEnum.FREQS);
                while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                    totalTokenCount2 += docs.freq();
                }
            }
        }
    }
    // Make sure they are the same
    assertEquals(totalTokenCount1, totalTokenCount2);
}
Also used : Fields(org.apache.lucene.index.Fields) MultiFields(org.apache.lucene.index.MultiFields) IndexReader(org.apache.lucene.index.IndexReader) Terms(org.apache.lucene.index.Terms) TaskStats(org.apache.lucene.benchmark.byTask.stats.TaskStats) PostingsEnum(org.apache.lucene.index.PostingsEnum) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 48 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class TestDocTermOrds method verify.

/**
 * Checks that a {@link DocTermOrds} built over "field" of {@code r} yields,
 * for every document, exactly the terms recorded in {@code idToOrds}.
 *
 * @param r          reader to verify
 * @param idToOrds   for each value of the "id" field, the indexes into
 *                   {@code termsArray} of the terms that document is expected to have, in order
 * @param termsArray the expected term bytes, indexed by the entries of {@code idToOrds}
 * @param prefixRef  prefix passed to {@link DocTermOrds}, or {@code null} for none
 * @throws Exception on any failure
 */
private void verify(LeafReader r, int[][] idToOrds, BytesRef[] termsArray, BytesRef prefixRef) throws Exception {
    final DocTermOrds dto = new DocTermOrds(r, r.getLiveDocs(), "field", prefixRef, Integer.MAX_VALUE, TestUtil.nextInt(random(), 2, 10));
    final NumericDocValues docIDToID = FieldCache.DEFAULT.getNumerics(r, "id", FieldCache.LEGACY_INT_PARSER);
    if (VERBOSE) {
        System.out.println("TEST: verify prefix=" + (prefixRef == null ? "null" : prefixRef.utf8ToString()));
        System.out.println("TEST: all TERMS:");
        // The field may have no terms at all (the numTerms() == 0 branch below
        // expects that case), so guard against null before iterating.
        final Terms allTerms = MultiFields.getTerms(r, "field");
        if (allTerms != null) {
            TermsEnum allTE = allTerms.iterator();
            int ord = 0;
            while (allTE.next() != null) {
                System.out.println("  ord=" + (ord++) + " term=" + allTE.term().utf8ToString());
            }
        }
    }
    final TermsEnum te = dto.getOrdTermsEnum(r);
    if (dto.numTerms() == 0) {
        // Nothing was un-inverted: confirm that the index really holds no
        // matching terms, then stop.
        final Terms terms = MultiFields.getTerms(r, "field");
        if (prefixRef == null) {
            assertNull(terms);
        } else if (terms != null) {
            TermsEnum termsEnum = terms.iterator();
            TermsEnum.SeekStatus result = termsEnum.seekCeil(prefixRef);
            if (result != TermsEnum.SeekStatus.END) {
                // The first term at or after the prefix must not carry the prefix.
                assertFalse("term=" + termsEnum.term().utf8ToString() + " matches prefix=" + prefixRef.utf8ToString(), StringHelper.startsWith(termsEnum.term(), prefixRef));
            }
        }
        return;
    }
    if (VERBOSE) {
        System.out.println("TEST: TERMS:");
        te.seekExact(0);
        while (true) {
            System.out.println("  ord=" + te.ord() + " term=" + te.term().utf8ToString());
            if (te.next() == null) {
                break;
            }
        }
    }
    SortedSetDocValues iter = dto.iterator(r);
    for (int docID = 0; docID < r.maxDoc(); docID++) {
        // Every document is expected to have an "id" value.
        assertEquals(docID, docIDToID.nextDoc());
        // Advance the ord iterator only when it is behind the current doc.
        if (docID > iter.docID()) {
            iter.nextDoc();
        }
        if (docID < iter.docID()) {
            // The iterator skipped this doc, so it must have no expected terms.
            int[] answers = idToOrds[(int) docIDToID.longValue()];
            assertEquals(0, answers.length);
            continue;
        }
        if (VERBOSE) {
            System.out.println("TEST: docID=" + docID + " of " + r.maxDoc() + " (id=" + docIDToID.longValue() + ")");
        }
        final int[] answers = idToOrds[(int) docIDToID.longValue()];
        int upto = 0;
        long ord;
        while ((ord = iter.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
            // Resolve the ord back to its term and compare with the expected one.
            te.seekExact(ord);
            final BytesRef expected = termsArray[answers[upto++]];
            if (VERBOSE) {
                System.out.println("  exp=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString());
            }
            assertEquals("expected=" + expected.utf8ToString() + " actual=" + te.term().utf8ToString() + " ord=" + ord, expected, te.term());
        }
        // All expected terms must have been seen.
        assertEquals(answers.length, upto);
    }
}
Also used : NumericDocValues(org.apache.lucene.index.NumericDocValues) SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) SeekStatus(org.apache.lucene.index.TermsEnum.SeekStatus) Terms(org.apache.lucene.index.Terms) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 49 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class TestFaceting method doTermEnum.

/**
 * Rebuilds the index with {@code size} values and exercises the
 * {@link TermsEnum} of the sorted-set doc values for {@code proto.field()}:
 * seeking by term string, seeking before the first term, and seeking by ordinal.
 *
 * @param size number of values to index; also seeds the random generator
 * @throws Exception on any failure
 */
void doTermEnum(int size) throws Exception {
    close();
    createIndex(size);
    req = lrf.makeRequest("q", "*:*");

    final SortedSetDocValues values = DocValues.getSortedSet(req.getSearcher().getSlowAtomicReader(), proto.field());
    assertEquals(size, values.getValueCount());

    final TermsEnum te = values.termsEnum();
    final Random r = new Random(size);
    final int iterations = size * 2 + 10;

    // Seek by term string; rnum may be up to size + 1, i.e. past the last term.
    for (int i = 0; i < iterations; i++) {
        final int rnum = r.nextInt(size + 2);
        final String s = t(rnum);

        BytesRef br = null;
        if (te != null && te.seekCeil(new BytesRef(s)) != TermsEnum.SeekStatus.END) {
            br = te.term();
        }

        // A term is found exactly when rnum addresses an existing value.
        assertEquals(br != null, rnum < size);
        if (rnum < size) {
            assertEquals(rnum, (int) te.ord());
            assertEquals(s, te.term().utf8ToString());
        }
    }

    // The remaining checks need at least one term.
    if (size <= 0) {
        return;
    }

    // Seeking to "000" is expected to land on the first term.
    assertEquals(size > 0, te.seekCeil(new BytesRef("000")) != TermsEnum.SeekStatus.END);
    assertEquals(0, te.ord());
    assertEquals(t(0), te.term().utf8ToString());

    // Seek by ordinal.
    for (int i = 0; i < iterations; i++) {
        final int rnum = r.nextInt(size);
        final String s = t(rnum);
        te.seekExact((long) rnum);
        final BytesRef br = te.term();
        assertNotNull(br);
        assertEquals(rnum, (int) te.ord());
        assertEquals(s, te.term().utf8ToString());
    }
}
Also used : SortedSetDocValues(org.apache.lucene.index.SortedSetDocValues) Random(java.util.Random) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 50 with TermsEnum

use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.

the class TestTeeSinkTokenFilter method testEndOffsetPositionWithTeeSinkTokenFilter.

// LUCENE-1448
// TODO: instead of testing it this way, we can test 
// with BaseTokenStreamTestCase now...
/**
 * Indexes one document with two instances of "field" -- one fed by a
 * {@link TeeSinkTokenFilter} and one by its sink -- and checks the term
 * vector offsets recorded for both occurrences of the single term.
 *
 * All closeable resources are managed with try-with-resources so they are
 * released even when an assertion fails part-way through.
 */
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
    try (Directory dir = newDirectory();
         Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)) {
        try (IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer))) {
            Document doc = new Document();
            TokenStream tokenStream = analyzer.tokenStream("field", "abcd   ");
            TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
            TokenStream sink = tee.newSinkTokenStream();
            // Term vectors with positions and offsets are what the assertions below read.
            FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
            ft.setStoreTermVectors(true);
            ft.setStoreTermVectorOffsets(true);
            ft.setStoreTermVectorPositions(true);
            Field f1 = new Field("field", tee, ft);
            Field f2 = new Field("field", sink, ft);
            doc.add(f1);
            doc.add(f2);
            w.addDocument(doc);
        }
        try (IndexReader r = DirectoryReader.open(dir)) {
            Terms vector = r.getTermVectors(0).terms("field");
            assertEquals(1, vector.size());
            TermsEnum termsEnum = vector.iterator();
            termsEnum.next();
            // The one term occurs twice: once per field instance.
            assertEquals(2, termsEnum.totalTermFreq());
            PostingsEnum positions = termsEnum.postings(null, PostingsEnum.ALL);
            assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
            assertEquals(2, positions.freq());
            // First occurrence (tee).
            positions.nextPosition();
            assertEquals(0, positions.startOffset());
            assertEquals(4, positions.endOffset());
            // Second occurrence (sink).
            positions.nextPosition();
            assertEquals(8, positions.startOffset());
            assertEquals(12, positions.endOffset());
            assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
        }
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) Terms(org.apache.lucene.index.Terms) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Document(org.apache.lucene.document.Document) FieldType(org.apache.lucene.document.FieldType) TermsEnum(org.apache.lucene.index.TermsEnum) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) IndexReader(org.apache.lucene.index.IndexReader) PostingsEnum(org.apache.lucene.index.PostingsEnum) Directory(org.apache.lucene.store.Directory)

Aggregations

TermsEnum (org.apache.lucene.index.TermsEnum)155 BytesRef (org.apache.lucene.util.BytesRef)116 Terms (org.apache.lucene.index.Terms)103 PostingsEnum (org.apache.lucene.index.PostingsEnum)52 ArrayList (java.util.ArrayList)31 Term (org.apache.lucene.index.Term)31 IndexReader (org.apache.lucene.index.IndexReader)29 LeafReader (org.apache.lucene.index.LeafReader)28 IOException (java.io.IOException)26 Fields (org.apache.lucene.index.Fields)26 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)25 Document (org.apache.lucene.document.Document)24 Directory (org.apache.lucene.store.Directory)24 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)19 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)18 HashMap (java.util.HashMap)12 HashSet (java.util.HashSet)11 DirectoryReader (org.apache.lucene.index.DirectoryReader)11 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)10 Bits (org.apache.lucene.util.Bits)10