Search in sources:

Example 41 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the Apache lucene-solr project.

From the class LukeRequestHandler, method getDetailedFieldInfo.

// Collect exhaustive statistics for a single field: the distinct term count, the
// top N terms by document frequency, and a log2-style histogram of docFreq values.
// This is a very expensive call -- every term of the field is visited -- so use it
// with caution, especially on large indexes!
@SuppressWarnings("unchecked")
private static void getDetailedFieldInfo(SolrQueryRequest req, String field, SimpleOrderedMap<Object> fieldMap) throws IOException {
    final SolrParams params = req.getParams();
    // Number of top terms the caller wants back (0 disables top-term collection).
    final int numTerms = params.getInt(NUMTERMS, DEFAULT_COUNT);
    // Priority queue sized one past the limit so we can push, then pop the smallest.
    final TopTermQueue tiq = new TopTermQueue(numTerms + 1);
    final CharsRefBuilder charsRef = new CharsRefBuilder();
    final Terms terms = MultiFields.getTerms(req.getSearcher().getIndexReader(), field);
    if (terms == null) {
        return; // field does not exist
    }
    final int[] histogramBuckets = new int[HIST_ARRAY_SIZE];
    final TermsEnum termsEnum = terms.iterator();
    for (BytesRef termBytes = termsEnum.next(); termBytes != null; termBytes = termsEnum.next()) {
        tiq.distinctTerms++;
        final int docFreq = termsEnum.docFreq();
        // Bucket index is 32 - numberOfLeadingZeros(docFreq - 1), i.e. ceil(log2(docFreq))
        // for docFreq >= 1. This calculation seems odd, but it gives the same results
        // as it historically did.
        final int bucket = 32 - Integer.numberOfLeadingZeros(Math.max(0, docFreq - 1));
        histogramBuckets[bucket]++;
        if (numTerms > 0 && docFreq > tiq.minFreq) {
            charsRef.copyUTF8Bytes(termBytes);
            tiq.add(new TopTermQueue.TermInfo(new Term(field, charsRef.toString()), termsEnum.docFreq()));
            if (tiq.size() > numTerms) {
                // Queue overflowed: drop the least frequent entry and raise the bar.
                tiq.pop();
                tiq.minFreq = tiq.getTopTermInfo().docFreq;
            }
        }
    }
    tiq.histogram.add(histogramBuckets);
    fieldMap.add("distinct", tiq.distinctTerms);
    // Include top terms
    fieldMap.add("topTerms", tiq.toNamedList(req.getSearcher().getSchema()));
    // Add a histogram
    fieldMap.add("histogram", tiq.histogram.toNamedList());
}
Also used : Terms(org.apache.lucene.index.Terms) SolrParams(org.apache.solr.common.params.SolrParams) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) Term(org.apache.lucene.index.Term) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 42 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the Apache lucene-solr project.

From the class LukeRequestHandler, method getFirstLiveDoc.

// Return the first live (non-deleted) document containing any term of this field,
// or null if none is found among the first 1000 terms. Shouldn't actually be very
// costly to do it this way.
//
// Fixes two bugs in the previous version:
//  1. The liveDocs check was inverted: Bits.get(doc) returns true for LIVE docs,
//     so the old `liveDocs.get(...) -> continue` skipped live docs and returned
//     deleted ones.
//  2. The loop condition `postingsEnum == null` became false after the first term
//     was examined, so `continue` immediately ended the scan and the method
//     returned null even though more terms remained.
private static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
    PostingsEnum postingsEnum = null;
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    // null when the reader has no deletions; then every doc is live.
    final Bits liveDocs = reader.getLiveDocs();
    // Deal with the chance that the first bunch of terms are in deleted documents.
    // Cap the scan at 1000 terms so this stays cheap even on pathological segments.
    for (int idx = 0; idx < 1000; ++idx) {
        text = termsEnum.next();
        if (text == null) {
            // Ran off the end of the terms enum without finding any live docs with that field in them.
            return null;
        }
        // Reuse the postings enum across terms to avoid re-allocation.
        postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
        final int docId = postingsEnum.nextDoc();
        if (docId != DocIdSetIterator.NO_MORE_DOCS) {
            if (liveDocs != null && !liveDocs.get(docId)) {
                // First doc for this term is deleted; try the next term.
                continue;
            }
            return reader.document(docId);
        }
    }
    return null;
}
Also used : Bits(org.apache.lucene.util.Bits) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef) TermsEnum(org.apache.lucene.index.TermsEnum)

Example 43 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the Apache lucene-solr project.

From the class TestFSTs, method testPrimaryKeys.

// Exercises primary-key style term lookups over two cycles: cycle 0 uses
// sequentially formatted ids ("%07d"), cycle 1 uses random long ids. Verifies
// both TermQuery hit counts and direct TermsEnum seekExact/seekCeil behavior,
// including ids deliberately absent from the index.
public void testPrimaryKeys() throws Exception {
    Directory dir = newDirectory();
    for (int cycle = 0; cycle < 2; cycle++) {
        if (VERBOSE) {
            System.out.println("TEST: cycle=" + cycle);
        }
        // Fresh index per cycle; CREATE mode wipes anything left from the prior cycle.
        RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig(new MockAnalyzer(random())).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
        Document doc = new Document();
        // One reused Document/Field pair; only the id value changes per added doc.
        Field idField = newStringField("id", "", Field.Store.NO);
        doc.add(idField);
        final int NUM_IDS = atLeast(200);
        //final int NUM_IDS = (int) (377 * (1.0+random.nextDouble()));
        if (VERBOSE) {
            System.out.println("TEST: NUM_IDS=" + NUM_IDS);
        }
        final Set<String> allIDs = new HashSet<>();
        for (int id = 0; id < NUM_IDS; id++) {
            String idString;
            if (cycle == 0) {
                // PKs are assigned sequentially
                idString = String.format(Locale.ROOT, "%07d", id);
            } else {
                // Random ids: redraw until we get one not already used.
                while (true) {
                    final String s = Long.toString(random().nextLong());
                    if (!allIDs.contains(s)) {
                        idString = s;
                        break;
                    }
                }
            }
            allIDs.add(idString);
            idField.setStringValue(idString);
            w.addDocument(doc);
        }
        //w.forceMerge(1);
        // turn writer into reader:
        final IndexReader r = w.getReader();
        final IndexSearcher s = newSearcher(r);
        w.close();
        final List<String> allIDsList = new ArrayList<>(allIDs);
        final List<String> sortedAllIDsList = new ArrayList<>(allIDsList);
        Collections.sort(sortedAllIDsList);
        // Sprinkle in some non-existent PKs:
        Set<String> outOfBounds = new HashSet<>();
        for (int idx = 0; idx < NUM_IDS / 10; idx++) {
            String idString;
            if (cycle == 0) {
                // Sequential cycle: ids past the last assigned one are guaranteed absent.
                idString = String.format(Locale.ROOT, "%07d", (NUM_IDS + idx));
            } else {
                while (true) {
                    idString = Long.toString(random().nextLong());
                    if (!allIDs.contains(idString)) {
                        break;
                    }
                }
            }
            outOfBounds.add(idString);
            allIDsList.add(idString);
        }
        // Verify w/ TermQuery
        for (int iter = 0; iter < 2 * NUM_IDS; iter++) {
            final String id = allIDsList.get(random().nextInt(allIDsList.size()));
            final boolean exists = !outOfBounds.contains(id);
            if (VERBOSE) {
                System.out.println("TEST: TermQuery " + (exists ? "" : "non-exist ") + " id=" + id);
            }
            // An existing id must match exactly one doc; a non-existent id none.
            assertEquals((exists ? "" : "non-exist ") + "id=" + id, exists ? 1 : 0, s.search(new TermQuery(new Term("id", id)), 1).totalHits);
        }
        // Verify w/ MultiTermsEnum
        final TermsEnum termsEnum = MultiFields.getTerms(r, "id").iterator();
        for (int iter = 0; iter < 2 * NUM_IDS; iter++) {
            final String id;
            final String nextID;
            final boolean exists;
            if (random().nextBoolean()) {
                // Exact-seek case: pick any id, present or deliberately absent.
                id = allIDsList.get(random().nextInt(allIDsList.size()));
                exists = !outOfBounds.contains(id);
                nextID = null;
                if (VERBOSE) {
                    System.out.println("TEST: exactOnly " + (exists ? "" : "non-exist ") + "id=" + id);
                }
            } else {
                // Pick ID between two IDs:
                exists = false;
                final int idv = random().nextInt(NUM_IDS - 1);
                if (cycle == 0) {
                    // Appending "a" makes an id that sorts strictly between idv and idv+1.
                    id = String.format(Locale.ROOT, "%07da", idv);
                    nextID = String.format(Locale.ROOT, "%07d", idv + 1);
                } else {
                    id = sortedAllIDsList.get(idv) + "a";
                    nextID = sortedAllIDsList.get(idv + 1);
                }
                if (VERBOSE) {
                    System.out.println("TEST: not exactOnly id=" + id + " nextID=" + nextID);
                }
            }
            final TermsEnum.SeekStatus status;
            if (nextID == null) {
                if (termsEnum.seekExact(new BytesRef(id))) {
                    status = TermsEnum.SeekStatus.FOUND;
                } else {
                    status = TermsEnum.SeekStatus.NOT_FOUND;
                }
            } else {
                // Ceiling seek: should land on the next larger existing id.
                status = termsEnum.seekCeil(new BytesRef(id));
            }
            if (nextID != null) {
                assertEquals(TermsEnum.SeekStatus.NOT_FOUND, status);
                assertEquals("expected=" + nextID + " actual=" + termsEnum.term().utf8ToString(), new BytesRef(nextID), termsEnum.term());
            } else if (!exists) {
                // Absent id: either NOT_FOUND or END (past the last term) is acceptable.
                assertTrue(status == TermsEnum.SeekStatus.NOT_FOUND || status == TermsEnum.SeekStatus.END);
            } else {
                assertEquals(TermsEnum.SeekStatus.FOUND, status);
            }
        }
        r.close();
    }
    dir.close();
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TermQuery(org.apache.lucene.search.TermQuery) ArrayList(java.util.ArrayList) FSTTester.simpleRandomString(org.apache.lucene.util.fst.FSTTester.simpleRandomString) FSTTester.getRandomString(org.apache.lucene.util.fst.FSTTester.getRandomString) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) TermsEnum(org.apache.lucene.index.TermsEnum) Field(org.apache.lucene.document.Field) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IndexReader(org.apache.lucene.index.IndexReader) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) HashSet(java.util.HashSet)

Example 44 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the Apache lucene-solr project.

From the class TestFSTs, method testRealTerms.

// Build FST for all unique terms in the test line docs file, up until a doc limit.
// Then verify the FST against the index's TermsEnum three ways: term-by-term
// iteration (also cross-checked against an intersect() enum over ".*"), and
// random seekCeil comparisons between BytesRefFSTEnum and TermsEnum.
public void testRealTerms() throws Exception {
    final LineFileDocs docs = new LineFileDocs(random());
    final int numDocs = TEST_NIGHTLY ? atLeast(1000) : atLeast(100);
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    final IndexWriterConfig conf = newIndexWriterConfig(analyzer).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
    final Path tempDir = createTempDir("fstlines");
    final Directory dir = newFSDirectory(tempDir);
    final IndexWriter writer = new IndexWriter(dir, conf);
    Document doc;
    int docCount = 0;
    // Index line-file documents until the doc limit (or the file) is exhausted.
    while ((doc = docs.nextDoc()) != null && docCount < numDocs) {
        writer.addDocument(doc);
        docCount++;
    }
    IndexReader r = DirectoryReader.open(writer);
    writer.close();
    final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
    Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, 0, 0, true, true, Integer.MAX_VALUE, outputs, true, 15);
    // Randomly choose whether the FST output is the term ordinal or its docFreq.
    boolean storeOrd = random().nextBoolean();
    if (VERBOSE) {
        if (storeOrd) {
            System.out.println("FST stores ord");
        } else {
            System.out.println("FST stores docFreq");
        }
    }
    Terms terms = MultiFields.getTerms(r, "body");
    if (terms != null) {
        final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
        final TermsEnum termsEnum = terms.iterator();
        if (VERBOSE) {
            System.out.println("TEST: got termsEnum=" + termsEnum);
        }
        BytesRef term;
        int ord = 0;
        // A match-all automaton; intersecting with it must enumerate every term,
        // in lockstep with the plain iterator.
        Automaton automaton = new RegExp(".*", RegExp.NONE).toAutomaton();
        final TermsEnum termsEnum2 = terms.intersect(new CompiledAutomaton(automaton, false, false), null);
        while ((term = termsEnum.next()) != null) {
            BytesRef term2 = termsEnum2.next();
            assertNotNull(term2);
            assertEquals(term, term2);
            assertEquals(termsEnum.docFreq(), termsEnum2.docFreq());
            assertEquals(termsEnum.totalTermFreq(), termsEnum2.totalTermFreq());
            if (ord == 0) {
                // Probe once whether the codec supports ord(); if not, fall back to
                // storing docFreq regardless of the random choice above.
                try {
                    termsEnum.ord();
                } catch (UnsupportedOperationException uoe) {
                    if (VERBOSE) {
                        System.out.println("TEST: codec doesn't support ord; FST stores docFreq");
                    }
                    storeOrd = false;
                }
            }
            final int output;
            if (storeOrd) {
                output = ord;
            } else {
                output = termsEnum.docFreq();
            }
            builder.add(Util.toIntsRef(term, scratchIntsRef), (long) output);
            ord++;
            if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
                System.out.println(ord + " terms...");
            }
        }
        FST<Long> fst = builder.finish();
        if (VERBOSE) {
            System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + builder.getNodeCount() + " nodes; " + builder.getArcCount() + " arcs;" + " " + fst.ramBytesUsed() + " bytes");
        }
        if (ord > 0) {
            final Random random = new Random(random().nextLong());
            // Now confirm BytesRefFSTEnum and TermsEnum act the
            // same:
            final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
            int num = atLeast(1000);
            for (int iter = 0; iter < num; iter++) {
                // Seek both enums to a random (likely non-existent) term and verify
                // they agree on the landing term, then walk forward in lockstep.
                final BytesRef randomTerm = new BytesRef(getRandomString(random));
                if (VERBOSE) {
                    System.out.println("TEST: seek non-exist " + randomTerm.utf8ToString() + " " + randomTerm);
                }
                final TermsEnum.SeekStatus seekResult = termsEnum.seekCeil(randomTerm);
                final InputOutput<Long> fstSeekResult = fstEnum.seekCeil(randomTerm);
                if (seekResult == TermsEnum.SeekStatus.END) {
                    assertNull("got " + (fstSeekResult == null ? "null" : fstSeekResult.input.utf8ToString()) + " but expected null", fstSeekResult);
                } else {
                    assertSame(termsEnum, fstEnum, storeOrd);
                    for (int nextIter = 0; nextIter < 10; nextIter++) {
                        if (VERBOSE) {
                            System.out.println("TEST: next");
                            if (storeOrd) {
                                System.out.println("  ord=" + termsEnum.ord());
                            }
                        }
                        if (termsEnum.next() != null) {
                            if (VERBOSE) {
                                System.out.println("  term=" + termsEnum.term().utf8ToString());
                            }
                            assertNotNull(fstEnum.next());
                            assertSame(termsEnum, fstEnum, storeOrd);
                        } else {
                            // TermsEnum is exhausted; the FST enum must be exhausted too.
                            if (VERBOSE) {
                                System.out.println("  end!");
                            }
                            BytesRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
                            if (nextResult != null) {
                                System.out.println("expected null but got: input=" + nextResult.input.utf8ToString() + " output=" + outputs.outputToString(nextResult.output));
                                fail();
                            }
                            break;
                        }
                    }
                }
            }
        }
    }
    r.close();
    dir.close();
}
Also used : BytesRefBuilder(org.apache.lucene.util.BytesRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton) Document(org.apache.lucene.document.Document) TermsEnum(org.apache.lucene.index.TermsEnum) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Random(java.util.Random) BytesRef(org.apache.lucene.util.BytesRef) LineFileDocs(org.apache.lucene.util.LineFileDocs) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) Path(java.nio.file.Path) CompiledAutomaton(org.apache.lucene.util.automaton.CompiledAutomaton) Automaton(org.apache.lucene.util.automaton.Automaton) RegExp(org.apache.lucene.util.automaton.RegExp) Terms(org.apache.lucene.index.Terms) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) IndexWriter(org.apache.lucene.index.IndexWriter) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) IndexReader(org.apache.lucene.index.IndexReader) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 45 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the Apache lucene-solr project.

From the class TermsIncludingScoreQuery, method createWeight.

// Builds the Weight for this join query. When scores are not needed the query is
// rewritten to a plain TermsQuery; otherwise an anonymous Weight replays the
// pre-computed per-term scores held by the enclosing TermsIncludingScoreQuery
// (the parallel `ords`/`scores` arrays indexed by join-term ordinal).
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores, float boost) throws IOException {
    if (needsScores == false) {
        // We don't need scores then quickly change the query:
        TermsQuery termsQuery = new TermsQuery(toField, terms, fromField, fromQuery, topReaderContextId);
        return searcher.rewrite(termsQuery).createWeight(searcher, false, boost);
    }
    return new Weight(TermsIncludingScoreQuery.this) {

        @Override
        public void extractTerms(Set<Term> terms) {
        }

        // Explains a hit by scanning the collected join terms for the first one
        // whose postings contain `doc`, reporting that term's recorded score.
        @Override
        public Explanation explain(LeafReaderContext context, int doc) throws IOException {
            Terms terms = context.reader().terms(toField);
            if (terms != null) {
                TermsEnum segmentTermsEnum = terms.iterator();
                BytesRef spare = new BytesRef();
                PostingsEnum postingsEnum = null;
                for (int i = 0; i < TermsIncludingScoreQuery.this.terms.size(); i++) {
                    // Only terms that exist in this segment can contribute.
                    if (segmentTermsEnum.seekExact(TermsIncludingScoreQuery.this.terms.get(ords[i], spare))) {
                        postingsEnum = segmentTermsEnum.postings(postingsEnum, PostingsEnum.NONE);
                        // advance() returning exactly `doc` means this term matches the doc.
                        if (postingsEnum.advance(doc) == doc) {
                            final float score = TermsIncludingScoreQuery.this.scores[ords[i]];
                            return Explanation.match(score, "Score based on join value " + segmentTermsEnum.term().utf8ToString());
                        }
                    }
                }
            }
            return Explanation.noMatch("Not a match");
        }

        @Override
        public Scorer scorer(LeafReaderContext context) throws IOException {
            Terms terms = context.reader().terms(toField);
            if (terms == null) {
                // Segment has no postings for the join field: nothing can match.
                return null;
            }
            // what is the runtime...seems ok?
            final long cost = context.reader().maxDoc() * terms.size();
            TermsEnum segmentTermsEnum = terms.iterator();
            // Multi-valued join fields need the MV scorer; single-valued uses SV.
            if (multipleValuesPerDocument) {
                return new MVInOrderScorer(this, segmentTermsEnum, context.reader().maxDoc(), cost);
            } else {
                return new SVInOrderScorer(this, segmentTermsEnum, context.reader().maxDoc(), cost);
            }
        }
    };
}
Also used : Set(java.util.Set) FixedBitSet(org.apache.lucene.util.FixedBitSet) Terms(org.apache.lucene.index.Terms) Weight(org.apache.lucene.search.Weight) TermsEnum(org.apache.lucene.index.TermsEnum) LeafReaderContext(org.apache.lucene.index.LeafReaderContext) PostingsEnum(org.apache.lucene.index.PostingsEnum) BytesRef(org.apache.lucene.util.BytesRef)

Aggregations

TermsEnum (org.apache.lucene.index.TermsEnum)155 BytesRef (org.apache.lucene.util.BytesRef)116 Terms (org.apache.lucene.index.Terms)103 PostingsEnum (org.apache.lucene.index.PostingsEnum)52 ArrayList (java.util.ArrayList)31 Term (org.apache.lucene.index.Term)31 IndexReader (org.apache.lucene.index.IndexReader)29 LeafReader (org.apache.lucene.index.LeafReader)28 IOException (java.io.IOException)26 Fields (org.apache.lucene.index.Fields)26 LeafReaderContext (org.apache.lucene.index.LeafReaderContext)25 Document (org.apache.lucene.document.Document)24 Directory (org.apache.lucene.store.Directory)24 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)19 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)18 HashMap (java.util.HashMap)12 HashSet (java.util.HashSet)11 DirectoryReader (org.apache.lucene.index.DirectoryReader)11 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)10 Bits (org.apache.lucene.util.Bits)10