Example 71 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the lucene-solr project by apache.

From the class TestShardSearching, method testSimple:

public void testSimple() throws Exception {
    final int numNodes = TestUtil.nextInt(random(), 1, 10);
    final double runTimeSec = atLeast(3);
    final int minDocsToMakeTerms = TestUtil.nextInt(random(), 5, 20);
    final int maxSearcherAgeSeconds = TestUtil.nextInt(random(), 1, 3);
    if (VERBOSE) {
        System.out.println("TEST: numNodes=" + numNodes + " runTimeSec=" + runTimeSec + " maxSearcherAgeSeconds=" + maxSearcherAgeSeconds);
    }
    start(numNodes, runTimeSec, maxSearcherAgeSeconds);
    final List<PreviousSearchState> priorSearches = new ArrayList<>();
    List<BytesRef> terms = null;
    while (System.nanoTime() < endTimeNanos) {
        final boolean doFollowon = priorSearches.size() > 0 && random().nextInt(7) == 1;
        // Pick a random node; we will run the query on this node:
        final int myNodeID = random().nextInt(numNodes);
        final NodeState.ShardIndexSearcher localShardSearcher;
        final PreviousSearchState prevSearchState;
        if (doFollowon) {
            // Pretend user issued a followon query:
            prevSearchState = priorSearches.get(random().nextInt(priorSearches.size()));
            if (VERBOSE) {
                System.out.println("\nTEST: follow-on query age=" + ((System.nanoTime() - prevSearchState.searchTimeNanos) / 1000000000.0));
            }
            try {
                localShardSearcher = nodes[myNodeID].acquire(prevSearchState.versions);
            } catch (SearcherExpiredException see) {
                // Expected, sometimes: the node already evicted the searcher version this follow-on search needs.
                if (VERBOSE) {
                    System.out.println("  searcher expired during local shard searcher init: " + see);
                }
                priorSearches.remove(prevSearchState);
                continue;
            }
        } else {
            if (VERBOSE) {
                System.out.println("\nTEST: fresh query");
            }
            // Do fresh query:
            localShardSearcher = nodes[myNodeID].acquire();
            prevSearchState = null;
        }
        final IndexReader[] subs = new IndexReader[numNodes];
        PreviousSearchState searchState = null;
        try {
            // Mock: now make a single reader (MultiReader) from all node
            // searchers.  In a real shard env you can't do this... we
            // do it to confirm results from the shard searcher
            // are correct:
            int docCount = 0;
            try {
                for (int nodeID = 0; nodeID < numNodes; nodeID++) {
                    final long subVersion = localShardSearcher.nodeVersions[nodeID];
                    final IndexSearcher sub = nodes[nodeID].searchers.acquire(subVersion);
                    if (sub == null) {
                        // Release any sub-readers acquired so far before propagating:
                        for (int prev = nodeID - 1; prev >= 0; prev--) {
                            subs[prev].decRef();
                            subs[prev] = null;
                        }
                        // Report the node whose searcher expired (nodeID is still intact here):
                        throw new SearcherExpiredException("nodeID=" + nodeID + " version=" + subVersion);
                    }
                    subs[nodeID] = sub.getIndexReader();
                    docCount += subs[nodeID].maxDoc();
                }
            } catch (SearcherExpiredException see) {
                // Expected
                if (VERBOSE) {
                    System.out.println("  searcher expired during mock reader init: " + see);
                }
                continue;
            }
            final IndexReader mockReader = new MultiReader(subs);
            final IndexSearcher mockSearcher = new IndexSearcher(mockReader);
            Query query;
            Sort sort;
            if (prevSearchState != null) {
                query = prevSearchState.query;
                sort = prevSearchState.sort;
            } else {
                if (terms == null && docCount > minDocsToMakeTerms) {
                    // TODO: try to "focus" on high freq terms sometimes too
                    // TODO: maybe also periodically reset the terms...?
                    final TermsEnum termsEnum = MultiFields.getTerms(mockReader, "body").iterator();
                    terms = new ArrayList<>();
                    while (termsEnum.next() != null) {
                        terms.add(BytesRef.deepCopyOf(termsEnum.term()));
                    }
                    if (VERBOSE) {
                        System.out.println("TEST: init terms: " + terms.size() + " terms");
                    }
                    if (terms.isEmpty()) {
                        terms = null;
                    }
                }
                if (VERBOSE) {
                    System.out.println("  maxDoc=" + mockReader.maxDoc());
                }
                if (terms != null) {
                    if (random().nextBoolean()) {
                        query = new TermQuery(new Term("body", terms.get(random().nextInt(terms.size()))));
                    } else {
                        final String t = terms.get(random().nextInt(terms.size())).utf8ToString();
                        final String prefix;
                        if (t.length() <= 1) {
                            prefix = t;
                        } else {
                            prefix = t.substring(0, TestUtil.nextInt(random(), 1, 2));
                        }
                        query = new PrefixQuery(new Term("body", prefix));
                    }
                    if (random().nextBoolean()) {
                        sort = null;
                    } else {
                        // TODO: sort by more than 1 field
                        // Four sort variants follow, so draw from [0,4):
                        final int what = random().nextInt(4);
                        if (what == 0) {
                            sort = new Sort(SortField.FIELD_SCORE);
                        } else if (what == 1) {
                            // TODO: this sort doesn't merge
                            // correctly... it's tricky because you
                            // could have > 2.1B docs across all shards: 
                            //sort = new Sort(SortField.FIELD_DOC);
                            sort = null;
                        } else if (what == 2) {
                            sort = new Sort(new SortField[] { new SortField("docid_intDV", SortField.Type.INT, random().nextBoolean()) });
                        } else {
                            sort = new Sort(new SortField[] { new SortField("titleDV", SortField.Type.STRING, random().nextBoolean()) });
                        }
                    }
                } else {
                    query = null;
                    sort = null;
                }
            }
            if (query != null) {
                try {
                    searchState = assertSame(mockSearcher, localShardSearcher, query, sort, prevSearchState);
                } catch (SearcherExpiredException see) {
                    // Expected, sometimes: a node's searcher expired while the query was running.
                    if (VERBOSE) {
                        System.out.println("  searcher expired during search: " + see);
                        see.printStackTrace(System.out);
                    }
                    // assert prevSearchState != null;
                    if (prevSearchState != null) {
                        priorSearches.remove(prevSearchState);
                    }
                }
            }
        } finally {
            nodes[myNodeID].release(localShardSearcher);
            for (IndexReader sub : subs) {
                if (sub != null) {
                    sub.decRef();
                }
            }
        }
        if (searchState != null && searchState.searchAfterLocal != null && random().nextInt(5) == 3) {
            priorSearches.add(searchState);
            if (priorSearches.size() > 200) {
                Collections.shuffle(priorSearches, random());
                priorSearches.subList(100, priorSearches.size()).clear();
            }
        }
    }
    finish();
}
Also used: ArrayList (java.util.ArrayList), TermsEnum (org.apache.lucene.index.TermsEnum), BytesRef (org.apache.lucene.util.BytesRef), MultiReader (org.apache.lucene.index.MultiReader), Term (org.apache.lucene.index.Term), IndexReader (org.apache.lucene.index.IndexReader)
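
A detail worth calling out in the terms-gathering loop above: TermsEnum.next() reuses the returned BytesRef across calls, so each term must be deep-copied before it is stored, exactly as the test does. A minimal standalone sketch of that idiom (the TermCollector class name and the field argument are illustrative, not from the source):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class TermCollector {
    /** Collects every term of the given field from a (possibly composite) reader. */
    static List<BytesRef> collectTerms(IndexReader reader, String field) throws IOException {
        List<BytesRef> result = new ArrayList<>();
        Terms terms = MultiFields.getTerms(reader, field);
        if (terms == null) {
            return result; // field does not exist
        }
        TermsEnum termsEnum = terms.iterator();
        BytesRef term;
        while ((term = termsEnum.next()) != null) {
            // next() reuses the returned BytesRef, so copy before storing:
            result.add(BytesRef.deepCopyOf(term));
        }
        return result;
    }
}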

Example 72 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the lucene-solr project by apache.

From the class FSTOrdTermsWriter, method write:

@Override
public void write(Fields fields) throws IOException {
    for (String field : fields) {
        Terms terms = fields.terms(field);
        if (terms == null) {
            continue;
        }
        FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
        boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
        TermsEnum termsEnum = terms.iterator();
        TermsWriter termsWriter = new TermsWriter(fieldInfo);
        long sumTotalTermFreq = 0;
        long sumDocFreq = 0;
        FixedBitSet docsSeen = new FixedBitSet(maxDoc);
        while (true) {
            BytesRef term = termsEnum.next();
            if (term == null) {
                break;
            }
            BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen);
            if (termState != null) {
                termsWriter.finishTerm(term, termState);
                sumTotalTermFreq += termState.totalTermFreq;
                sumDocFreq += termState.docFreq;
            }
        }
        termsWriter.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality());
    }
}
Also used: BlockTermState (org.apache.lucene.codecs.BlockTermState), FixedBitSet (org.apache.lucene.util.FixedBitSet), Terms (org.apache.lucene.index.Terms), FieldInfo (org.apache.lucene.index.FieldInfo), BytesRef (org.apache.lucene.util.BytesRef), TermsEnum (org.apache.lucene.index.TermsEnum)
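
The writer above accumulates sumDocFreq and sumTotalTermFreq as a side effect of writing postings. The same per-field statistics can be recomputed read-only from any Terms instance; a minimal sketch, assuming the field indexes term frequencies (the FieldStats class name is hypothetical):

import java.io.IOException;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;

public class FieldStats {
    /** Sums docFreq and totalTermFreq over all terms of one field. */
    static long[] sumStats(Terms terms) throws IOException {
        long sumDocFreq = 0;
        long sumTotalTermFreq = 0;
        TermsEnum termsEnum = terms.iterator();
        while (termsEnum.next() != null) {
            // Both stats are read from the enum's current position;
            // totalTermFreq() is -1 when the field omits frequencies.
            sumDocFreq += termsEnum.docFreq();
            sumTotalTermFreq += termsEnum.totalTermFreq();
        }
        return new long[] { sumDocFreq, sumTotalTermFreq };
    }
}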

Example 73 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the lucene-solr project by apache.

From the class BlockTermsWriter, method write:

@Override
public void write(Fields fields) throws IOException {
    for (String field : fields) {
        Terms terms = fields.terms(field);
        if (terms == null) {
            continue;
        }
        TermsEnum termsEnum = terms.iterator();
        TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field));
        while (true) {
            BytesRef term = termsEnum.next();
            if (term == null) {
                break;
            }
            termsWriter.write(term, termsEnum);
        }
        termsWriter.finish();
    }
}
Also used: Terms (org.apache.lucene.index.Terms), BytesRef (org.apache.lucene.util.BytesRef), TermsEnum (org.apache.lucene.index.TermsEnum)
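
BlockTermsWriter consumes the enum strictly in term order via next(). When only a few known terms are of interest, a TermsEnum can instead be positioned directly with seekExact; a hedged sketch (the TermLookup helper is illustrative, not from the source):

import java.io.IOException;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class TermLookup {
    /** Returns the document frequency of the given term text, or 0 if absent. */
    static int docFreq(Terms terms, String text) throws IOException {
        TermsEnum termsEnum = terms.iterator();
        // seekExact positions the enum on the term iff it exists:
        if (termsEnum.seekExact(new BytesRef(text))) {
            return termsEnum.docFreq();
        }
        return 0;
    }
}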

Example 74 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the lucene-solr project by apache.

From the class SimpleNaiveBayesClassifierTest, method testPerformance:

@Test
public void testPerformance() throws Exception {
    MockAnalyzer analyzer = new MockAnalyzer(random());
    LeafReader leafReader = getRandomIndex(analyzer, 100);
    try {
        long trainStart = System.currentTimeMillis();
        SimpleNaiveBayesClassifier simpleNaiveBayesClassifier = new SimpleNaiveBayesClassifier(leafReader, analyzer, null, categoryFieldName, textFieldName);
        long trainEnd = System.currentTimeMillis();
        long trainTime = trainEnd - trainStart;
        assertTrue("training took more than 10s: " + trainTime / 1000 + "s", trainTime < 10000);
        long evaluationStart = System.currentTimeMillis();
        ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader, simpleNaiveBayesClassifier, categoryFieldName, textFieldName, -1);
        assertNotNull(confusionMatrix);
        long evaluationEnd = System.currentTimeMillis();
        long evaluationTime = evaluationEnd - evaluationStart;
        assertTrue("evaluation took more than 2m: " + evaluationTime / 1000 + "s", evaluationTime < 120000);
        double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
        assertTrue("avg classification time: " + avgClassificationTime, 5000 > avgClassificationTime);
        double f1 = confusionMatrix.getF1Measure();
        assertTrue(f1 >= 0d);
        assertTrue(f1 <= 1d);
        double accuracy = confusionMatrix.getAccuracy();
        assertTrue(accuracy >= 0d);
        assertTrue(accuracy <= 1d);
        double recall = confusionMatrix.getRecall();
        assertTrue(recall >= 0d);
        assertTrue(recall <= 1d);
        double precision = confusionMatrix.getPrecision();
        assertTrue(precision >= 0d);
        assertTrue(precision <= 1d);
        Terms terms = MultiFields.getTerms(leafReader, categoryFieldName);
        TermsEnum iterator = terms.iterator();
        BytesRef term;
        while ((term = iterator.next()) != null) {
            String s = term.utf8ToString();
            recall = confusionMatrix.getRecall(s);
            assertTrue(recall >= 0d);
            assertTrue(recall <= 1d);
            precision = confusionMatrix.getPrecision(s);
            assertTrue(precision >= 0d);
            assertTrue(precision <= 1d);
            double f1Measure = confusionMatrix.getF1Measure(s);
            assertTrue(f1Measure >= 0d);
            assertTrue(f1Measure <= 1d);
        }
    } finally {
        leafReader.close();
    }
}
Also used: MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), LeafReader (org.apache.lucene.index.LeafReader), Terms (org.apache.lucene.index.Terms), ConfusionMatrixGenerator (org.apache.lucene.classification.utils.ConfusionMatrixGenerator), BytesRef (org.apache.lucene.util.BytesRef), TermsEnum (org.apache.lucene.index.TermsEnum), Test (org.junit.Test)
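
The category loop above visits each distinct label once; combined with docFreq(), the same enumeration yields per-class document counts straight from the index, with no classifier involved. A sketch under the test's assumptions, i.e. a LeafReader and an indexed category field (the ClassCounts helper is hypothetical):

import java.io.IOException;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class ClassCounts {
    /** Maps each distinct category value to the number of documents carrying it. */
    static Map<String, Integer> countPerCategory(LeafReader reader, String categoryField) throws IOException {
        Map<String, Integer> counts = new LinkedHashMap<>();
        Terms terms = reader.terms(categoryField);
        if (terms != null) {
            TermsEnum iterator = terms.iterator();
            BytesRef term;
            while ((term = iterator.next()) != null) {
                // docFreq == number of documents containing this category term:
                counts.put(term.utf8ToString(), iterator.docFreq());
            }
        }
        return counts;
    }
}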

Example 75 with TermsEnum

Use of org.apache.lucene.index.TermsEnum in the lucene-solr project by apache.

From the class TermCollectingRewrite, method collectTerms:

final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
    IndexReaderContext topReaderContext = reader.getContext();
    for (LeafReaderContext context : topReaderContext.leaves()) {
        final Terms terms = context.reader().terms(query.field);
        if (terms == null) {
            // field does not exist
            continue;
        }
        final TermsEnum termsEnum = getTermsEnum(query, terms, collector.attributes);
        assert termsEnum != null;
        if (termsEnum == TermsEnum.EMPTY)
            continue;
        collector.setReaderContext(topReaderContext, context);
        collector.setNextEnum(termsEnum);
        BytesRef bytes;
        while ((bytes = termsEnum.next()) != null) {
            if (!collector.collect(bytes))
                // interrupt whole term collection, so also don't iterate other subReaders
                return;
        }
    }
}
Also used: Terms (org.apache.lucene.index.Terms), LeafReaderContext (org.apache.lucene.index.LeafReaderContext), IndexReaderContext (org.apache.lucene.index.IndexReaderContext), BytesRef (org.apache.lucene.util.BytesRef), TermsEnum (org.apache.lucene.index.TermsEnum)
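
collectTerms stops at the term level; to visit the documents behind each term, the positioned enum hands out a PostingsEnum per leaf. A minimal per-leaf sketch (the LeafTermWalker class and the field argument are illustrative, not from the source):

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

public class LeafTermWalker {
    /** Prints each term of the field together with its matching global doc IDs. */
    static void walk(IndexReader reader, String field) throws IOException {
        for (LeafReaderContext context : reader.leaves()) {
            Terms terms = context.reader().terms(field);
            if (terms == null) {
                continue; // field absent in this segment
            }
            TermsEnum termsEnum = terms.iterator();
            PostingsEnum postings = null;
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
                // Reuse the PostingsEnum across terms where possible:
                postings = termsEnum.postings(postings, PostingsEnum.NONE);
                for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
                    System.out.println(term.utf8ToString() + " -> doc " + (context.docBase + doc));
                }
            }
        }
    }
}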

Aggregations

TermsEnum (org.apache.lucene.index.TermsEnum): 155
BytesRef (org.apache.lucene.util.BytesRef): 116
Terms (org.apache.lucene.index.Terms): 103
PostingsEnum (org.apache.lucene.index.PostingsEnum): 52
ArrayList (java.util.ArrayList): 31
Term (org.apache.lucene.index.Term): 31
IndexReader (org.apache.lucene.index.IndexReader): 29
LeafReader (org.apache.lucene.index.LeafReader): 28
IOException (java.io.IOException): 26
Fields (org.apache.lucene.index.Fields): 26
LeafReaderContext (org.apache.lucene.index.LeafReaderContext): 25
Document (org.apache.lucene.document.Document): 24
Directory (org.apache.lucene.store.Directory): 24
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 19
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 18
HashMap (java.util.HashMap): 12
HashSet (java.util.HashSet): 11
DirectoryReader (org.apache.lucene.index.DirectoryReader): 11
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 10
Bits (org.apache.lucene.util.Bits): 10