Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class TestShardSearching, method testSimple.
public void testSimple() throws Exception {
  final int numNodes = TestUtil.nextInt(random(), 1, 10);
  final double runTimeSec = atLeast(3);
  final int minDocsToMakeTerms = TestUtil.nextInt(random(), 5, 20);
  final int maxSearcherAgeSeconds = TestUtil.nextInt(random(), 1, 3);
  if (VERBOSE) {
    System.out.println("TEST: numNodes=" + numNodes + " runTimeSec=" + runTimeSec + " maxSearcherAgeSeconds=" + maxSearcherAgeSeconds);
  }
  start(numNodes, runTimeSec, maxSearcherAgeSeconds);
  final List<PreviousSearchState> priorSearches = new ArrayList<>();
  List<BytesRef> terms = null;
  // endTimeNanos is inherited from ShardSearchingTestBase and set by start() above:
  while (System.nanoTime() < endTimeNanos) {
    final boolean doFollowon = priorSearches.size() > 0 && random().nextInt(7) == 1;
    // Pick a random node; we will run the query on this node:
    final int myNodeID = random().nextInt(numNodes);
    final NodeState.ShardIndexSearcher localShardSearcher;
    final PreviousSearchState prevSearchState;
    if (doFollowon) {
      // Pretend user issued a followon query:
      prevSearchState = priorSearches.get(random().nextInt(priorSearches.size()));
      if (VERBOSE) {
        System.out.println("\nTEST: follow-on query age=" + ((System.nanoTime() - prevSearchState.searchTimeNanos) / 1000000000.0));
      }
      try {
        localShardSearcher = nodes[myNodeID].acquire(prevSearchState.versions);
      } catch (SearcherExpiredException see) {
        // Expected: a node may have expired the searcher w/o telling them...
        if (VERBOSE) {
          System.out.println("  searcher expired during local shard searcher init: " + see);
        }
        priorSearches.remove(prevSearchState);
        continue;
      }
    } else {
      if (VERBOSE) {
        System.out.println("\nTEST: fresh query");
      }
      // Do fresh query:
      localShardSearcher = nodes[myNodeID].acquire();
      prevSearchState = null;
    }
    final IndexReader[] subs = new IndexReader[numNodes];
    PreviousSearchState searchState = null;
    try {
      // Mock: now make a single reader (MultiReader) from all node
      // searchers. In a real shard env you can't do this... we do it
      // to confirm results from the shard searcher are correct:
      int docCount = 0;
      try {
        for (int nodeID = 0; nodeID < numNodes; nodeID++) {
          final long subVersion = localShardSearcher.nodeVersions[nodeID];
          final IndexSearcher sub = nodes[nodeID].searchers.acquire(subVersion);
          if (sub == null) {
            // Release the readers acquired so far, then give up on this pass.
            // Save the failing nodeID first so the exception message reports it
            // (not the -1 the unwind loop leaves behind):
            final int failedNodeID = nodeID;
            nodeID--;
            while (nodeID >= 0) {
              subs[nodeID].decRef();
              subs[nodeID] = null;
              nodeID--;
            }
            throw new SearcherExpiredException("nodeID=" + failedNodeID + " version=" + subVersion);
          }
          subs[nodeID] = sub.getIndexReader();
          docCount += subs[nodeID].maxDoc();
        }
      } catch (SearcherExpiredException see) {
        // Expected
        if (VERBOSE) {
          System.out.println("  searcher expired during mock reader init: " + see);
        }
        continue;
      }
      final IndexReader mockReader = new MultiReader(subs);
      final IndexSearcher mockSearcher = new IndexSearcher(mockReader);
      Query query;
      Sort sort;
      if (prevSearchState != null) {
        query = prevSearchState.query;
        sort = prevSearchState.sort;
      } else {
        if (terms == null && docCount > minDocsToMakeTerms) {
          // TODO: try to "focus" on high freq terms sometimes too
          // TODO: maybe also periodically reset the terms...?
          final TermsEnum termsEnum = MultiFields.getTerms(mockReader, "body").iterator();
          terms = new ArrayList<>();
          while (termsEnum.next() != null) {
            terms.add(BytesRef.deepCopyOf(termsEnum.term()));
          }
          if (VERBOSE) {
            System.out.println("TEST: init terms: " + terms.size() + " terms");
          }
          if (terms.size() == 0) {
            terms = null;
          }
        }
        if (VERBOSE) {
          System.out.println("  maxDoc=" + mockReader.maxDoc());
        }
        if (terms != null) {
          if (random().nextBoolean()) {
            query = new TermQuery(new Term("body", terms.get(random().nextInt(terms.size()))));
          } else {
            final String t = terms.get(random().nextInt(terms.size())).utf8ToString();
            final String prefix;
            if (t.length() <= 1) {
              prefix = t;
            } else {
              prefix = t.substring(0, TestUtil.nextInt(random(), 1, 2));
            }
            query = new PrefixQuery(new Term("body", prefix));
          }
          if (random().nextBoolean()) {
            sort = null;
          } else {
            // TODO: sort by more than 1 field
            // NOTE: nextInt(3) yields 0..2, so the final else branch (titleDV) is unreachable as written:
            final int what = random().nextInt(3);
            if (what == 0) {
              sort = new Sort(SortField.FIELD_SCORE);
            } else if (what == 1) {
              // TODO: this sort doesn't merge
              // correctly... it's tricky because you
              // could have > 2.1B docs across all shards:
              //sort = new Sort(SortField.FIELD_DOC);
              sort = null;
            } else if (what == 2) {
              sort = new Sort(new SortField[] { new SortField("docid_intDV", SortField.Type.INT, random().nextBoolean()) });
            } else {
              sort = new Sort(new SortField[] { new SortField("titleDV", SortField.Type.STRING, random().nextBoolean()) });
            }
          }
        } else {
          query = null;
          sort = null;
        }
      }
      if (query != null) {
        try {
          searchState = assertSame(mockSearcher, localShardSearcher, query, sort, prevSearchState);
        } catch (SearcherExpiredException see) {
          // Expected: a node may have expired the searcher w/o telling them...
          if (VERBOSE) {
            System.out.println("  searcher expired during search: " + see);
            see.printStackTrace(System.out);
          }
          // assert prevSearchState != null;
          if (prevSearchState != null) {
            priorSearches.remove(prevSearchState);
          }
        }
      }
    } finally {
      nodes[myNodeID].release(localShardSearcher);
      for (IndexReader sub : subs) {
        if (sub != null) {
          sub.decRef();
        }
      }
    }
    if (searchState != null && searchState.searchAfterLocal != null && random().nextInt(5) == 3) {
      priorSearches.add(searchState);
      if (priorSearches.size() > 200) {
        Collections.shuffle(priorSearches, random());
        priorSearches.subList(100, priorSearches.size()).clear();
      }
    }
  }
  finish();
}
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class FSTOrdTermsWriter, method write.
@Override
public void write(Fields fields) throws IOException {
  for (String field : fields) {
    Terms terms = fields.terms(field);
    if (terms == null) {
      continue;
    }
    FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
    boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    TermsEnum termsEnum = terms.iterator();
    TermsWriter termsWriter = new TermsWriter(fieldInfo);
    long sumTotalTermFreq = 0;
    long sumDocFreq = 0;
    FixedBitSet docsSeen = new FixedBitSet(maxDoc);
    while (true) {
      BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      // Delegate the postings for this term, then accumulate field-level stats:
      BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen);
      if (termState != null) {
        termsWriter.finishTerm(term, termState);
        sumTotalTermFreq += termState.totalTermFreq;
        sumDocFreq += termState.docFreq;
      }
    }
    termsWriter.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality());
  }
}
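The write loop above also shows where a field's summary statistics come from: sumTotalTermFreq, sumDocFreq, and the seen-document count are accumulated while the TermsEnum is walked. The same numbers can be recomputed read-side from an existing segment; a sketch under the assumption of a LeafReader and this era's Lucene API (the helper name is illustrative):

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.FixedBitSet;

// Hypothetical sketch: recompute the stats FSTOrdTermsWriter accumulates.
static void printFieldStats(LeafReader reader, String field) throws IOException {
  Terms terms = reader.terms(field);
  if (terms == null) {
    return; // field does not exist
  }
  TermsEnum termsEnum = terms.iterator();
  long sumDocFreq = 0;
  long sumTotalTermFreq = 0;
  FixedBitSet docsSeen = new FixedBitSet(reader.maxDoc());
  BytesRef term;
  PostingsEnum postings = null;
  while ((term = termsEnum.next()) != null) {
    sumDocFreq += termsEnum.docFreq();
    // totalTermFreq() may be -1 if frequencies are not indexed for this field:
    sumTotalTermFreq += termsEnum.totalTermFreq();
    postings = termsEnum.postings(postings, PostingsEnum.NONE); // reuse the enum
    for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
      docsSeen.set(doc);
    }
  }
  System.out.println(field + ": sumDocFreq=" + sumDocFreq
      + " sumTotalTermFreq=" + sumTotalTermFreq
      + " docCount=" + docsSeen.cardinality());
}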
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class BlockTermsWriter, method write.
@Override
public void write(Fields fields) throws IOException {
  for (String field : fields) {
    Terms terms = fields.terms(field);
    if (terms == null) {
      continue;
    }
    TermsEnum termsEnum = terms.iterator();
    TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field));
    while (true) {
      BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      termsWriter.write(term, termsEnum);
    }
    termsWriter.finish();
  }
}
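BlockTermsWriter consumes the enum exhaustively, but TermsEnum also supports positioning: seekExact lands on a specific term if it exists, and seekCeil lands on the smallest term at or above a target. A short sketch of both calls, with the reader, field name, and term values all being assumptions:

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Hypothetical sketch: position a TermsEnum instead of scanning it.
static void seekExamples(LeafReader reader) throws IOException {
  Terms terms = reader.terms("body"); // field name is an assumption
  if (terms == null) {
    return;
  }
  TermsEnum termsEnum = terms.iterator();
  // Exact lookup: returns true only if the term exists in the field.
  if (termsEnum.seekExact(new BytesRef("lucene"))) {
    System.out.println("docFreq=" + termsEnum.docFreq());
  }
  // Ceiling lookup: positions on the first term >= the target, if any.
  if (termsEnum.seekCeil(new BytesRef("luc")) != TermsEnum.SeekStatus.END) {
    System.out.println("ceiling term=" + termsEnum.term().utf8ToString());
  }
}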
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class SimpleNaiveBayesClassifierTest, method testPerformance.
@Test
public void testPerformance() throws Exception {
  MockAnalyzer analyzer = new MockAnalyzer(random());
  LeafReader leafReader = getRandomIndex(analyzer, 100);
  try {
    long trainStart = System.currentTimeMillis();
    SimpleNaiveBayesClassifier simpleNaiveBayesClassifier = new SimpleNaiveBayesClassifier(leafReader, analyzer, null, categoryFieldName, textFieldName);
    long trainEnd = System.currentTimeMillis();
    long trainTime = trainEnd - trainStart;
    assertTrue("training took more than 10s: " + trainTime / 1000 + "s", trainTime < 10000);
    long evaluationStart = System.currentTimeMillis();
    ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(leafReader, simpleNaiveBayesClassifier, categoryFieldName, textFieldName, -1);
    assertNotNull(confusionMatrix);
    long evaluationEnd = System.currentTimeMillis();
    long evaluationTime = evaluationEnd - evaluationStart;
    assertTrue("evaluation took more than 2m: " + evaluationTime / 1000 + "s", evaluationTime < 120000);
    double avgClassificationTime = confusionMatrix.getAvgClassificationTime();
    assertTrue("avg classification time: " + avgClassificationTime, 5000 > avgClassificationTime);
    double f1 = confusionMatrix.getF1Measure();
    assertTrue(f1 >= 0d);
    assertTrue(f1 <= 1d);
    double accuracy = confusionMatrix.getAccuracy();
    assertTrue(accuracy >= 0d);
    assertTrue(accuracy <= 1d);
    double recall = confusionMatrix.getRecall();
    assertTrue(recall >= 0d);
    assertTrue(recall <= 1d);
    double precision = confusionMatrix.getPrecision();
    assertTrue(precision >= 0d);
    assertTrue(precision <= 1d);
    Terms terms = MultiFields.getTerms(leafReader, categoryFieldName);
    TermsEnum iterator = terms.iterator();
    BytesRef term;
    while ((term = iterator.next()) != null) {
      String s = term.utf8ToString();
      recall = confusionMatrix.getRecall(s);
      assertTrue(recall >= 0d);
      assertTrue(recall <= 1d);
      precision = confusionMatrix.getPrecision(s);
      assertTrue(precision >= 0d);
      assertTrue(precision <= 1d);
      double f1Measure = confusionMatrix.getF1Measure(s);
      assertTrue(f1Measure >= 0d);
      assertTrue(f1Measure <= 1d);
    }
  } finally {
    leafReader.close();
  }
}
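The loop at the end of this test is a common idiom for discovering the distinct values of a single-token field such as a category label. A minimal sketch of that idiom on its own, with docFreq() giving the number of documents per value (the reader and field name are assumptions):

import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Hypothetical sketch: list the distinct values of a keyword-like field.
static void listCategories(LeafReader reader, String categoryField) throws IOException {
  Terms terms = MultiFields.getTerms(reader, categoryField);
  if (terms == null) {
    return; // field absent from the index
  }
  TermsEnum iterator = terms.iterator();
  BytesRef term;
  while ((term = iterator.next()) != null) {
    // docFreq() = number of documents containing this category value:
    System.out.println(term.utf8ToString() + ": " + iterator.docFreq());
  }
}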
Use of org.apache.lucene.index.TermsEnum in project lucene-solr by apache.
From the class TermCollectingRewrite, method collectTerms.
final void collectTerms(IndexReader reader, MultiTermQuery query, TermCollector collector) throws IOException {
  IndexReaderContext topReaderContext = reader.getContext();
  for (LeafReaderContext context : topReaderContext.leaves()) {
    final Terms terms = context.reader().terms(query.field);
    if (terms == null) {
      // field does not exist
      continue;
    }
    final TermsEnum termsEnum = getTermsEnum(query, terms, collector.attributes);
    assert termsEnum != null;
    if (termsEnum == TermsEnum.EMPTY) {
      continue;
    }
    collector.setReaderContext(topReaderContext, context);
    collector.setNextEnum(termsEnum);
    BytesRef bytes;
    while ((bytes = termsEnum.next()) != null) {
      if (!collector.collect(bytes)) {
        // interrupt whole term collection, so also don't iterate other subReaders
        return;
      }
    }
  }
}
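Stripped of the TermCollector machinery, the rewrite's traversal is a per-leaf enumeration with early termination. A sketch using a plain java.util.function.Predicate in place of TermCollector (an assumption; the real code also threads attributes and the reader context through):

import java.io.IOException;
import java.util.function.Predicate;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Hypothetical sketch: visit every term of a field across all leaves,
// stopping the whole traversal as soon as the callback returns false.
static void visitTerms(IndexReader reader, String field, Predicate<BytesRef> collector) throws IOException {
  for (LeafReaderContext context : reader.getContext().leaves()) {
    Terms terms = context.reader().terms(field);
    if (terms == null) {
      continue; // field does not exist in this segment
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytes;
    while ((bytes = termsEnum.next()) != null) {
      if (!collector.test(bytes)) {
        return; // early exit, like TermCollectingRewrite
      }
    }
  }
}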