Use of org.apache.lucene.index.RandomIndexWriter in project lucene-solr by Apache.
Class TestPhraseQuery, method testRandomPhrases.
/**
 * Indexes a batch of randomly generated documents (each with more than 4096 terms) and then
 * checks that exact phrases lifted from a randomly chosen document are found in that document.
 */
public void testRandomPhrases() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random());
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig(analyzer).setMergePolicy(newLogMergePolicy()));
  // Token list of every document added so far; the search phase below compares an index into
  // this list against the doc id of each hit.
  List<List<String>> indexedDocs = new ArrayList<>();
  // A single Document/Field pair is reused; only the field's string value changes per document.
  Document reusedDoc = new Document();
  Field textField = newTextField("f", "", Field.Store.NO);
  reusedDoc.add(textField);
  Random r = random();
  final int numDocs = atLeast(10);
  for (int docIdx = 0; docIdx < numDocs; docIdx++) {
    // must be > 4096 so it spans multiple chunks
    final int termCount = TestUtil.nextInt(random(), 4097, 8200);
    List<String> tokens = new ArrayList<>();
    StringBuilder text = new StringBuilder();
    while (tokens.size() < termCount) {
      if (r.nextInt(5) == 1 || indexedDocs.isEmpty()) {
        // make new non-empty-string term
        String term;
        do {
          term = TestUtil.randomUnicodeString(r);
        } while (term.isEmpty());
        // Record whatever tokens the analyzer produces for the raw random string.
        try (TokenStream ts = analyzer.tokenStream("ignore", term)) {
          CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
          ts.reset();
          while (ts.incrementToken()) {
            String token = termAttr.toString();
            tokens.add(token);
            text.append(token).append(' ');
          }
          ts.end();
        }
      } else {
        // pick existing sub-phrase
        List<String> source = indexedDocs.get(r.nextInt(indexedDocs.size()));
        final int len = TestUtil.nextInt(r, 1, 10);
        final int from = r.nextInt(source.size() - len);
        for (String token : source.subList(from, from + len)) {
          tokens.add(token);
          text.append(token).append(' ');
        }
      }
    }
    indexedDocs.add(tokens);
    textField.setStringValue(text.toString());
    w.addDocument(reusedDoc);
  }
  IndexReader reader = w.getReader();
  IndexSearcher s = newSearcher(reader);
  w.close();
  // now search
  final int iterations = atLeast(10);
  for (int i = 0; i < iterations; i++) {
    final int docID = r.nextInt(indexedDocs.size());
    List<String> tokens = indexedDocs.get(docID);
    final int numTerm = TestUtil.nextInt(r, 2, 20);
    final int start = r.nextInt(tokens.size() - numTerm);
    final int end = start + numTerm;
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    StringBuilder phrase = new StringBuilder();
    // Each term is added at its position within the source document's token list.
    for (int pos = start; pos < end; pos++) {
      String token = tokens.get(pos);
      builder.add(new Term("f", token), pos);
      phrase.append(token).append(' ');
    }
    TopDocs hits = s.search(builder.build(), numDocs);
    // Scan the hits until the source document shows up.
    boolean found = false;
    for (int j = 0; !found && j < hits.scoreDocs.length; j++) {
      found = hits.scoreDocs[j].doc == docID;
    }
    assertTrue("phrase '" + phrase + "' not found; start=" + start + ", it=" + i + ", expected doc " + docID, found);
  }
  reader.close();
  dir.close();
}
Use of org.apache.lucene.index.RandomIndexWriter in project lucene-solr by Apache.
Class TestPhraseQuery, method testPhraseQueryWithStopAnalyzer.
/**
 * Indexes one document through an analyzer configured with the English stop set and checks
 * that the exact phrase "stop words" matches it.
 */
public void testPhraseQueryWithStopAnalyzer() throws Exception {
  Directory dir = newDirectory();
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, newIndexWriterConfig(analyzer));

  Document document = new Document();
  document.add(newTextField("field", "the stop words are here", Field.Store.YES));
  w.addDocument(document);

  // Grab the reader before the writer is closed.
  IndexReader indexReader = w.getReader();
  w.close();
  IndexSearcher indexSearcher = newSearcher(indexReader);

  // valid exact phrase query
  PhraseQuery phrase = new PhraseQuery("field", "stop", "words");
  assertEquals(1, indexSearcher.search(phrase, 1000).scoreDocs.length);
  QueryUtils.check(random(), phrase, indexSearcher);

  indexReader.close();
  dir.close();
}
Use of org.apache.lucene.index.RandomIndexWriter in project lucene-solr by Apache.
Class TestPhraseQuery, method testZeroPosIncr.
/**
 * Tests PhraseQuery with terms at the same position in the query.
 *
 * <p>The indexed field is a canned stream of "a", "aa" (position increment 0) and "b".
 */
public void testZeroPosIncr() throws IOException {
  Directory dir = newDirectory();

  // Token text and position increment, kept in parallel arrays.
  final String[] texts = {"a", "aa", "b"};
  final int[] increments = {1, 0, 1};
  final Token[] tokens = new Token[texts.length];
  for (int i = 0; i < tokens.length; i++) {
    Token token = new Token();
    token.append(texts[i]);
    token.setPositionIncrement(increments[i]);
    tokens[i] = token;
  }

  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  doc.add(new TextField("field", new CannedTokenStream(tokens)));
  writer.addDocument(doc);
  IndexReader r = writer.getReader();
  writer.close();
  IndexSearcher searcher = newSearcher(r);

  final Term a = new Term("field", "a");
  final Term aa = new Term("field", "aa");
  final Term b = new Term("field", "b");
  final Term z = new Term("field", "z");

  // Sanity check; simple "a b" phrase:
  PhraseQuery simple = new PhraseQuery.Builder().add(a, 0).add(b, 1).build();
  assertEquals(1, searcher.search(simple, 1).totalHits);

  // Now with "a|aa b"
  PhraseQuery stacked = new PhraseQuery.Builder().add(a, 0).add(aa, 0).add(b, 1).build();
  assertEquals(1, searcher.search(stacked, 1).totalHits);

  // Now with "a|z b" which should not match; this isn't a MultiPhraseQuery
  PhraseQuery mismatched = new PhraseQuery.Builder().add(a, 0).add(z, 0).add(b, 1).build();
  assertEquals(0, searcher.search(mismatched, 1).totalHits);

  r.close();
  dir.close();
}
Use of org.apache.lucene.index.RandomIndexWriter in project lucene-solr by Apache.
Class TestMultiCollector, method testCollectionTerminatedExceptionHandling.
/**
 * Wraps several hit-counting collectors, each behind a TerminateAfterCollector with its own
 * random limit, into one MultiCollector and checks that every counter ends at
 * min(limit, numDocs) after a match-all search.
 */
public void testCollectionTerminatedExceptionHandling() throws IOException {
  final int rounds = atLeast(3);
  for (int round = 0; round < rounds; ++round) {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    final int numDocs = TestUtil.nextInt(random(), 100, 1000);
    // The same empty document is added numDocs times.
    final Document emptyDoc = new Document();
    for (int added = 0; added < numDocs; ++added) {
      w.addDocument(emptyDoc);
    }
    final IndexReader reader = w.getReader();
    w.close();
    final IndexSearcher searcher = newSearcher(reader);

    Map<TotalHitCountCollector, Integer> expectedCounts = new HashMap<>();
    List<Collector> collectors = new ArrayList<>();
    final int numCollectors = TestUtil.nextInt(random(), 1, 5);
    for (int c = 0; c < numCollectors; ++c) {
      // The limit may exceed numDocs, in which case all documents are expected to be counted.
      final int terminateAfter = random().nextInt(numDocs + 10);
      TotalHitCountCollector counter = new TotalHitCountCollector();
      expectedCounts.put(counter, Math.min(numDocs, terminateAfter));
      collectors.add(new TerminateAfterCollector(counter, terminateAfter));
    }

    searcher.search(new MatchAllDocsQuery(), MultiCollector.wrap(collectors));

    for (Map.Entry<TotalHitCountCollector, Integer> entry : expectedCounts.entrySet()) {
      final int expected = entry.getValue().intValue();
      assertEquals(expected, entry.getKey().getTotalHits());
    }
    reader.close();
    dir.close();
  }
}
Use of org.apache.lucene.index.RandomIndexWriter in project lucene-solr by Apache.
Class TestMultiPhraseQuery, method testTall.
// LUCENE-2580
// Two documents share the prefix "blueberry chocolate"; a multi-phrase query whose last
// position accepts either "pie" or "tart" is expected to match both.
public void testTall() throws IOException {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  for (String body : new String[] { "blueberry chocolate pie", "blueberry chocolate tart" }) {
    add(body, w);
  }
  IndexReader reader = w.getReader();
  w.close();
  IndexSearcher searcher = newSearcher(reader);

  final Term[] desserts = { new Term("body", "pie"), new Term("body", "tart") };
  MultiPhraseQuery query = new MultiPhraseQuery.Builder()
      .add(new Term("body", "blueberry"))
      .add(new Term("body", "chocolate"))
      .add(desserts)
      .build();
  assertEquals(2, searcher.search(query, 1).totalHits);

  reader.close();
  dir.close();
}
Aggregations