Search in sources :

Example 41 with RandomIndexWriter

use of org.apache.lucene.index.RandomIndexWriter in project lucene-solr by apache.

the class TestPhraseQuery method testRandomPhrases.

/**
 * Indexes a batch of large random documents and verifies that phrases sampled from them
 * are found again.
 *
 * <p>Every document is assembled from freshly generated random terms mixed with short
 * sub-phrases copied out of previously built documents. The analyzed terms of each document
 * are recorded so that, after indexing, random runs of consecutive terms can be turned into
 * a {@code PhraseQuery}; the document the run was taken from must be among the hits.
 */
public void testRandomPhrases() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random());
    RandomIndexWriter w = new RandomIndexWriter(
            random(),
            dir,
            newIndexWriterConfig(analyzer).setMergePolicy(newLogMergePolicy()));

    // Term sequences of every indexed document, in indexing order.
    List<List<String>> indexedDocs = new ArrayList<>();

    // A single Document/Field pair is reused; only the field value changes per document.
    Document document = new Document();
    Field field = newTextField("f", "", Field.Store.NO);
    document.add(field);

    Random rnd = random();
    int docCount = atLeast(10);
    for (int docIdx = 0; docIdx < docCount; docIdx++) {
        // must be > 4096 so it spans multiple chunks
        int termCount = TestUtil.nextInt(random(), 4097, 8200);
        List<String> terms = new ArrayList<>();
        StringBuilder text = new StringBuilder();
        while (terms.size() < termCount) {
            // The random draw happens on every pass, even while no earlier document exists.
            boolean rolledNewTerm = rnd.nextInt(5) == 1;
            if (rolledNewTerm || indexedDocs.isEmpty()) {
                // Generate a new term; retry until it is not the empty string.
                String term;
                do {
                    term = TestUtil.randomUnicodeString(rnd);
                } while (term.length() == 0);

                // Record what the analyzer emits for the raw string, token by token.
                try (TokenStream ts = analyzer.tokenStream("ignore", term)) {
                    CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
                    ts.reset();
                    while (ts.incrementToken()) {
                        String token = termAttr.toString();
                        terms.add(token);
                        text.append(token).append(' ');
                    }
                    ts.end();
                }
            } else {
                // Copy a run of 1..10 consecutive terms out of a randomly chosen earlier document.
                List<String> source = indexedDocs.get(rnd.nextInt(indexedDocs.size()));
                int len = TestUtil.nextInt(rnd, 1, 10);
                int start = rnd.nextInt(source.size() - len);
                for (String copied : source.subList(start, start + len)) {
                    terms.add(copied);
                    text.append(copied).append(' ');
                }
            }
        }
        indexedDocs.add(terms);
        field.setStringValue(text.toString());
        w.addDocument(document);
    }

    IndexReader reader = w.getReader();
    IndexSearcher s = newSearcher(reader);
    w.close();

    // now search
    int iterations = atLeast(10);
    for (int iter = 0; iter < iterations; iter++) {
        int expectedDoc = rnd.nextInt(indexedDocs.size());
        List<String> terms = indexedDocs.get(expectedDoc);
        final int numTerm = TestUtil.nextInt(rnd, 2, 20);
        final int start = rnd.nextInt(terms.size() - numTerm);

        // The query terms keep the positions they had in the recorded term list.
        PhraseQuery.Builder builder = new PhraseQuery.Builder();
        StringBuilder phraseText = new StringBuilder();
        for (int offset = 0; offset < numTerm; offset++) {
            int position = start + offset;
            String term = terms.get(position);
            builder.add(new Term("f", term), position);
            phraseText.append(term).append(' ');
        }
        PhraseQuery pq = builder.build();

        TopDocs hits = s.search(pq, docCount);
        boolean found = false;
        for (int j = 0; !found && j < hits.scoreDocs.length; j++) {
            found = hits.scoreDocs[j].doc == expectedDoc;
        }
        assertTrue("phrase '" + phraseText + "' not found; start=" + start + ", it=" + iter + ", expected doc " + expectedDoc, found);
    }

    reader.close();
    dir.close();
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) ArrayList(java.util.ArrayList) Term(org.apache.lucene.index.Term) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Document(org.apache.lucene.document.Document) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Random(java.util.Random) IndexReader(org.apache.lucene.index.IndexReader) ArrayList(java.util.ArrayList) List(java.util.List) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)

Example 42 with RandomIndexWriter

use of org.apache.lucene.index.RandomIndexWriter in project lucene-solr by apache.

the class TestPhraseQuery method testPhraseQueryWithStopAnalyzer.

/**
 * Indexes the text "the stop words are here" with a {@code MockAnalyzer} configured with
 * {@code MockTokenFilter.ENGLISH_STOPSET} and expects the exact phrase "stop words" to
 * match that single document.
 */
public void testPhraseQueryWithStopAnalyzer() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir, newIndexWriterConfig(analyzer));

    Document document = new Document();
    document.add(newTextField("field", "the stop words are here", Field.Store.YES));
    indexWriter.addDocument(document);

    // Grab the reader before the writer is closed.
    IndexReader indexReader = indexWriter.getReader();
    indexWriter.close();
    IndexSearcher indexSearcher = newSearcher(indexReader);

    // valid exact phrase query
    PhraseQuery phrase = new PhraseQuery("field", "stop", "words");
    assertEquals(1, indexSearcher.search(phrase, 1000).scoreDocs.length);
    QueryUtils.check(random(), phrase, indexSearcher);

    indexReader.close();
    dir.close();
}
Also used : MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) IndexReader(org.apache.lucene.index.IndexReader) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Document(org.apache.lucene.document.Document) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)

Example 43 with RandomIndexWriter

use of org.apache.lucene.index.RandomIndexWriter in project lucene-solr by apache.

the class TestPhraseQuery method testZeroPosIncr.

/**
 * Tests PhraseQuery with terms at the same position in the query.
 *
 * <p>The indexed token stream is "a", then "aa" with a position increment of 0, then "b".
 * Three phrase queries are run against it: "a b", "a|aa b" and "a|z b"; only the last one
 * is expected to find nothing.
 */
public void testZeroPosIncr() throws IOException {
    Directory dir = newDirectory();

    // Token text and position increment, kept side by side.
    final String[] tokenTexts = { "a", "aa", "b" };
    final int[] positionIncrements = { 1, 0, 1 };
    final Token[] tokens = new Token[tokenTexts.length];
    for (int i = 0; i < tokens.length; i++) {
        Token token = new Token();
        token.append(tokenTexts[i]);
        token.setPositionIncrement(positionIncrements[i]);
        tokens[i] = token;
    }

    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir);
    Document document = new Document();
    document.add(new TextField("field", new CannedTokenStream(tokens)));
    indexWriter.addDocument(document);
    IndexReader indexReader = indexWriter.getReader();
    indexWriter.close();
    IndexSearcher indexSearcher = newSearcher(indexReader);

    // Sanity check; simple "a b" phrase:
    PhraseQuery.Builder simplePhrase = new PhraseQuery.Builder();
    simplePhrase.add(new Term("field", "a"), 0);
    simplePhrase.add(new Term("field", "b"), 1);
    assertEquals(1, indexSearcher.search(simplePhrase.build(), 1).totalHits);

    // Now with "a|aa b"
    PhraseQuery.Builder stackedPhrase = new PhraseQuery.Builder();
    stackedPhrase.add(new Term("field", "a"), 0);
    stackedPhrase.add(new Term("field", "aa"), 0);
    stackedPhrase.add(new Term("field", "b"), 1);
    assertEquals(1, indexSearcher.search(stackedPhrase.build(), 1).totalHits);

    // Now with "a|z b" which should not match; this isn't a MultiPhraseQuery
    PhraseQuery.Builder unmatchedPhrase = new PhraseQuery.Builder();
    unmatchedPhrase.add(new Term("field", "a"), 0);
    unmatchedPhrase.add(new Term("field", "z"), 0);
    unmatchedPhrase.add(new Term("field", "b"), 1);
    assertEquals(0, indexSearcher.search(unmatchedPhrase.build(), 1).totalHits);

    indexReader.close();
    dir.close();
}
Also used : Token(org.apache.lucene.analysis.Token) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) IndexReader(org.apache.lucene.index.IndexReader) TextField(org.apache.lucene.document.TextField) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)

Example 44 with RandomIndexWriter

use of org.apache.lucene.index.RandomIndexWriter in project lucene-solr by apache.

the class TestMultiCollector method testCollectionTerminatedExceptionHandling.

/**
 * Runs several hit-counting collectors, each wrapped in a {@code TerminateAfterCollector}
 * with its own random limit, through a single {@code MultiCollector} and checks that every
 * counter ends up with the smaller of its limit and the number of indexed documents.
 */
public void testCollectionTerminatedExceptionHandling() throws IOException {
    final int rounds = atLeast(3);
    for (int round = 0; round < rounds; round++) {
        Directory dir = newDirectory();
        RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir);

        // Index the same empty document a random number of times.
        final int numDocs = TestUtil.nextInt(random(), 100, 1000);
        final Document emptyDoc = new Document();
        for (int added = 0; added < numDocs; added++) {
            indexWriter.addDocument(emptyDoc);
        }
        final IndexReader indexReader = indexWriter.getReader();
        indexWriter.close();
        final IndexSearcher indexSearcher = newSearcher(indexReader);

        // Remember, per counting collector, how many hits it is expected to report.
        Map<TotalHitCountCollector, Integer> expectedByCounter = new HashMap<>();
        List<Collector> wrapped = new ArrayList<>();
        final int numCollectors = TestUtil.nextInt(random(), 1, 5);
        for (int c = 0; c < numCollectors; c++) {
            // The limit may exceed numDocs, in which case all documents are expected.
            final int terminateAfter = random().nextInt(numDocs + 10);
            final int expectedCount = Math.min(terminateAfter, numDocs);
            TotalHitCountCollector counter = new TotalHitCountCollector();
            expectedByCounter.put(counter, expectedCount);
            wrapped.add(new TerminateAfterCollector(counter, terminateAfter));
        }

        indexSearcher.search(new MatchAllDocsQuery(), MultiCollector.wrap(wrapped));

        for (Map.Entry<TotalHitCountCollector, Integer> entry : expectedByCounter.entrySet()) {
            TotalHitCountCollector counter = entry.getKey();
            Integer expected = entry.getValue();
            assertEquals(expected.intValue(), counter.getTotalHits());
        }

        indexReader.close();
        dir.close();
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Document(org.apache.lucene.document.Document) IndexReader(org.apache.lucene.index.IndexReader) HashMap(java.util.HashMap) Map(java.util.Map) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)

Example 45 with RandomIndexWriter

use of org.apache.lucene.index.RandomIndexWriter in project lucene-solr by apache.

the class TestMultiPhraseQuery method testTall.

// LUCENE-2580
// Adds two texts that differ only in their last word and expects a MultiPhraseQuery
// "blueberry chocolate (pie|tart)" to report both of them.
// NOTE(review): presumably the add(...) helper indexes the text into the "body" field; confirm.
public void testTall() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir);
    add("blueberry chocolate pie", indexWriter);
    add("blueberry chocolate tart", indexWriter);
    IndexReader indexReader = indexWriter.getReader();
    indexWriter.close();
    IndexSearcher indexSearcher = newSearcher(indexReader);

    // Alternatives accepted at the last phrase position.
    Term[] desserts = new Term[] { new Term("body", "pie"), new Term("body", "tart") };

    MultiPhraseQuery.Builder phrase = new MultiPhraseQuery.Builder();
    phrase.add(new Term("body", "blueberry"));
    phrase.add(new Term("body", "chocolate"));
    phrase.add(desserts);

    assertEquals(2, indexSearcher.search(phrase.build(), 1).totalHits);

    indexReader.close();
    dir.close();
}
Also used : IndexReader(org.apache.lucene.index.IndexReader) Term(org.apache.lucene.index.Term) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory)

Aggregations

RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 775; Document (org.apache.lucene.document.Document): 675; Directory (org.apache.lucene.store.Directory): 584; IndexReader (org.apache.lucene.index.IndexReader): 508; Term (org.apache.lucene.index.Term): 324; IndexSearcher (org.apache.lucene.search.IndexSearcher): 294; MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 220; BytesRef (org.apache.lucene.util.BytesRef): 142; Field (org.apache.lucene.document.Field): 140; MatchAllDocsQuery (org.apache.lucene.search.MatchAllDocsQuery): 136; TopDocs (org.apache.lucene.search.TopDocs): 134; TermQuery (org.apache.lucene.search.TermQuery): 121; DirectoryReader (org.apache.lucene.index.DirectoryReader): 119; IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 110; ArrayList (java.util.ArrayList): 91; StringField (org.apache.lucene.document.StringField): 89; Analyzer (org.apache.lucene.analysis.Analyzer): 88; BooleanQuery (org.apache.lucene.search.BooleanQuery): 88; NumericDocValuesField (org.apache.lucene.document.NumericDocValuesField): 76; Query (org.apache.lucene.search.Query): 73