Search in sources :

Example 51 with PhraseQuery

use of org.apache.lucene.search.PhraseQuery in project greplin-lucene-utils by Cue.

the class PhraseFilterBenchmark method main.

public static void main(String[] argv) {
    Directory directory = new RAMDirectory();
    try {
        IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(Version.LUCENE_32, new WhitespaceAnalyzer(Version.LUCENE_32)));
        int done = 0;
        for (int i = 0; i < NUMBER_OF_SEGMENTS; i++) {
            int remaining = NUMBER_OF_SEGMENTS - i;
            int numberOfDocs;
            if (remaining == 1) {
                numberOfDocs = TOTAL_DOCS - done;
            } else {
                numberOfDocs = RANDOM.nextInt(TOTAL_DOCS - done - remaining) + 1;
            }
            done += numberOfDocs;
            System.out.println("Segment #" + i + " has " + numberOfDocs + " docs");
            for (int d = 0; d < numberOfDocs; d++) {
                int wordCount = RANDOM.nextInt(WORDS_PER_DOC_DEVIATION * 2) + AVERAGE_WORDS_PER_DOC - WORDS_PER_DOC_DEVIATION;
                Document doc = new Document();
                doc.add(new Field("f", Joiner.on(' ').join(words(wordCount)), Field.Store.YES, Field.Index.ANALYZED));
                doc.add(new Field("second", RANDOM.nextInt(100) < SECOND_FIELD_MATCH_PERCENTAGE ? "yes" : "no", Field.Store.NO, Field.Index.ANALYZED));
                writer.addDocument(doc);
            }
            writer.commit();
        }
        writer.close();
        IndexReader reader = IndexReader.open(directory);
        IndexSearcher searcher = new IndexSearcher(reader);
        String[][] queries = new String[TOTAL_QUERIES][];
        Term[][] terms = new Term[TOTAL_QUERIES][];
        for (int q = 0; q < TOTAL_QUERIES; q++) {
            queries[q] = words(WORDS_PER_QUERY[RANDOM.nextInt(WORDS_PER_QUERY.length)]);
            terms[q] = new Term[queries[q].length];
            for (int qw = 0; qw < queries[q].length; qw++) {
                terms[q][qw] = new Term(FIELD, queries[q][qw]);
            }
        }
        // Warm up.
        new PhraseFilter(FIELD, queries[0]).getDocIdSet(reader);
        for (int round = 0; round < ROUNDS; round++) {
            System.out.println();
            String name1 = "filter";
            String name2 = "query";
            long ms1 = 0, ms2 = 0;
            for (int step = 0; step < 2; step++) {
                System.gc();
                System.gc();
                System.gc();
                if (step == (round & 1)) {
                    long millis = System.currentTimeMillis();
                    long hits = 0;
                    for (String[] queryWords : queries) {
                        PhraseFilter pf = new PhraseFilter(new FilterIntersectionProvider(TermsFilter.from(new Term("second", "yes"))), FIELD, queryWords);
                        hits += searcher.search(new FilteredQuery(new MatchAllDocsQuery(), pf), 1).totalHits;
                    }
                    ms1 = System.currentTimeMillis() - millis;
                    System.out.println("Finished " + name1 + " in " + ms1 + "ms with " + hits + " hits");
                } else {
                    long millis = System.currentTimeMillis();
                    long hits = 0;
                    for (Term[] queryTerms : terms) {
                        PhraseQuery pq = new PhraseQuery();
                        for (Term term : queryTerms) {
                            pq.add(term);
                        }
                        Query query = BooleanQueryBuilder.builder().must(new TermQuery(new Term("second", "yes"))).must(pq).build();
                        hits += searcher.search(query, 1).totalHits;
                    }
                    ms2 = System.currentTimeMillis() - millis;
                    System.out.println("Finished " + name2 + " in " + ms2 + "ms with " + hits + " hits");
                }
            }
            System.out.println(name1 + " took " + (int) ((100.0 * ms1) / ms2) + "% as much time as " + name2);
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Query(org.apache.lucene.search.Query) FilteredQuery(org.apache.lucene.search.FilteredQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) TermQuery(org.apache.lucene.search.TermQuery) Document(org.apache.lucene.document.Document) FilteredQuery(org.apache.lucene.search.FilteredQuery) Field(org.apache.lucene.document.Field) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory) WhitespaceAnalyzer(org.apache.lucene.analysis.WhitespaceAnalyzer) TermQuery(org.apache.lucene.search.TermQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) FilterIntersectionProvider(com.greplin.lucene.util.FilterIntersectionProvider) Term(org.apache.lucene.index.Term) IOException(java.io.IOException) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) RAMDirectory(org.apache.lucene.store.RAMDirectory) IndexWriter(org.apache.lucene.index.IndexWriter) IndexReader(org.apache.lucene.index.IndexReader) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 52 with PhraseQuery

use of org.apache.lucene.search.PhraseQuery in project crate by crate.

the class MultiMatchQuery method blendPhrase.

/**
 * Expand a {@link PhraseQuery} to multiple fields that share the same analyzer.
 * Returns a {@link DisjunctionMaxQuery} with a disjunction for each expanded field.
 */
static Query blendPhrase(PhraseQuery query, float tiebreaker, FieldAndFieldType... fields) {
    List<Query> disjunctions = new ArrayList<>();
    for (FieldAndFieldType field : fields) {
        int[] positions = query.getPositions();
        Term[] terms = query.getTerms();
        PhraseQuery.Builder builder = new PhraseQuery.Builder();
        for (int i = 0; i < terms.length; i++) {
            builder.add(new Term(field.fieldType.name(), terms[i].bytes()), positions[i]);
        }
        Query q = builder.build();
        if (field.boost != MultiMatchQuery.DEFAULT_BOOST) {
            q = new BoostQuery(q, field.boost);
        }
        disjunctions.add(q);
    }
    return new DisjunctionMaxQuery(disjunctions, tiebreaker);
}
Also used : Query(org.apache.lucene.search.Query) MatchNoDocsQuery(org.apache.lucene.search.MatchNoDocsQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) BlendedTermQuery(org.apache.lucene.queries.BlendedTermQuery) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) TermQuery(org.apache.lucene.search.TermQuery) BoostQuery(org.apache.lucene.search.BoostQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) ArrayList(java.util.ArrayList) Term(org.apache.lucene.index.Term) BoostQuery(org.apache.lucene.search.BoostQuery)

Example 53 with PhraseQuery

use of org.apache.lucene.search.PhraseQuery in project zm-mailbox by Zimbra.

the class AbstractIndexStoreTest method phraseQueryWithStopWord.

@Test
public void phraseQueryWithStopWord() throws Exception {
    ZimbraLog.test.debug("--->TEST phraseQueryWithStopWord");
    Mailbox mbox = MailboxManager.getInstance().getMailboxByAccountId(MockProvisioning.DEFAULT_ACCOUNT_ID);
    createContact(mbox, "Non", "Match", "nOn.MaTchiNg@zimbra.com");
    Contact contact2 = createContact(mbox, "First", "Last", "f.last@zimbra.com", "1066 and all that with William the conqueror and others");
    createContact(mbox, "Given", "Surname", "GiV.SurN@zimbra.com");
    // Make sure all indexing has been done
    mbox.index.indexDeferredItems();
    IndexStore index = mbox.index.getIndexStore();
    ZimbraIndexSearcher searcher = index.openSearcher();
    PhraseQuery pquery = new PhraseQuery();
    // Lower case required for each term for Lucene
    pquery.add(new Term(LuceneFields.L_CONTENT, "william"));
    // pquery.add(new Term(LuceneFields.L_CONTENT, "the")); - excluded because it is a stop word
    pquery.add(new Term(LuceneFields.L_CONTENT, "conqueror"));
    ZimbraTopDocs result = searcher.search(pquery, 100);
    Assert.assertNotNull("searcher.search result object", result);
    ZimbraLog.test.debug("Result for search [hits=%d]:%s", result.getTotalHits(), result.toString());
    Assert.assertEquals("Number of hits", 1, result.getTotalHits());
    String expected1Id = String.valueOf(contact2.getId());
    String match1Id = getBlobIdForResultDoc(searcher, result, 0);
    Assert.assertEquals("Mailbox Blob ID of match", expected1Id, match1Id);
}
Also used : Mailbox(com.zimbra.cs.mailbox.Mailbox) PhraseQuery(org.apache.lucene.search.PhraseQuery) MultiPhraseQuery(org.apache.lucene.search.MultiPhraseQuery) Term(org.apache.lucene.index.Term) Contact(com.zimbra.cs.mailbox.Contact) ParsedContact(com.zimbra.cs.mime.ParsedContact) Test(org.junit.Test)

Example 54 with PhraseQuery

use of org.apache.lucene.search.PhraseQuery in project textdb by TextDB.

the class KeywordMatcherSourceOperator method buildPhraseQuery.

private Query buildPhraseQuery() throws DataflowException {
    BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getType();
        // types other than TEXT and STRING: throw Exception for now
        if (attributeType != AttributeType.STRING && attributeType != AttributeType.TEXT) {
            throw new DataflowException("KeywordPredicate: Fields other than STRING and TEXT are not supported yet");
        }
        if (attributeType == AttributeType.STRING) {
            Query termQuery = new TermQuery(new Term(attributeName, predicate.getQuery()));
            booleanQueryBuilder.add(termQuery, BooleanClause.Occur.SHOULD);
        }
        if (attributeType == AttributeType.TEXT) {
            if (queryTokenList.size() == 1) {
                Query termQuery = new TermQuery(new Term(attributeName, predicate.getQuery().toLowerCase()));
                booleanQueryBuilder.add(termQuery, BooleanClause.Occur.SHOULD);
            } else {
                PhraseQuery.Builder phraseQueryBuilder = new PhraseQuery.Builder();
                for (int i = 0; i < queryTokensWithStopwords.size(); i++) {
                    if (!StandardAnalyzer.STOP_WORDS_SET.contains(queryTokensWithStopwords.get(i))) {
                        phraseQueryBuilder.add(new Term(attributeName, queryTokensWithStopwords.get(i).toLowerCase()), i);
                    }
                }
                PhraseQuery phraseQuery = phraseQueryBuilder.build();
                booleanQueryBuilder.add(phraseQuery, BooleanClause.Occur.SHOULD);
            }
        }
    }
    return booleanQueryBuilder.build();
}
Also used : BooleanQuery(org.apache.lucene.search.BooleanQuery) TermQuery(org.apache.lucene.search.TermQuery) Query(org.apache.lucene.search.Query) PhraseQuery(org.apache.lucene.search.PhraseQuery) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) AttributeType(edu.uci.ics.texera.api.schema.AttributeType) DataflowException(edu.uci.ics.texera.api.exception.DataflowException) Term(org.apache.lucene.index.Term)

Example 55 with PhraseQuery

use of org.apache.lucene.search.PhraseQuery in project elasticsearch by elastic.

the class QueryStringQueryBuilderTests method testToQueryPhraseQuery.

public void testToQueryPhraseQuery() throws IOException {
    assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);
    Query query = queryStringQuery("\"term1 term2\"").defaultField(STRING_FIELD_NAME).phraseSlop(3).toQuery(createShardContext());
    assertThat(query, instanceOf(DisjunctionMaxQuery.class));
    DisjunctionMaxQuery disjunctionMaxQuery = (DisjunctionMaxQuery) query;
    assertThat(disjunctionMaxQuery.getDisjuncts().size(), equalTo(1));
    assertThat(disjunctionMaxQuery.getDisjuncts().get(0), instanceOf(PhraseQuery.class));
    PhraseQuery phraseQuery = (PhraseQuery) disjunctionMaxQuery.getDisjuncts().get(0);
    assertThat(phraseQuery.getTerms().length, equalTo(2));
    assertThat(phraseQuery.getTerms()[0], equalTo(new Term(STRING_FIELD_NAME, "term1")));
    assertThat(phraseQuery.getTerms()[1], equalTo(new Term(STRING_FIELD_NAME, "term2")));
    assertThat(phraseQuery.getSlop(), equalTo(3));
}
Also used : Query(org.apache.lucene.search.Query) MatchNoDocsQuery(org.apache.lucene.search.MatchNoDocsQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) RegexpQuery(org.apache.lucene.search.RegexpQuery) MultiTermQuery(org.apache.lucene.search.MultiTermQuery) AllTermQuery(org.elasticsearch.common.lucene.all.AllTermQuery) PrefixQuery(org.apache.lucene.search.PrefixQuery) FuzzyQuery(org.apache.lucene.search.FuzzyQuery) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) WildcardQuery(org.apache.lucene.search.WildcardQuery) ElasticsearchAssertions.assertBooleanSubQuery(org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertBooleanSubQuery) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) ConstantScoreQuery(org.apache.lucene.search.ConstantScoreQuery) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) TermQuery(org.apache.lucene.search.TermQuery) SynonymQuery(org.apache.lucene.search.SynonymQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) QueryBuilders.queryStringQuery(org.elasticsearch.index.query.QueryBuilders.queryStringQuery) BoostQuery(org.apache.lucene.search.BoostQuery) TermRangeQuery(org.apache.lucene.search.TermRangeQuery) SpanOrQuery(org.apache.lucene.search.spans.SpanOrQuery) PhraseQuery(org.apache.lucene.search.PhraseQuery) DisjunctionMaxQuery(org.apache.lucene.search.DisjunctionMaxQuery) Term(org.apache.lucene.index.Term)

Aggregations

PhraseQuery (org.apache.lucene.search.PhraseQuery)105 Term (org.apache.lucene.index.Term)56 TermQuery (org.apache.lucene.search.TermQuery)43 BooleanQuery (org.apache.lucene.search.BooleanQuery)39 MultiPhraseQuery (org.apache.lucene.search.MultiPhraseQuery)37 Document (org.apache.lucene.document.Document)36 Query (org.apache.lucene.search.Query)30 Directory (org.apache.lucene.store.Directory)26 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)25 IndexSearcher (org.apache.lucene.search.IndexSearcher)22 IndexReader (org.apache.lucene.index.IndexReader)20 Field (org.apache.lucene.document.Field)17 TextField (org.apache.lucene.document.TextField)16 TopDocs (org.apache.lucene.search.TopDocs)16 SpanTermQuery (org.apache.lucene.search.spans.SpanTermQuery)16 TokenStream (org.apache.lucene.analysis.TokenStream)15 BoostQuery (org.apache.lucene.search.BoostQuery)14 IndexWriter (org.apache.lucene.index.IndexWriter)13 ArrayList (java.util.ArrayList)11 CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)10