use of org.apache.lucene.search.PhraseQuery in project greplin-lucene-utils by Cue.
the class PhraseFilterBenchmark method main.
/**
 * Benchmarks {@code PhraseFilter} against an equivalent {@code PhraseQuery}.
 *
 * <p>The method builds an in-memory index containing {@code TOTAL_DOCS} documents, written in
 * {@code NUMBER_OF_SEGMENTS} randomly sized batches with a commit after each batch. It then
 * generates {@code TOTAL_QUERIES} random phrases and, for each of {@code ROUNDS} rounds, times
 * both strategies over all phrases. The order in which the two strategies run alternates
 * between rounds. Timings and hit counts are printed to standard output.
 *
 * <p>Any {@link IOException} is printed to standard error; the writer, reader and searcher
 * are always closed.
 *
 * @param argv ignored
 */
public static void main(String[] argv) {
    Directory directory = new RAMDirectory();
    try {
        IndexWriter writer = new IndexWriter(directory,
                new IndexWriterConfig(Version.LUCENE_32, new WhitespaceAnalyzer(Version.LUCENE_32)));
        try {
            int done = 0;
            for (int i = 0; i < NUMBER_OF_SEGMENTS; i++) {
                int remaining = NUMBER_OF_SEGMENTS - i;
                int numberOfDocs;
                if (remaining == 1) {
                    // The last batch takes whatever is left so the total is exactly TOTAL_DOCS.
                    numberOfDocs = TOTAL_DOCS - done;
                } else {
                    // Leave at least one document for each batch still to come.
                    numberOfDocs = RANDOM.nextInt(TOTAL_DOCS - done - remaining) + 1;
                }
                done += numberOfDocs;
                System.out.println("Segment #" + i + " has " + numberOfDocs + " docs");
                for (int d = 0; d < numberOfDocs; d++) {
                    int wordCount = RANDOM.nextInt(WORDS_PER_DOC_DEVIATION * 2)
                            + AVERAGE_WORDS_PER_DOC - WORDS_PER_DOC_DEVIATION;
                    Document doc = new Document();
                    // Index under FIELD (not a literal) so the documents and the queries
                    // below are guaranteed to target the same field.
                    doc.add(new Field(FIELD, Joiner.on(' ').join(words(wordCount)),
                            Field.Store.YES, Field.Index.ANALYZED));
                    doc.add(new Field("second",
                            RANDOM.nextInt(100) < SECOND_FIELD_MATCH_PERCENTAGE ? "yes" : "no",
                            Field.Store.NO, Field.Index.ANALYZED));
                    writer.addDocument(doc);
                }
                writer.commit();
            }
        } finally {
            // Close the writer even when indexing fails part-way.
            writer.close();
        }

        IndexReader reader = IndexReader.open(directory);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            try {
                String[][] queries = new String[TOTAL_QUERIES][];
                Term[][] terms = new Term[TOTAL_QUERIES][];
                for (int q = 0; q < TOTAL_QUERIES; q++) {
                    queries[q] = words(WORDS_PER_QUERY[RANDOM.nextInt(WORDS_PER_QUERY.length)]);
                    terms[q] = new Term[queries[q].length];
                    for (int qw = 0; qw < queries[q].length; qw++) {
                        terms[q][qw] = new Term(FIELD, queries[q][qw]);
                    }
                }

                // Warm up.
                new PhraseFilter(FIELD, queries[0]).getDocIdSet(reader);

                for (int round = 0; round < ROUNDS; round++) {
                    System.out.println();
                    String name1 = "filter";
                    String name2 = "query";
                    long ms1 = 0, ms2 = 0;
                    for (int step = 0; step < 2; step++) {
                        System.gc();
                        System.gc();
                        System.gc();
                        // Even rounds run the filter first, odd rounds run the query first.
                        if (step == (round & 1)) {
                            long millis = System.currentTimeMillis();
                            long hits = 0;
                            for (String[] queryWords : queries) {
                                PhraseFilter pf = new PhraseFilter(
                                        new FilterIntersectionProvider(
                                                TermsFilter.from(new Term("second", "yes"))),
                                        FIELD, queryWords);
                                hits += searcher.search(
                                        new FilteredQuery(new MatchAllDocsQuery(), pf), 1).totalHits;
                            }
                            ms1 = System.currentTimeMillis() - millis;
                            System.out.println("Finished " + name1 + " in " + ms1 + "ms with " + hits + " hits");
                        } else {
                            long millis = System.currentTimeMillis();
                            long hits = 0;
                            for (Term[] queryTerms : terms) {
                                PhraseQuery pq = new PhraseQuery();
                                for (Term term : queryTerms) {
                                    pq.add(term);
                                }
                                Query query = BooleanQueryBuilder.builder()
                                        .must(new TermQuery(new Term("second", "yes")))
                                        .must(pq)
                                        .build();
                                hits += searcher.search(query, 1).totalHits;
                            }
                            ms2 = System.currentTimeMillis() - millis;
                            System.out.println("Finished " + name2 + " in " + ms2 + "ms with " + hits + " hits");
                        }
                    }
                    if (ms2 > 0) {
                        System.out.println(name1 + " took " + (int) ((100.0 * ms1) / ms2)
                                + "% as much time as " + name2);
                    } else {
                        // A zero denominator would print a meaningless percentage.
                        System.out.println(name2 + " finished in under 1ms; cannot compute a ratio for " + name1);
                    }
                }
            } finally {
                searcher.close();
            }
        } finally {
            reader.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
use of org.apache.lucene.search.PhraseQuery in project crate by crate.
the class MultiMatchQuery method blendPhrase.
/**
 * Expands a {@link PhraseQuery} to multiple fields that share the same analyzer.
 *
 * <p>For every entry in {@code fields} a new phrase query is built with the same term bytes,
 * the same positions and the same slop as {@code query}, but targeting that entry's field
 * name. A per-field query is wrapped in a {@link BoostQuery} when the field's boost differs
 * from {@code MultiMatchQuery.DEFAULT_BOOST}.
 *
 * @param query      the phrase query to copy terms, positions and slop from
 * @param tiebreaker the tie-breaker passed to the resulting {@link DisjunctionMaxQuery}
 * @param fields     the fields to expand the phrase to
 * @return a {@link DisjunctionMaxQuery} with one disjunct per expanded field
 */
static Query blendPhrase(PhraseQuery query, float tiebreaker, FieldAndFieldType... fields) {
    // These do not depend on the target field, so read them once.
    final Term[] terms = query.getTerms();
    final int[] positions = query.getPositions();
    final int slop = query.getSlop();

    List<Query> disjunctions = new ArrayList<>(fields.length);
    for (FieldAndFieldType field : fields) {
        PhraseQuery.Builder builder = new PhraseQuery.Builder();
        // Carry the slop over; a fresh builder would otherwise reset it to its default.
        builder.setSlop(slop);
        for (int i = 0; i < terms.length; i++) {
            builder.add(new Term(field.fieldType.name(), terms[i].bytes()), positions[i]);
        }
        Query q = builder.build();
        if (field.boost != MultiMatchQuery.DEFAULT_BOOST) {
            q = new BoostQuery(q, field.boost);
        }
        disjunctions.add(q);
    }
    return new DisjunctionMaxQuery(disjunctions, tiebreaker);
}
use of org.apache.lucene.search.PhraseQuery in project zm-mailbox by Zimbra.
the class AbstractIndexStoreTest method phraseQueryWithStopWord.
/**
 * Searches for the phrase "william conqueror" (the stop word "the" between them is left out
 * of the query) and expects exactly one hit: the contact whose extra text contains
 * "William the conqueror".
 */
@Test
public void phraseQueryWithStopWord() throws Exception {
    ZimbraLog.test.debug("--->TEST phraseQueryWithStopWord");
    final Mailbox mailbox =
            MailboxManager.getInstance().getMailboxByAccountId(MockProvisioning.DEFAULT_ACCOUNT_ID);

    // Three contacts; only the second carries text containing the phrase.
    createContact(mailbox, "Non", "Match", "nOn.MaTchiNg@zimbra.com");
    final Contact target = createContact(mailbox, "First", "Last", "f.last@zimbra.com",
            "1066 and all that with William the conqueror and others");
    createContact(mailbox, "Given", "Surname", "GiV.SurN@zimbra.com");

    // Make sure all indexing has been done before searching.
    mailbox.index.indexDeferredItems();
    final ZimbraIndexSearcher searcher = mailbox.index.getIndexStore().openSearcher();

    // Terms must be lower case for Lucene. "the" is deliberately absent from the phrase
    // because it is a stop word.
    final PhraseQuery phrase = new PhraseQuery();
    for (String word : new String[] {"william", "conqueror"}) {
        phrase.add(new Term(LuceneFields.L_CONTENT, word));
    }

    final ZimbraTopDocs topDocs = searcher.search(phrase, 100);
    Assert.assertNotNull("searcher.search result object", topDocs);
    ZimbraLog.test.debug("Result for search [hits=%d]:%s", topDocs.getTotalHits(), topDocs.toString());
    Assert.assertEquals("Number of hits", 1, topDocs.getTotalHits());
    Assert.assertEquals("Mailbox Blob ID of match",
            String.valueOf(target.getId()),
            getBlobIdForResultDoc(searcher, topDocs, 0));
}
use of org.apache.lucene.search.PhraseQuery in project textdb by TextDB.
the class KeywordMatcherSourceOperator method buildPhraseQuery.
/**
 * Builds the Lucene query for a phrase-style keyword predicate.
 *
 * <p>One optional (SHOULD) clause is added per attribute named by the predicate:
 * <ul>
 *   <li>STRING attributes get a {@link TermQuery} on the raw query string;</li>
 *   <li>TEXT attributes get a {@link TermQuery} on the lower-cased query string when
 *       {@code queryTokenList} holds a single token, otherwise a {@link PhraseQuery} made of
 *       the lower-cased tokens of {@code queryTokensWithStopwords}. Stop words are skipped,
 *       but each remaining token keeps its original index as its phrase position.</li>
 * </ul>
 *
 * @return a boolean query combining the per-attribute clauses
 * @throws DataflowException if an attribute is neither STRING nor TEXT
 */
private Query buildPhraseQuery() throws DataflowException {
    BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder();
    for (String attributeName : this.predicate.getAttributeNames()) {
        AttributeType attributeType = this.inputSchema.getAttribute(attributeName).getType();

        if (attributeType == AttributeType.STRING) {
            Query termQuery = new TermQuery(new Term(attributeName, predicate.getQuery()));
            booleanQueryBuilder.add(termQuery, BooleanClause.Occur.SHOULD);
        } else if (attributeType == AttributeType.TEXT) {
            if (queryTokenList.size() == 1) {
                // Locale.ROOT keeps the casing independent of the JVM's default locale.
                Query termQuery = new TermQuery(
                        new Term(attributeName, predicate.getQuery().toLowerCase(java.util.Locale.ROOT)));
                booleanQueryBuilder.add(termQuery, BooleanClause.Occur.SHOULD);
            } else {
                PhraseQuery.Builder phraseQueryBuilder = new PhraseQuery.Builder();
                for (int i = 0; i < queryTokensWithStopwords.size(); i++) {
                    // Lower-case first so the stop-word check sees the same text that is
                    // added as the term; otherwise a stop word such as "The" would be
                    // added to the phrase.
                    String token = queryTokensWithStopwords.get(i).toLowerCase(java.util.Locale.ROOT);
                    if (!StandardAnalyzer.STOP_WORDS_SET.contains(token)) {
                        // Position i leaves gaps where stop words were skipped.
                        phraseQueryBuilder.add(new Term(attributeName, token), i);
                    }
                }
                PhraseQuery phraseQuery = phraseQueryBuilder.build();
                booleanQueryBuilder.add(phraseQuery, BooleanClause.Occur.SHOULD);
            }
        } else {
            // types other than TEXT and STRING: throw Exception for now
            throw new DataflowException("KeywordPredicate: Fields other than STRING and TEXT are not supported yet");
        }
    }
    return booleanQueryBuilder.build();
}
use of org.apache.lucene.search.PhraseQuery in project elasticsearch by elastic.
the class QueryStringQueryBuilderTests method testToQueryPhraseQuery.
/**
 * Checks that a quoted two-word query string with a phrase slop of 3 is turned into a
 * {@link DisjunctionMaxQuery} whose single disjunct is a {@link PhraseQuery} with terms
 * "term1" and "term2" on the default field and slop 3.
 */
public void testToQueryPhraseQuery() throws IOException {
    assumeTrue("test runs only when at least a type is registered", getCurrentTypes().length > 0);

    Query parsed = queryStringQuery("\"term1 term2\"")
            .defaultField(STRING_FIELD_NAME)
            .phraseSlop(3)
            .toQuery(createShardContext());

    // Outer query: a dis-max with exactly one disjunct.
    assertThat(parsed, instanceOf(DisjunctionMaxQuery.class));
    DisjunctionMaxQuery disMax = (DisjunctionMaxQuery) parsed;
    assertThat(disMax.getDisjuncts().size(), equalTo(1));

    // Inner query: the phrase itself.
    Query onlyDisjunct = disMax.getDisjuncts().get(0);
    assertThat(onlyDisjunct, instanceOf(PhraseQuery.class));
    PhraseQuery phrase = (PhraseQuery) onlyDisjunct;

    Term[] phraseTerms = phrase.getTerms();
    assertThat(phraseTerms.length, equalTo(2));
    assertThat(phraseTerms[0], equalTo(new Term(STRING_FIELD_NAME, "term1")));
    assertThat(phraseTerms[1], equalTo(new Term(STRING_FIELD_NAME, "term2")));
    assertThat(phrase.getSlop(), equalTo(3));
}
Aggregations