use of org.elasticsearch.search.suggest.phrase.PhraseSuggestionBuilder in project elasticsearch by elastic.
the class SuggestSearchIT method testSuggestWithManyCandidates.
public void testSuggestWithManyCandidates() throws InterruptedException, ExecutionException, IOException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder().put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, // A single shard will help to keep the tests repeatable.
1).put("index.analysis.analyzer.text.tokenizer", "standard").putArray("index.analysis.analyzer.text.filter", "lowercase", "my_shingle").put("index.analysis.filter.my_shingle.type", "shingle").put("index.analysis.filter.my_shingle.output_unigrams", true).put("index.analysis.filter.my_shingle.min_shingle_size", 2).put("index.analysis.filter.my_shingle.max_shingle_size", 3));
XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties").startObject("title").field("type", "text").field("analyzer", "text").endObject().endObject().endObject().endObject();
assertAcked(builder.addMapping("type1", mapping));
ensureGreen();
List<String> titles = new ArrayList<>();
// We're going to be searching for:
// united states house of representatives elections in washington 2006
// But we need to make sure we generate a ton of suggestions so we add a bunch of candidates.
// Many of these candidates are drawn from page names on English Wikipedia.
// Tons of different options very near the exact query term
titles.add("United States House of Representatives Elections in Washington 1789");
for (int year = 1790; year < 2014; year += 2) {
titles.add("United States House of Representatives Elections in Washington " + year);
}
// since 0. Why not?
for (int year = 0; year < 2015; year++) {
titles.add(Integer.toString(year));
}
// That ought to provide more less good candidates for the last term
// Now remove or add plural copies of every term we can
titles.add("State");
titles.add("Houses of Parliament");
titles.add("Representative Government");
titles.add("Election");
// Now some possessive
titles.add("Washington's Birthday");
// And some conjugation
titles.add("Unified Modeling Language");
titles.add("Unite Against Fascism");
titles.add("Stated Income Tax");
titles.add("Media organizations housed within colleges");
// And other stuff
titles.add("Untied shoelaces");
titles.add("Unit circle");
titles.add("Untitled");
titles.add("Unicef");
titles.add("Unrated");
titles.add("UniRed");
// Highway in Malaysia
titles.add("Jalan Uniten–Dengkil");
titles.add("UNITAS");
titles.add("UNITER");
titles.add("Un-Led-Ed");
titles.add("STATS LLC");
titles.add("Staples");
titles.add("Skates");
titles.add("Statues of the Liberators");
titles.add("Staten Island");
titles.add("Statens Museum for Kunst");
// The last name or the German word, whichever.
titles.add("Hause");
titles.add("Hose");
titles.add("Hoses");
titles.add("Howse Peak");
titles.add("The Hoose-Gow");
titles.add("Hooser");
titles.add("Electron");
titles.add("Electors");
titles.add("Evictions");
titles.add("Coronal mass ejection");
// A film?
titles.add("Wasington");
// A town in England
titles.add("Warrington");
// Lots of places have this name
titles.add("Waddington");
// Ditto
titles.add("Watlington");
// Yup, also a town
titles.add("Waplington");
// Book
titles.add("Washing of the Spears");
for (char c = 'A'; c <= 'Z'; c++) {
// Can't forget lists, glorious lists!
titles.add("List of former members of the United States House of Representatives (" + c + ")");
// Lots of people are named Washington <Middle Initial>. LastName
titles.add("Washington " + c + ". Lastname");
// Lets just add some more to be evil
titles.add("United " + c);
titles.add("States " + c);
titles.add("House " + c);
titles.add("Elections " + c);
titles.add("2006 " + c);
titles.add(c + " United");
titles.add(c + " States");
titles.add(c + " House");
titles.add(c + " Elections");
titles.add(c + " 2006");
}
List<IndexRequestBuilder> builders = new ArrayList<>();
for (String title : titles) {
builders.add(client().prepareIndex("test", "type1").setSource("title", title));
}
indexRandom(true, builders);
PhraseSuggestionBuilder suggest = phraseSuggestion("title").addCandidateGenerator(candidateGenerator("title").suggestMode("always").maxTermFreq(.99f).size(// Setting a silly high size helps of generate a larger list of candidates for testing.
1000).maxInspections(// This too
1000)).confidence(0f).maxErrors(2f).shardSize(30000).size(30000);
Suggest searchSuggest = searchSuggest("united states house of representatives elections in washington 2006", "title", suggest);
assertSuggestion(searchSuggest, 0, 0, "title", "united states house of representatives elections in washington 2006");
// Just to prove that we've run through a ton of options
assertSuggestionSize(searchSuggest, 0, 25480, "title");
suggest.size(1);
long start = System.currentTimeMillis();
searchSuggest = searchSuggest("united states house of representatives elections in washington 2006", "title", suggest);
long total = System.currentTimeMillis() - start;
assertSuggestion(searchSuggest, 0, 0, "title", "united states house of representatives elections in washington 2006");
// assertThat(total, lessThan(1000L)); // Takes many seconds without fix - just for debugging
}
use of org.elasticsearch.search.suggest.phrase.PhraseSuggestionBuilder in project elasticsearch by elastic.
the class SuggestSearchIT method testSizeParam.
public void testSizeParam() throws IOException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder().put(SETTING_NUMBER_OF_SHARDS, 1).put("index.analysis.analyzer.reverse.tokenizer", "standard").putArray("index.analysis.analyzer.reverse.filter", "lowercase", "reverse").put("index.analysis.analyzer.body.tokenizer", "standard").putArray("index.analysis.analyzer.body.filter", "lowercase").put("index.analysis.analyzer.bigram.tokenizer", "standard").putArray("index.analysis.analyzer.bigram.filter", "my_shingle", "lowercase").put("index.analysis.filter.my_shingle.type", "shingle").put("index.analysis.filter.my_shingle.output_unigrams", false).put("index.analysis.filter.my_shingle.min_shingle_size", 2).put("index.analysis.filter.my_shingle.max_shingle_size", 2));
XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties").startObject("body").field("type", "text").field("analyzer", "body").endObject().startObject("body_reverse").field("type", "text").field("analyzer", "reverse").endObject().startObject("bigram").field("type", "text").field("analyzer", "bigram").endObject().endObject().endObject().endObject();
assertAcked(builder.addMapping("type1", mapping));
ensureGreen();
String line = "xorr the god jewel";
index("test", "type1", "1", "body", line, "body_reverse", line, "bigram", line);
line = "I got it this time";
index("test", "type1", "2", "body", line, "body_reverse", line, "bigram", line);
refresh();
PhraseSuggestionBuilder phraseSuggestion = phraseSuggestion("bigram").realWordErrorLikelihood(0.95f).gramSize(2).analyzer("body").addCandidateGenerator(candidateGenerator("body").minWordLength(1).prefixLength(1).suggestMode("always").size(1).accuracy(0.1f)).smoothingModel(new StupidBackoff(0.1)).maxErrors(1.0f).size(5);
Suggest searchSuggest = searchSuggest("Xorr the Gut-Jewel", "simple_phrase", phraseSuggestion);
assertSuggestionSize(searchSuggest, 0, 0, "simple_phrase");
// we allow a size of 2 now on the shard generator level so "god" will be found since it's LD2
phraseSuggestion.clearCandidateGenerators().addCandidateGenerator(candidateGenerator("body").minWordLength(1).prefixLength(1).suggestMode("always").size(2).accuracy(0.1f));
searchSuggest = searchSuggest("Xorr the Gut-Jewel", "simple_phrase", phraseSuggestion);
assertSuggestion(searchSuggest, 0, "simple_phrase", "xorr the god jewel");
}
use of org.elasticsearch.search.suggest.phrase.PhraseSuggestionBuilder in project elasticsearch by elastic.
the class SuggestSearchIT method testPhraseSuggesterCollate.
public void testPhraseSuggesterCollate() throws InterruptedException, ExecutionException, IOException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder().put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, // A single shard will help to keep the tests repeatable.
1).put("index.analysis.analyzer.text.tokenizer", "standard").putArray("index.analysis.analyzer.text.filter", "lowercase", "my_shingle").put("index.analysis.filter.my_shingle.type", "shingle").put("index.analysis.filter.my_shingle.output_unigrams", true).put("index.analysis.filter.my_shingle.min_shingle_size", 2).put("index.analysis.filter.my_shingle.max_shingle_size", 3));
XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties").startObject("title").field("type", "text").field("analyzer", "text").endObject().endObject().endObject().endObject();
assertAcked(builder.addMapping("type1", mapping));
ensureGreen();
List<String> titles = new ArrayList<>();
titles.add("United States House of Representatives Elections in Washington 2006");
titles.add("United States House of Representatives Elections in Washington 2005");
titles.add("State");
titles.add("Houses of Parliament");
titles.add("Representative Government");
titles.add("Election");
List<IndexRequestBuilder> builders = new ArrayList<>();
for (String title : titles) {
builders.add(client().prepareIndex("test", "type1").setSource("title", title));
}
indexRandom(true, builders);
// suggest without collate
PhraseSuggestionBuilder suggest = phraseSuggestion("title").addCandidateGenerator(new DirectCandidateGeneratorBuilder("title").suggestMode("always").maxTermFreq(.99f).size(10).maxInspections(200)).confidence(0f).maxErrors(2f).shardSize(30000).size(10);
Suggest searchSuggest = searchSuggest("united states house of representatives elections in washington 2006", "title", suggest);
assertSuggestionSize(searchSuggest, 0, 10, "title");
// suggest with collate
String filterString = XContentFactory.jsonBuilder().startObject().startObject("match_phrase").field("{{field}}", "{{suggestion}}").endObject().endObject().string();
PhraseSuggestionBuilder filteredQuerySuggest = suggest.collateQuery(filterString);
filteredQuerySuggest.collateParams(Collections.singletonMap("field", "title"));
searchSuggest = searchSuggest("united states house of representatives elections in washington 2006", "title", filteredQuerySuggest);
assertSuggestionSize(searchSuggest, 0, 2, "title");
// collate suggest with no result (boundary case)
searchSuggest = searchSuggest("Elections of Representatives Parliament", "title", filteredQuerySuggest);
assertSuggestionSize(searchSuggest, 0, 0, "title");
NumShards numShards = getNumShards("test");
// collate suggest with bad query
String incorrectFilterString = XContentFactory.jsonBuilder().startObject().startObject("test").field("title", "{{suggestion}}").endObject().endObject().string();
PhraseSuggestionBuilder incorrectFilteredSuggest = suggest.collateQuery(incorrectFilterString);
Map<String, SuggestionBuilder<?>> namedSuggestion = new HashMap<>();
namedSuggestion.put("my_title_suggestion", incorrectFilteredSuggest);
try {
searchSuggest("united states house of representatives elections in washington 2006", numShards.numPrimaries, namedSuggestion);
fail("Post query error has been swallowed");
} catch (ElasticsearchException e) {
// expected
}
// suggest with collation
String filterStringAsFilter = XContentFactory.jsonBuilder().startObject().startObject("match_phrase").field("title", "{{suggestion}}").endObject().endObject().string();
PhraseSuggestionBuilder filteredFilterSuggest = suggest.collateQuery(filterStringAsFilter);
searchSuggest = searchSuggest("united states house of representatives elections in washington 2006", "title", filteredFilterSuggest);
assertSuggestionSize(searchSuggest, 0, 2, "title");
// collate suggest with bad query
String filterStr = XContentFactory.jsonBuilder().startObject().startObject("pprefix").field("title", "{{suggestion}}").endObject().endObject().string();
PhraseSuggestionBuilder in = suggest.collateQuery(filterStr);
try {
searchSuggest("united states house of representatives elections in washington 2006", numShards.numPrimaries, namedSuggestion);
fail("Post filter error has been swallowed");
} catch (ElasticsearchException e) {
//expected
}
// collate script failure due to no additional params
String collateWithParams = XContentFactory.jsonBuilder().startObject().startObject("{{query_type}}").field("{{query_field}}", "{{suggestion}}").endObject().endObject().string();
PhraseSuggestionBuilder phraseSuggestWithNoParams = suggest.collateQuery(collateWithParams);
try {
searchSuggest("united states house of representatives elections in washington 2006", numShards.numPrimaries, namedSuggestion);
fail("Malformed query (lack of additional params) should fail");
} catch (ElasticsearchException e) {
// expected
}
// collate script with additional params
Map<String, Object> params = new HashMap<>();
params.put("query_type", "match_phrase");
params.put("query_field", "title");
PhraseSuggestionBuilder phraseSuggestWithParams = suggest.collateQuery(collateWithParams).collateParams(params);
searchSuggest = searchSuggest("united states house of representatives elections in washington 2006", "title", phraseSuggestWithParams);
assertSuggestionSize(searchSuggest, 0, 2, "title");
// collate query request with prune set to true
PhraseSuggestionBuilder phraseSuggestWithParamsAndReturn = suggest.collateQuery(collateWithParams).collateParams(params).collatePrune(true);
searchSuggest = searchSuggest("united states house of representatives elections in washington 2006", "title", phraseSuggestWithParamsAndReturn);
assertSuggestionSize(searchSuggest, 0, 10, "title");
assertSuggestionPhraseCollateMatchExists(searchSuggest, "title", 2);
}
use of org.elasticsearch.search.suggest.phrase.PhraseSuggestionBuilder in project elasticsearch by elastic.
the class SuggestSearchIT method testUnmappedField.
public void testUnmappedField() throws IOException, InterruptedException, ExecutionException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder().put(indexSettings()).put("index.analysis.analyzer.biword.tokenizer", "standard").putArray("index.analysis.analyzer.biword.filter", "shingler", "lowercase").put("index.analysis.filter.shingler.type", "shingle").put("index.analysis.filter.shingler.min_shingle_size", 2).put("index.analysis.filter.shingler.max_shingle_size", 3));
XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties").startObject("name").field("type", "text").startObject("fields").startObject("shingled").field("type", "text").field("analyzer", "biword").field("search_analyzer", "standard").endObject().endObject().endObject().endObject().endObject().endObject();
assertAcked(builder.addMapping("type1", mapping));
ensureGreen();
indexRandom(true, client().prepareIndex("test", "type1").setSource("name", "I like iced tea"), client().prepareIndex("test", "type1").setSource("name", "I like tea."), client().prepareIndex("test", "type1").setSource("name", "I like ice cream."));
refresh();
PhraseSuggestionBuilder phraseSuggestion = phraseSuggestion("name.shingled").addCandidateGenerator(candidateGenerator("name").prefixLength(0).minWordLength(0).suggestMode("always").maxEdits(2)).gramSize(3);
Suggest searchSuggest = searchSuggest("ice tea", "did_you_mean", phraseSuggestion);
assertSuggestion(searchSuggest, 0, 0, "did_you_mean", "iced tea");
phraseSuggestion = phraseSuggestion("nosuchField").addCandidateGenerator(candidateGenerator("name").prefixLength(0).minWordLength(0).suggestMode("always").maxEdits(2)).gramSize(3);
{
SearchRequestBuilder searchBuilder = client().prepareSearch().setSize(0);
searchBuilder.suggest(new SuggestBuilder().setGlobalText("tetsting sugestion").addSuggestion("did_you_mean", phraseSuggestion));
assertThrows(searchBuilder, SearchPhaseExecutionException.class);
}
{
SearchRequestBuilder searchBuilder = client().prepareSearch().setSize(0);
searchBuilder.suggest(new SuggestBuilder().setGlobalText("tetsting sugestion").addSuggestion("did_you_mean", phraseSuggestion));
assertThrows(searchBuilder, SearchPhaseExecutionException.class);
}
}
use of org.elasticsearch.search.suggest.phrase.PhraseSuggestionBuilder in project elasticsearch by elastic.
the class SuggestSearchIT method testPhraseBoundaryCases.
public void testPhraseBoundaryCases() throws IOException, URISyntaxException {
CreateIndexRequestBuilder builder = prepareCreate("test").setSettings(Settings.builder().put(indexSettings()).put(SETTING_NUMBER_OF_SHARDS, // to get reliable statistics we should put this all into one shard
1).put("index.analysis.analyzer.body.tokenizer", "standard").putArray("index.analysis.analyzer.body.filter", "lowercase").put("index.analysis.analyzer.bigram.tokenizer", "standard").putArray("index.analysis.analyzer.bigram.filter", "my_shingle", "lowercase").put("index.analysis.analyzer.ngram.tokenizer", "standard").putArray("index.analysis.analyzer.ngram.filter", "my_shingle2", "lowercase").put("index.analysis.analyzer.myDefAnalyzer.tokenizer", "standard").putArray("index.analysis.analyzer.myDefAnalyzer.filter", "shingle", "lowercase").put("index.analysis.filter.my_shingle.type", "shingle").put("index.analysis.filter.my_shingle.output_unigrams", false).put("index.analysis.filter.my_shingle.min_shingle_size", 2).put("index.analysis.filter.my_shingle.max_shingle_size", 2).put("index.analysis.filter.my_shingle2.type", "shingle").put("index.analysis.filter.my_shingle2.output_unigrams", true).put("index.analysis.filter.my_shingle2.min_shingle_size", 2).put("index.analysis.filter.my_shingle2.max_shingle_size", 2));
XContentBuilder mapping = XContentFactory.jsonBuilder().startObject().startObject("type1").startObject("properties").startObject("body").field("type", "text").field("analyzer", "body").endObject().startObject("bigram").field("type", "text").field("analyzer", "bigram").endObject().startObject("ngram").field("type", "text").field("analyzer", "ngram").endObject().endObject().endObject().endObject();
assertAcked(builder.addMapping("type1", mapping));
ensureGreen();
String[] strings = new String[] { "Xorr the God-Jewel", "Grog the God-Crusher", "Xorn", "Walter Newell", "Wanda Maximoff", "Captain America", "American Ace", "Wundarr the Aquarian", "Will o' the Wisp", "Xemnu the Titan" };
for (String line : strings) {
index("test", "type1", line, "body", line, "bigram", line, "ngram", line);
}
refresh();
NumShards numShards = getNumShards("test");
// Lets make sure some things throw exceptions
PhraseSuggestionBuilder phraseSuggestion = phraseSuggestion("bigram").analyzer("body").addCandidateGenerator(candidateGenerator("does_not_exist").minWordLength(1).suggestMode("always")).realWordErrorLikelihood(0.95f).maxErrors(0.5f).size(1);
phraseSuggestion.clearCandidateGenerators().analyzer(null);
try {
searchSuggest("xor the got-jewel", numShards.numPrimaries, Collections.singletonMap("simple_phrase", phraseSuggestion));
fail("analyzer does only produce ngrams");
} catch (SearchPhaseExecutionException e) {
}
phraseSuggestion.analyzer("bigram");
try {
searchSuggest("xor the got-jewel", numShards.numPrimaries, Collections.singletonMap("simple_phrase", phraseSuggestion));
fail("analyzer does only produce ngrams");
} catch (SearchPhaseExecutionException e) {
}
// Now we'll make sure some things don't
phraseSuggestion.forceUnigrams(false);
searchSuggest("xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
// Field doesn't produce unigrams but the analyzer does
phraseSuggestion.forceUnigrams(true).analyzer("ngram");
searchSuggest("xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
phraseSuggestion = phraseSuggestion("ngram").analyzer("myDefAnalyzer").forceUnigrams(true).realWordErrorLikelihood(0.95f).maxErrors(0.5f).size(1).addCandidateGenerator(candidateGenerator("body").minWordLength(1).suggestMode("always"));
Suggest suggest = searchSuggest("xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
// "xorr the god jewel" and and "xorn the god jewel" have identical scores (we are only using unigrams to score), so we tie break by
// earlier term (xorn):
assertSuggestion(suggest, 0, "simple_phrase", "xorn the god jewel");
phraseSuggestion.analyzer(null);
suggest = searchSuggest("xor the got-jewel", 0, Collections.singletonMap("simple_phrase", phraseSuggestion));
// In this case xorr has a better score than xorn because we set the field back to the default (my_shingle2) analyzer, so the
// probability that the term is not in the dictionary but is NOT a misspelling is relatively high in this case compared to the
// others that have no n-gram with the other terms in the phrase :) you can set this realWorldErrorLikelyhood
assertSuggestion(suggest, 0, "simple_phrase", "xorr the god jewel");
}
Aggregations