use of org.apache.lucene.search.spell.DirectSpellChecker in project elasticsearch by elastic.
the class TermSuggester method innerExecute.
@Override
public TermSuggestion innerExecute(String name, TermSuggestionContext suggestion, IndexSearcher searcher, CharsRefBuilder spare) throws IOException {
DirectSpellChecker directSpellChecker = suggestion.getDirectSpellCheckerSettings().createDirectSpellChecker();
final IndexReader indexReader = searcher.getIndexReader();
TermSuggestion response = new TermSuggestion(name, suggestion.getSize(), suggestion.getDirectSpellCheckerSettings().sort());
List<Token> tokens = queryTerms(suggestion, spare);
for (Token token : tokens) {
// TODO: Extend DirectSpellChecker in 4.1, to get the raw suggested words as BytesRef
SuggestWord[] suggestedWords = directSpellChecker.suggestSimilar(token.term, suggestion.getShardSize(), indexReader, suggestion.getDirectSpellCheckerSettings().suggestMode());
Text key = new Text(new BytesArray(token.term.bytes()));
TermSuggestion.Entry resultEntry = new TermSuggestion.Entry(key, token.startOffset, token.endOffset - token.startOffset);
for (SuggestWord suggestWord : suggestedWords) {
Text word = new Text(suggestWord.string);
resultEntry.addOption(new TermSuggestion.Entry.Option(word, suggestWord.freq, suggestWord.score));
}
response.addTerm(resultEntry);
}
return response;
}
use of org.apache.lucene.search.spell.DirectSpellChecker in project elasticsearch by elastic.
the class DirectSpellcheckerSettings method createDirectSpellChecker.
public DirectSpellChecker createDirectSpellChecker() {
DirectSpellChecker directSpellChecker = new DirectSpellChecker();
directSpellChecker.setAccuracy(accuracy());
Comparator<SuggestWord> comparator;
switch(sort()) {
case SCORE:
comparator = SCORE_COMPARATOR;
break;
case FREQUENCY:
comparator = LUCENE_FREQUENCY;
break;
default:
throw new IllegalArgumentException("Illegal suggest sort: " + sort());
}
directSpellChecker.setComparator(comparator);
directSpellChecker.setDistance(stringDistance());
directSpellChecker.setMaxEdits(maxEdits());
directSpellChecker.setMaxInspections(maxInspections());
directSpellChecker.setMaxQueryFrequency(maxTermFreq());
directSpellChecker.setMinPrefix(prefixLength());
directSpellChecker.setMinQueryLength(minWordLength());
directSpellChecker.setThresholdFrequency(minDocFreq());
directSpellChecker.setLowerCaseTerms(false);
return directSpellChecker;
}
use of org.apache.lucene.search.spell.DirectSpellChecker in project elasticsearch by elastic.
the class NoisyChannelSpellCheckerTests method testMultiGenerator.
public void testMultiGenerator() throws IOException {
RAMDirectory dir = new RAMDirectory();
Map<String, Analyzer> mapping = new HashMap<>();
mapping.put("body_ngram", new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
ShingleFilter tf = new ShingleFilter(t, 2, 3);
tf.setOutputUnigrams(false);
return new TokenStreamComponents(t, new LowerCaseFilter(tf));
}
});
mapping.put("body", new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
return new TokenStreamComponents(t, new LowerCaseFilter(t));
}
});
mapping.put("body_reverse", new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
return new TokenStreamComponents(t, new ReverseStringFilter(new LowerCaseFilter(t)));
}
});
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
IndexWriterConfig conf = new IndexWriterConfig(wrapper);
IndexWriter writer = new IndexWriter(dir, conf);
String[] strings = new String[] { "Xorr the God-Jewel", "Grog the God-Crusher", "Xorn", "Walter Newell", "Wanda Maximoff", "Captain America", "American Ace", "Wundarr the Aquarian", "Will o' the Wisp", "Xemnu the Titan", "Fantastic Four", "Quasar", "Quasar II" };
for (String line : strings) {
Document doc = new Document();
doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
doc.add(new Field("body_reverse", line, TextField.TYPE_NOT_STORED));
doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
writer.addDocument(doc);
}
DirectoryReader ir = DirectoryReader.open(writer);
LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f);
NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
DirectSpellChecker spellchecker = new DirectSpellChecker();
spellchecker.setMinQueryLength(1);
DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10);
DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse"));
CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse);
Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
generator = new MultiCandidateGeneratorWrapper(5, forward, reverse);
corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
// only use forward with constant prefix
assertThat(corrections.length, equalTo(0));
corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections;
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
// Test a special case where one of the suggest term is unchanged by the postFilter, 'II' here is unchanged by the reverse analyzer.
corrections = suggester.getCorrections(wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("quasar ii"));
}
use of org.apache.lucene.search.spell.DirectSpellChecker in project OpenGrok by OpenGrok.
the class SearchHelper method prepareExec.
/**
* Create the searcher to use w.r.t. currently set parameters and the given
* projects. Does not produce any {@link #redirect} link. It also does
* nothing if {@link #redirect} or {@link #errorMsg} have a
* none-{@code null} value.
* <p>
* Parameters which should be populated/set at this time:
* <ul>
* <li>{@link #builder}</li> <li>{@link #dataRoot}</li>
* <li>{@link #order} (falls back to relevance if unset)</li>
* <li>{@link #parallel} (default: false)</li> </ul> Populates/sets: <ul>
* <li>{@link #query}</li> <li>{@link #searcher}</li> <li>{@link #sort}</li>
* <li>{@link #projects}</li> <li>{@link #errorMsg} if an error occurs</li>
* </ul>
*
* @param projects project names. If empty, a no-project setup
* is assumed (i.e. DATA_ROOT/index will be used instead of possible
* multiple DATA_ROOT/$project/index). If the set contains projects
* not known in the configuration or projects not yet indexed,
* an error will be returned in {@link #errorMsg}.
* @return this instance
*/
public SearchHelper prepareExec(SortedSet<String> projects) {
if (redirect != null || errorMsg != null) {
return this;
}
// the Query created by the QueryBuilder
try {
indexDir = new File(dataRoot, IndexDatabase.INDEX_DIR);
query = builder.build();
if (projects == null) {
errorMsg = "No project selected!";
return this;
}
this.projects = projects;
if (projects.isEmpty()) {
// no project setup
FSDirectory dir = FSDirectory.open(indexDir.toPath());
searcher = new IndexSearcher(DirectoryReader.open(dir));
closeOnDestroy = true;
} else {
// Check list of project names first to make sure all of them
// are valid and indexed.
closeOnDestroy = false;
Set<String> invalidProjects = projects.stream().filter(proj -> (Project.getByName(proj) == null)).collect(Collectors.toSet());
if (invalidProjects.size() > 0) {
errorMsg = "Project list contains invalid projects: " + String.join(", ", invalidProjects);
return this;
}
Set<Project> notIndexedProjects = projects.stream().map(x -> Project.getByName(x)).filter(proj -> !proj.isIndexed()).collect(Collectors.toSet());
if (notIndexedProjects.size() > 0) {
errorMsg = "Some of the projects to be searched are not indexed yet: " + String.join(", ", notIndexedProjects.stream().map(proj -> proj.getName()).collect(Collectors.toSet()));
return this;
}
// We use MultiReader even for single project. This should
// not matter given that MultiReader is just a cheap wrapper
// around set of IndexReader objects.
MultiReader multireader = RuntimeEnvironment.getInstance().getMultiReader(projects, searcherList);
if (multireader != null) {
searcher = new IndexSearcher(multireader);
} else {
errorMsg = "Failed to initialize search. Check the index.";
return this;
}
}
// Most probably they are not reused. SearcherLifetimeManager might help here.
switch(order) {
case LASTMODIFIED:
sort = new Sort(new SortField(QueryBuilder.DATE, SortField.Type.STRING, true));
break;
case BY_PATH:
sort = new Sort(new SortField(QueryBuilder.FULLPATH, SortField.Type.STRING));
break;
default:
sort = Sort.RELEVANCE;
break;
}
checker = new DirectSpellChecker();
} catch (ParseException e) {
errorMsg = PARSE_ERROR_MSG + e.getMessage();
} catch (FileNotFoundException e) {
// errorMsg = "Index database(s) not found: " + e.getMessage();
errorMsg = "Index database(s) not found.";
} catch (IOException e) {
errorMsg = e.getMessage();
}
return this;
}
use of org.apache.lucene.search.spell.DirectSpellChecker in project OpenGrok by OpenGrok.
the class SearchHelper method prepareExec.
/**
* Create the searcher to use w.r.t. currently set parameters and the given
* projects. Does not produce any {@link #redirect} link. It also does
* nothing if {@link #redirect} or {@link #errorMsg} have a
* none-{@code null} value.
* <p>
* Parameters which should be populated/set at this time:
* <ul>
* <li>{@link #builder}</li> <li>{@link #dataRoot}</li>
* <li>{@link #order} (falls back to relevance if unset)</li>
* </ul>
* Populates/sets:
* <ul>
* <li>{@link #query}</li> <li>{@link #searcher}</li> <li>{@link #sort}</li>
* <li>{@link #projects}</li> <li>{@link #errorMsg} if an error occurs</li>
* </ul>
*
* @param projects project names. If empty, a no-project setup
* is assumed (i.e. DATA_ROOT/index will be used instead of possible
* multiple DATA_ROOT/$project/index). If the set contains projects
* not known in the configuration or projects not yet indexed,
* an error will be returned in {@link #errorMsg}.
* @return this instance
*/
public SearchHelper prepareExec(SortedSet<String> projects) {
if (redirect != null || errorMsg != null) {
return this;
}
settingsHelper = null;
// the Query created by the QueryBuilder
try {
indexDir = new File(dataRoot, IndexDatabase.INDEX_DIR);
query = builder.build();
if (projects == null) {
errorMsg = "No project selected!";
return this;
}
this.projects = projects;
if (projects.isEmpty()) {
// no project setup
FSDirectory dir = FSDirectory.open(indexDir.toPath());
reader = DirectoryReader.open(dir);
searcher = new IndexSearcher(reader);
closeOnDestroy = true;
} else {
// Check list of project names first to make sure all of them
// are valid and indexed.
closeOnDestroy = false;
Set<String> invalidProjects = projects.stream().filter(proj -> (Project.getByName(proj) == null)).collect(Collectors.toSet());
if (!invalidProjects.isEmpty()) {
errorMsg = "Project list contains invalid projects: " + String.join(", ", invalidProjects);
return this;
}
Set<Project> notIndexedProjects = projects.stream().map(Project::getByName).filter(proj -> !proj.isIndexed()).collect(Collectors.toSet());
if (!notIndexedProjects.isEmpty()) {
errorMsg = "Some of the projects to be searched are not indexed yet: " + String.join(", ", notIndexedProjects.stream().map(Project::getName).collect(Collectors.toSet()));
return this;
}
// We use MultiReader even for single project. This should
// not matter given that MultiReader is just a cheap wrapper
// around set of IndexReader objects.
reader = RuntimeEnvironment.getInstance().getMultiReader(projects, searcherList);
if (reader != null) {
searcher = new IndexSearcher(reader);
} else {
errorMsg = "Failed to initialize search. Check the index";
if (!projects.isEmpty()) {
errorMsg += " for projects: " + String.join(", ", projects);
}
return this;
}
}
// Most probably they are not reused. SearcherLifetimeManager might help here.
switch(order) {
case LASTMODIFIED:
sort = new Sort(new SortField(QueryBuilder.DATE, SortField.Type.STRING, true));
break;
case BY_PATH:
sort = new Sort(new SortField(QueryBuilder.FULLPATH, SortField.Type.STRING));
break;
default:
sort = Sort.RELEVANCE;
break;
}
checker = new DirectSpellChecker();
} catch (ParseException e) {
errorMsg = PARSE_ERROR_MSG + e.getMessage();
} catch (FileNotFoundException e) {
errorMsg = "Index database not found. Check the index";
if (!projects.isEmpty()) {
errorMsg += " for projects: " + String.join(", ", projects);
}
errorMsg += "; " + e.getMessage();
} catch (IOException e) {
errorMsg = e.getMessage();
}
return this;
}
Aggregations