use of org.apache.lucene.analysis.core.WhitespaceAnalyzer in project elasticsearch by elastic.
the class NoisyChannelSpellCheckerTests method testMultiGenerator.
public void testMultiGenerator() throws IOException {
RAMDirectory dir = new RAMDirectory();
Map<String, Analyzer> mapping = new HashMap<>();
mapping.put("body_ngram", new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
ShingleFilter tf = new ShingleFilter(t, 2, 3);
tf.setOutputUnigrams(false);
return new TokenStreamComponents(t, new LowerCaseFilter(tf));
}
});
mapping.put("body", new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
return new TokenStreamComponents(t, new LowerCaseFilter(t));
}
});
mapping.put("body_reverse", new Analyzer() {
@Override
protected TokenStreamComponents createComponents(String fieldName) {
Tokenizer t = new StandardTokenizer();
return new TokenStreamComponents(t, new ReverseStringFilter(new LowerCaseFilter(t)));
}
});
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
IndexWriterConfig conf = new IndexWriterConfig(wrapper);
IndexWriter writer = new IndexWriter(dir, conf);
String[] strings = new String[] { "Xorr the God-Jewel", "Grog the God-Crusher", "Xorn", "Walter Newell", "Wanda Maximoff", "Captain America", "American Ace", "Wundarr the Aquarian", "Will o' the Wisp", "Xemnu the Titan", "Fantastic Four", "Quasar", "Quasar II" };
for (String line : strings) {
Document doc = new Document();
doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
doc.add(new Field("body_reverse", line, TextField.TYPE_NOT_STORED));
doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
writer.addDocument(doc);
}
DirectoryReader ir = DirectoryReader.open(writer);
LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f);
NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
DirectSpellChecker spellchecker = new DirectSpellChecker();
spellchecker.setMinQueryLength(1);
DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10);
DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse"));
CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse);
Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
generator = new MultiCandidateGeneratorWrapper(5, forward, reverse);
corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
// only use forward with constant prefix
assertThat(corrections.length, equalTo(0));
corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections;
assertThat(corrections.length, equalTo(4));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
// Test a special case where one of the suggest term is unchanged by the postFilter, 'II' here is unchanged by the reverse analyzer.
corrections = suggester.getCorrections(wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
assertThat(corrections.length, equalTo(1));
assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("quasar ii"));
}
use of org.apache.lucene.analysis.core.WhitespaceAnalyzer in project elasticsearch by elastic.
the class SmoothingModelTestCase method testBuildWordScorer.
/**
* Test the WordScorer emitted by the smoothing model
*/
public void testBuildWordScorer() throws IOException {
SmoothingModel testModel = createTestModel();
Map<String, Analyzer> mapping = new HashMap<>();
mapping.put("field", new WhitespaceAnalyzer());
PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
IndexWriter writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(wrapper));
Document doc = new Document();
doc.add(new Field("field", "someText", TextField.TYPE_NOT_STORED));
writer.addDocument(doc);
DirectoryReader ir = DirectoryReader.open(writer);
WordScorer wordScorer = testModel.buildWordScorerFactory().newScorer(ir, MultiFields.getTerms(ir, "field"), "field", 0.9d, BytesRefs.toBytesRef(" "));
assertWordScorer(wordScorer, testModel);
}
use of org.apache.lucene.analysis.core.WhitespaceAnalyzer in project elasticsearch by elastic.
the class CandidateQueryTests method testDuel.
public void testDuel() throws Exception {
List<Function<String, Query>> queryFunctions = new ArrayList<>();
queryFunctions.add((id) -> new PrefixQuery(new Term("field", id)));
queryFunctions.add((id) -> new WildcardQuery(new Term("field", id + "*")));
queryFunctions.add((id) -> new CustomQuery(new Term("field", id)));
queryFunctions.add((id) -> new SpanTermQuery(new Term("field", id)));
queryFunctions.add((id) -> new TermQuery(new Term("field", id)));
queryFunctions.add((id) -> {
BooleanQuery.Builder builder = new BooleanQuery.Builder();
return builder.build();
});
queryFunctions.add((id) -> {
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(new TermQuery(new Term("field", id)), BooleanClause.Occur.MUST);
if (randomBoolean()) {
builder.add(new MatchNoDocsQuery("no reason"), BooleanClause.Occur.MUST_NOT);
}
if (randomBoolean()) {
builder.add(new CustomQuery(new Term("field", id)), BooleanClause.Occur.MUST);
}
return builder.build();
});
queryFunctions.add((id) -> {
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(new TermQuery(new Term("field", id)), BooleanClause.Occur.SHOULD);
if (randomBoolean()) {
builder.add(new MatchNoDocsQuery("no reason"), BooleanClause.Occur.MUST_NOT);
}
if (randomBoolean()) {
builder.add(new CustomQuery(new Term("field", id)), BooleanClause.Occur.SHOULD);
}
return builder.build();
});
queryFunctions.add((id) -> {
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST);
builder.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST);
if (randomBoolean()) {
builder.add(new MatchNoDocsQuery("no reason"), BooleanClause.Occur.MUST_NOT);
}
return builder.build();
});
queryFunctions.add((id) -> {
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
builder.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
if (randomBoolean()) {
builder.add(new MatchNoDocsQuery("no reason"), BooleanClause.Occur.MUST_NOT);
}
return builder.build();
});
queryFunctions.add((id) -> {
BooleanQuery.Builder builder = new BooleanQuery.Builder();
builder.setMinimumNumberShouldMatch(randomIntBetween(0, 4));
builder.add(new TermQuery(new Term("field", id)), BooleanClause.Occur.SHOULD);
builder.add(new CustomQuery(new Term("field", id)), BooleanClause.Occur.SHOULD);
return builder.build();
});
queryFunctions.add((id) -> new MatchAllDocsQuery());
queryFunctions.add((id) -> new MatchNoDocsQuery("no reason at all"));
int numDocs = randomIntBetween(queryFunctions.size(), queryFunctions.size() * 3);
List<ParseContext.Document> documents = new ArrayList<>();
for (int i = 0; i < numDocs; i++) {
String id = Integer.toString(i);
Query query = queryFunctions.get(i % queryFunctions.size()).apply(id);
addQuery(query, documents);
}
indexWriter.addDocuments(documents);
indexWriter.close();
directoryReader = DirectoryReader.open(directory);
IndexSearcher shardSearcher = newSearcher(directoryReader);
// Disable query cache, because ControlQuery cannot be cached...
shardSearcher.setQueryCache(null);
for (int i = 0; i < numDocs; i++) {
String id = Integer.toString(i);
Iterable<? extends IndexableField> doc = Collections.singleton(new StringField("field", id, Field.Store.NO));
MemoryIndex memoryIndex = MemoryIndex.fromDocument(doc, new WhitespaceAnalyzer());
duelRun(queryStore, memoryIndex, shardSearcher);
}
Iterable<? extends IndexableField> doc = Collections.singleton(new StringField("field", "value", Field.Store.NO));
MemoryIndex memoryIndex = MemoryIndex.fromDocument(doc, new WhitespaceAnalyzer());
duelRun(queryStore, memoryIndex, shardSearcher);
// Empty percolator doc:
memoryIndex = new MemoryIndex();
duelRun(queryStore, memoryIndex, shardSearcher);
}
use of org.apache.lucene.analysis.core.WhitespaceAnalyzer in project elasticsearch by elastic.
the class PercolateQueryBuilderTests method testCreateMultiDocumentSearcher.
public void testCreateMultiDocumentSearcher() throws Exception {
int numDocs = randomIntBetween(2, 8);
List<ParseContext.Document> docs = new ArrayList<>(numDocs);
for (int i = 0; i < numDocs; i++) {
docs.add(new ParseContext.Document());
}
Analyzer analyzer = new WhitespaceAnalyzer();
ParsedDocument parsedDocument = new ParsedDocument(null, null, "_id", "_type", null, docs, null, null, null);
IndexSearcher indexSearcher = PercolateQueryBuilder.createMultiDocumentSearcher(analyzer, parsedDocument);
assertThat(indexSearcher.getIndexReader().numDocs(), equalTo(numDocs));
// ensure that any query get modified so that the nested docs are never included as hits:
Query query = new MatchAllDocsQuery();
BooleanQuery result = (BooleanQuery) indexSearcher.createNormalizedWeight(query, true).getQuery();
assertThat(result.clauses().size(), equalTo(2));
assertThat(result.clauses().get(0).getQuery(), sameInstance(query));
assertThat(result.clauses().get(0).getOccur(), equalTo(BooleanClause.Occur.MUST));
assertThat(result.clauses().get(1).getOccur(), equalTo(BooleanClause.Occur.MUST_NOT));
}
use of org.apache.lucene.analysis.core.WhitespaceAnalyzer in project elasticsearch by elastic.
the class PercolateQueryTests method init.
@Before
public void init() throws Exception {
directory = newDirectory();
IndexWriterConfig config = new IndexWriterConfig(new WhitespaceAnalyzer());
config.setMergePolicy(NoMergePolicy.INSTANCE);
indexWriter = new IndexWriter(directory, config);
}
Aggregations