Search in sources :

Example 1 with WhitespaceAnalyzer

use of org.apache.lucene.analysis.core.WhitespaceAnalyzer in project elasticsearch by elastic.

the class NoisyChannelSpellCheckerTests method testMultiGenerator.

public void testMultiGenerator() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    Map<String, Analyzer> mapping = new HashMap<>();
    mapping.put("body_ngram", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false);
            return new TokenStreamComponents(t, new LowerCaseFilter(tf));
        }
    });
    mapping.put("body", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            return new TokenStreamComponents(t, new LowerCaseFilter(t));
        }
    });
    mapping.put("body_reverse", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            return new TokenStreamComponents(t, new ReverseStringFilter(new LowerCaseFilter(t)));
        }
    });
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
    IndexWriterConfig conf = new IndexWriterConfig(wrapper);
    IndexWriter writer = new IndexWriter(dir, conf);
    String[] strings = new String[] { "Xorr the God-Jewel", "Grog the God-Crusher", "Xorn", "Walter Newell", "Wanda Maximoff", "Captain America", "American Ace", "Wundarr the Aquarian", "Will o' the Wisp", "Xemnu the Titan", "Fantastic Four", "Quasar", "Quasar II" };
    for (String line : strings) {
        Document doc = new Document();
        doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
        doc.add(new Field("body_reverse", line, TextField.TYPE_NOT_STORED));
        doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
        writer.addDocument(doc);
    }
    DirectoryReader ir = DirectoryReader.open(writer);
    LaplaceScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f);
    NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
    DirectSpellChecker spellchecker = new DirectSpellChecker();
    spellchecker.setMinQueryLength(1);
    DirectCandidateGenerator forward = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10);
    DirectCandidateGenerator reverse = new DirectCandidateGenerator(spellchecker, "body_reverse", SuggestMode.SUGGEST_ALWAYS, ir, 0.95, 10, wrapper, wrapper, MultiFields.getTerms(ir, "body_reverse"));
    CandidateGenerator generator = new MultiCandidateGeneratorWrapper(10, forward, reverse);
    Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
    generator = new MultiCandidateGeneratorWrapper(5, forward, reverse);
    corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("american cae"), forward, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
    // only use forward with constant prefix
    assertThat(corrections.length, equalTo(0));
    corrections = suggester.getCorrections(wrapper, new BytesRef("america cae"), generator, 2, 1, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("zorr the god jewel"));
    assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("four the god jewel"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("Zorr the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 1.5f, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    // Test a special case where one of the suggest term is unchanged by the postFilter, 'II' here is unchanged by the reverse analyzer.
    corrections = suggester.getCorrections(wrapper, new BytesRef("Quazar II"), generator, 1, 1, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("quasar ii"));
}
Also used : HashMap(java.util.HashMap) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) ShingleFilter(org.apache.lucene.analysis.shingle.ShingleFilter) ReverseStringFilter(org.apache.lucene.analysis.reverse.ReverseStringFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) DirectSpellChecker(org.apache.lucene.search.spell.DirectSpellChecker) BytesRef(org.apache.lucene.util.BytesRef) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) DirectoryReader(org.apache.lucene.index.DirectoryReader) RAMDirectory(org.apache.lucene.store.RAMDirectory) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) IndexWriter(org.apache.lucene.index.IndexWriter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 2 with WhitespaceAnalyzer

use of org.apache.lucene.analysis.core.WhitespaceAnalyzer in project elasticsearch by elastic.

the class SmoothingModelTestCase method testBuildWordScorer.

/**
     * Test the WordScorer emitted by the smoothing model
     */
public void testBuildWordScorer() throws IOException {
    SmoothingModel testModel = createTestModel();
    Map<String, Analyzer> mapping = new HashMap<>();
    mapping.put("field", new WhitespaceAnalyzer());
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
    IndexWriter writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(wrapper));
    Document doc = new Document();
    doc.add(new Field("field", "someText", TextField.TYPE_NOT_STORED));
    writer.addDocument(doc);
    DirectoryReader ir = DirectoryReader.open(writer);
    WordScorer wordScorer = testModel.buildWordScorerFactory().newScorer(ir, MultiFields.getTerms(ir, "field"), "field", 0.9d, BytesRefs.toBytesRef(" "));
    assertWordScorer(wordScorer, testModel);
}
Also used : WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) HashMap(java.util.HashMap) DirectoryReader(org.apache.lucene.index.DirectoryReader) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) RAMDirectory(org.apache.lucene.store.RAMDirectory) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 3 with WhitespaceAnalyzer

use of org.apache.lucene.analysis.core.WhitespaceAnalyzer in project elasticsearch by elastic.

the class CandidateQueryTests method testDuel.

public void testDuel() throws Exception {
    List<Function<String, Query>> queryFunctions = new ArrayList<>();
    queryFunctions.add((id) -> new PrefixQuery(new Term("field", id)));
    queryFunctions.add((id) -> new WildcardQuery(new Term("field", id + "*")));
    queryFunctions.add((id) -> new CustomQuery(new Term("field", id)));
    queryFunctions.add((id) -> new SpanTermQuery(new Term("field", id)));
    queryFunctions.add((id) -> new TermQuery(new Term("field", id)));
    queryFunctions.add((id) -> {
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        return builder.build();
    });
    queryFunctions.add((id) -> {
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.add(new TermQuery(new Term("field", id)), BooleanClause.Occur.MUST);
        if (randomBoolean()) {
            builder.add(new MatchNoDocsQuery("no reason"), BooleanClause.Occur.MUST_NOT);
        }
        if (randomBoolean()) {
            builder.add(new CustomQuery(new Term("field", id)), BooleanClause.Occur.MUST);
        }
        return builder.build();
    });
    queryFunctions.add((id) -> {
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.add(new TermQuery(new Term("field", id)), BooleanClause.Occur.SHOULD);
        if (randomBoolean()) {
            builder.add(new MatchNoDocsQuery("no reason"), BooleanClause.Occur.MUST_NOT);
        }
        if (randomBoolean()) {
            builder.add(new CustomQuery(new Term("field", id)), BooleanClause.Occur.SHOULD);
        }
        return builder.build();
    });
    queryFunctions.add((id) -> {
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST);
        builder.add(new MatchAllDocsQuery(), BooleanClause.Occur.MUST);
        if (randomBoolean()) {
            builder.add(new MatchNoDocsQuery("no reason"), BooleanClause.Occur.MUST_NOT);
        }
        return builder.build();
    });
    queryFunctions.add((id) -> {
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
        builder.add(new MatchAllDocsQuery(), BooleanClause.Occur.SHOULD);
        if (randomBoolean()) {
            builder.add(new MatchNoDocsQuery("no reason"), BooleanClause.Occur.MUST_NOT);
        }
        return builder.build();
    });
    queryFunctions.add((id) -> {
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.setMinimumNumberShouldMatch(randomIntBetween(0, 4));
        builder.add(new TermQuery(new Term("field", id)), BooleanClause.Occur.SHOULD);
        builder.add(new CustomQuery(new Term("field", id)), BooleanClause.Occur.SHOULD);
        return builder.build();
    });
    queryFunctions.add((id) -> new MatchAllDocsQuery());
    queryFunctions.add((id) -> new MatchNoDocsQuery("no reason at all"));
    int numDocs = randomIntBetween(queryFunctions.size(), queryFunctions.size() * 3);
    List<ParseContext.Document> documents = new ArrayList<>();
    for (int i = 0; i < numDocs; i++) {
        String id = Integer.toString(i);
        Query query = queryFunctions.get(i % queryFunctions.size()).apply(id);
        addQuery(query, documents);
    }
    indexWriter.addDocuments(documents);
    indexWriter.close();
    directoryReader = DirectoryReader.open(directory);
    IndexSearcher shardSearcher = newSearcher(directoryReader);
    // Disable query cache, because ControlQuery cannot be cached...
    shardSearcher.setQueryCache(null);
    for (int i = 0; i < numDocs; i++) {
        String id = Integer.toString(i);
        Iterable<? extends IndexableField> doc = Collections.singleton(new StringField("field", id, Field.Store.NO));
        MemoryIndex memoryIndex = MemoryIndex.fromDocument(doc, new WhitespaceAnalyzer());
        duelRun(queryStore, memoryIndex, shardSearcher);
    }
    Iterable<? extends IndexableField> doc = Collections.singleton(new StringField("field", "value", Field.Store.NO));
    MemoryIndex memoryIndex = MemoryIndex.fromDocument(doc, new WhitespaceAnalyzer());
    duelRun(queryStore, memoryIndex, shardSearcher);
    // Empty percolator doc:
    memoryIndex = new MemoryIndex();
    duelRun(queryStore, memoryIndex, shardSearcher);
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) WildcardQuery(org.apache.lucene.search.WildcardQuery) BlendedTermQuery(org.apache.lucene.queries.BlendedTermQuery) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) Query(org.apache.lucene.search.Query) MatchNoDocsQuery(org.apache.lucene.search.MatchNoDocsQuery) CommonTermsQuery(org.apache.lucene.queries.CommonTermsQuery) BlendedTermQuery(org.apache.lucene.queries.BlendedTermQuery) PrefixQuery(org.apache.lucene.search.PrefixQuery) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) WildcardQuery(org.apache.lucene.search.WildcardQuery) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) SpanNotQuery(org.apache.lucene.search.spans.SpanNotQuery) ConstantScoreQuery(org.apache.lucene.search.ConstantScoreQuery) SpanNearQuery(org.apache.lucene.search.spans.SpanNearQuery) TermQuery(org.apache.lucene.search.TermQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) SpanOrQuery(org.apache.lucene.search.spans.SpanOrQuery) MatchNoDocsQuery(org.apache.lucene.search.MatchNoDocsQuery) ArrayList(java.util.ArrayList) Term(org.apache.lucene.index.Term) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) Document(org.apache.lucene.document.Document) LongPoint(org.apache.lucene.document.LongPoint) CheckedFunction(org.elasticsearch.common.CheckedFunction) Function(java.util.function.Function) MemoryIndex(org.apache.lucene.index.memory.MemoryIndex) PrefixQuery(org.apache.lucene.search.PrefixQuery) SpanTermQuery(org.apache.lucene.search.spans.SpanTermQuery) StringField(org.apache.lucene.document.StringField)

Example 4 with WhitespaceAnalyzer

use of org.apache.lucene.analysis.core.WhitespaceAnalyzer in project elasticsearch by elastic.

the class PercolateQueryBuilderTests method testCreateMultiDocumentSearcher.

public void testCreateMultiDocumentSearcher() throws Exception {
    int numDocs = randomIntBetween(2, 8);
    List<ParseContext.Document> docs = new ArrayList<>(numDocs);
    for (int i = 0; i < numDocs; i++) {
        docs.add(new ParseContext.Document());
    }
    Analyzer analyzer = new WhitespaceAnalyzer();
    ParsedDocument parsedDocument = new ParsedDocument(null, null, "_id", "_type", null, docs, null, null, null);
    IndexSearcher indexSearcher = PercolateQueryBuilder.createMultiDocumentSearcher(analyzer, parsedDocument);
    assertThat(indexSearcher.getIndexReader().numDocs(), equalTo(numDocs));
    // ensure that any query get modified so that the nested docs are never included as hits:
    Query query = new MatchAllDocsQuery();
    BooleanQuery result = (BooleanQuery) indexSearcher.createNormalizedWeight(query, true).getQuery();
    assertThat(result.clauses().size(), equalTo(2));
    assertThat(result.clauses().get(0).getQuery(), sameInstance(query));
    assertThat(result.clauses().get(0).getOccur(), equalTo(BooleanClause.Occur.MUST));
    assertThat(result.clauses().get(1).getOccur(), equalTo(BooleanClause.Occur.MUST_NOT));
}
Also used : WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) IndexSearcher(org.apache.lucene.search.IndexSearcher) BooleanQuery(org.apache.lucene.search.BooleanQuery) Query(org.apache.lucene.search.Query) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) BooleanQuery(org.apache.lucene.search.BooleanQuery) ArrayList(java.util.ArrayList) ParsedDocument(org.elasticsearch.index.mapper.ParsedDocument) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) MatchAllDocsQuery(org.apache.lucene.search.MatchAllDocsQuery) ParsedDocument(org.elasticsearch.index.mapper.ParsedDocument) ParseContext(org.elasticsearch.index.mapper.ParseContext)

Example 5 with WhitespaceAnalyzer

use of org.apache.lucene.analysis.core.WhitespaceAnalyzer in project elasticsearch by elastic.

the class PercolateQueryTests method init.

@Before
public void init() throws Exception {
    directory = newDirectory();
    IndexWriterConfig config = new IndexWriterConfig(new WhitespaceAnalyzer());
    config.setMergePolicy(NoMergePolicy.INSTANCE);
    indexWriter = new IndexWriter(directory, config);
}
Also used : WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) Before(org.junit.Before)

Aggregations

WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer)40 IndexWriter (org.apache.lucene.index.IndexWriter)18 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)18 Document (org.apache.lucene.document.Document)16 Analyzer (org.apache.lucene.analysis.Analyzer)11 Test (org.junit.Test)9 ArrayList (java.util.ArrayList)8 NamedList (org.apache.solr.common.util.NamedList)8 IOException (java.io.IOException)7 Token (org.apache.lucene.analysis.Token)7 TextField (org.apache.lucene.document.TextField)7 HashMap (java.util.HashMap)6 IndexSearcher (org.apache.lucene.search.IndexSearcher)6 Field (org.apache.lucene.document.Field)5 DirectoryTaxonomyWriter (org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter)5 DirectoryReader (org.apache.lucene.index.DirectoryReader)5 File (java.io.File)4 LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)4 TokenStream (org.apache.lucene.analysis.TokenStream)4 PerFieldAnalyzerWrapper (org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper)4