Search in sources :

Example 1 with SynonymFilter

use of org.apache.lucene.analysis.synonym.SynonymFilter in project elasticsearch by elastic.

the class NoisyChannelSpellCheckerTests method testNgram.

public void testNgram() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    Map<String, Analyzer> mapping = new HashMap<>();
    mapping.put("body_ngram", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false);
            return new TokenStreamComponents(t, new LowerCaseFilter(tf));
        }
    });
    mapping.put("body", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            return new TokenStreamComponents(t, new LowerCaseFilter(t));
        }
    });
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
    IndexWriterConfig conf = new IndexWriterConfig(wrapper);
    IndexWriter writer = new IndexWriter(dir, conf);
    String[] strings = new String[] { "Xorr the God-Jewel", "Grog the God-Crusher", "Xorn", "Walter Newell", "Wanda Maximoff", "Captain America", "American Ace", "USA Hero", "Wundarr the Aquarian", "Will o' the Wisp", "Xemnu the Titan", "Fantastic Four", "Quasar", "Quasar II" };
    for (String line : strings) {
        Document doc = new Document();
        doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
        doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
        writer.addDocument(doc);
    }
    DirectoryReader ir = DirectoryReader.open(writer);
    WordScorer wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5f);
    NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
    DirectSpellChecker spellchecker = new DirectSpellChecker();
    spellchecker.setMinQueryLength(1);
    DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
    Result result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 2);
    Correction[] corrections = result.corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ace"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american <em>ace</em>"));
    assertThat(result.cutoffScore, greaterThan(0d));
    result = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 0, 1);
    corrections = result.corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("american ame"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("american ame"));
    assertThat(result.cutoffScore, equalTo(Double.MIN_VALUE));
    suggester = new NoisyChannelSpellChecker(0.85);
    wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 2).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr</em> the <em>god</em> jewel"));
    assertThat(corrections[1].join(space, preTag, postTag).utf8ToString(), equalTo("xor the <em>god</em> jewel"));
    assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorn</em> the <em>god</em> jewel"));
    assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr</em> the got jewel"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(space).utf8ToString(), equalTo("xorr the got jewel"));
    // Test some of the highlighting corner cases
    suggester = new NoisyChannelSpellChecker(0.85);
    wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor teh Got-Jewel"), generator, 4f, 4, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(space).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(space).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(space).utf8ToString(), equalTo("xor teh god jewel"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorr the god</em> jewel"));
    assertThat(corrections[1].join(space, preTag, postTag).utf8ToString(), equalTo("xor <em>the god</em> jewel"));
    assertThat(corrections[2].join(space, preTag, postTag).utf8ToString(), equalTo("<em>xorn the god</em> jewel"));
    assertThat(corrections[3].join(space, preTag, postTag).utf8ToString(), equalTo("xor teh <em>god</em> jewel"));
    // test synonyms
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            TokenFilter filter = new LowerCaseFilter(t);
            try {
                SolrSynonymParser parser = new SolrSynonymParser(true, false, new WhitespaceAnalyzer());
                parser.parse(new StringReader("usa => usa, america, american"));
                filter = new SynonymFilter(filter, parser.build(), true);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            return new TokenStreamComponents(t, filter);
        }
    };
    spellchecker.setAccuracy(0.0f);
    spellchecker.setMinPrefix(1);
    spellchecker.setMinQueryLength(1);
    suggester = new NoisyChannelSpellChecker(0.85);
    wordScorer = new LaplaceScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5f);
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections[0].join(space).utf8ToString(), equalTo("captain america"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));
    generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("<em>captain america</em>"));
    // Make sure that user supplied text is not marked as highlighted in the presence of a synonym filter
    generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.85, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
    corrections = suggester.getCorrections(analyzer, new BytesRef("captain usw"), generator, 2, 4, ir, "body", wordScorer, 1, 2).corrections;
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
    assertThat(corrections[0].join(space, preTag, postTag).utf8ToString(), equalTo("captain <em>america</em>"));
}
Also used : HashMap(java.util.HashMap) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) Result(org.elasticsearch.search.suggest.phrase.NoisyChannelSpellChecker.Result) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) ShingleFilter(org.apache.lucene.analysis.shingle.ShingleFilter) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) DirectSpellChecker(org.apache.lucene.search.spell.DirectSpellChecker) BytesRef(org.apache.lucene.util.BytesRef) TokenFilter(org.apache.lucene.analysis.TokenFilter) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) DirectoryReader(org.apache.lucene.index.DirectoryReader) SynonymFilter(org.apache.lucene.analysis.synonym.SynonymFilter) RAMDirectory(org.apache.lucene.store.RAMDirectory) IOException(java.io.IOException) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) IndexWriter(org.apache.lucene.index.IndexWriter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) SolrSynonymParser(org.apache.lucene.analysis.synonym.SolrSynonymParser) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 2 with SynonymFilter

use of org.apache.lucene.analysis.synonym.SynonymFilter in project elasticsearch by elastic.

the class NoisyChannelSpellCheckerTests method testTrigram.

public void testTrigram() throws IOException {
    RAMDirectory dir = new RAMDirectory();
    Map<String, Analyzer> mapping = new HashMap<>();
    mapping.put("body_ngram", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            ShingleFilter tf = new ShingleFilter(t, 2, 3);
            tf.setOutputUnigrams(false);
            return new TokenStreamComponents(t, new LowerCaseFilter(tf));
        }
    });
    mapping.put("body", new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            return new TokenStreamComponents(t, new LowerCaseFilter(t));
        }
    });
    PerFieldAnalyzerWrapper wrapper = new PerFieldAnalyzerWrapper(new WhitespaceAnalyzer(), mapping);
    IndexWriterConfig conf = new IndexWriterConfig(wrapper);
    IndexWriter writer = new IndexWriter(dir, conf);
    String[] strings = new String[] { "Xorr the God-Jewel", "Grog the God-Crusher", "Xorn", "Walter Newell", "Wanda Maximoff", "Captain America", "American Ace", "USA Hero", "Wundarr the Aquarian", "Will o' the Wisp", "Xemnu the Titan", "Fantastic Four", "Quasar", "Quasar II" };
    for (String line : strings) {
        Document doc = new Document();
        doc.add(new Field("body", line, TextField.TYPE_NOT_STORED));
        doc.add(new Field("body_ngram", line, TextField.TYPE_NOT_STORED));
        writer.addDocument(doc);
    }
    DirectoryReader ir = DirectoryReader.open(writer);
    WordScorer wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1);
    NoisyChannelSpellChecker suggester = new NoisyChannelSpellChecker();
    DirectSpellChecker spellchecker = new DirectSpellChecker();
    spellchecker.setMinQueryLength(1);
    DirectCandidateGenerator generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 5);
    Correction[] corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 3).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ace"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("american ame"), generator, 1, 1, ir, "body", wordScorer, 1, 1).corrections;
    assertThat(corrections.length, equalTo(0));
    //        assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("american ape"));
    wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.5, 0.4, 0.1);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 0, 3).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 4, ir, "body", wordScorer, 1, 3).corrections;
    assertThat(corrections.length, equalTo(4));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
    assertThat(corrections[2].join(new BytesRef(" ")).utf8ToString(), equalTo("xorn the god jewel"));
    assertThat(corrections[3].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the got jewel"));
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 1, ir, "body", wordScorer, 100, 3).corrections;
    assertThat(corrections.length, equalTo(1));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    // test synonyms
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new StandardTokenizer();
            TokenFilter filter = new LowerCaseFilter(t);
            try {
                SolrSynonymParser parser = new SolrSynonymParser(true, false, new WhitespaceAnalyzer());
                parser.parse(new StringReader("usa => usa, america, american"));
                filter = new SynonymFilter(filter, parser.build(), true);
            } catch (Exception e) {
                throw new RuntimeException(e);
            }
            return new TokenStreamComponents(t, filter);
        }
    };
    spellchecker.setAccuracy(0.0f);
    spellchecker.setMinPrefix(1);
    spellchecker.setMinQueryLength(1);
    suggester = new NoisyChannelSpellChecker(0.95);
    wordScorer = new LinearInterpolatingScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.95d, new BytesRef(" "), 0.5, 0.4, 0.1);
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usa"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections;
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
    generator = new DirectCandidateGenerator(spellchecker, "body", SuggestMode.SUGGEST_MORE_POPULAR, ir, 0.95, 10, null, analyzer, MultiFields.getTerms(ir, "body"));
    corrections = suggester.getCorrections(analyzer, new BytesRef("captian usw"), generator, 2, 4, ir, "body", wordScorer, 1, 3).corrections;
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("captain america"));
    wordScorer = new StupidBackoffScorer(ir, MultiFields.getTerms(ir, "body_ngram"), "body_ngram", 0.85d, new BytesRef(" "), 0.4);
    corrections = suggester.getCorrections(wrapper, new BytesRef("Xor the Got-Jewel"), generator, 0.5f, 2, ir, "body", wordScorer, 0, 3).corrections;
    assertThat(corrections.length, equalTo(2));
    assertThat(corrections[0].join(new BytesRef(" ")).utf8ToString(), equalTo("xorr the god jewel"));
    assertThat(corrections[1].join(new BytesRef(" ")).utf8ToString(), equalTo("xor the god jewel"));
}
Also used : HashMap(java.util.HashMap) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) Analyzer(org.apache.lucene.analysis.Analyzer) Document(org.apache.lucene.document.Document) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) ShingleFilter(org.apache.lucene.analysis.shingle.ShingleFilter) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) DirectSpellChecker(org.apache.lucene.search.spell.DirectSpellChecker) BytesRef(org.apache.lucene.util.BytesRef) TokenFilter(org.apache.lucene.analysis.TokenFilter) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) DirectoryReader(org.apache.lucene.index.DirectoryReader) SynonymFilter(org.apache.lucene.analysis.synonym.SynonymFilter) RAMDirectory(org.apache.lucene.store.RAMDirectory) IOException(java.io.IOException) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) IndexWriter(org.apache.lucene.index.IndexWriter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) SolrSynonymParser(org.apache.lucene.analysis.synonym.SolrSynonymParser) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 3 with SynonymFilter

use of org.apache.lucene.analysis.synonym.SynonymFilter in project lucene-solr by apache.

the class CompletionTokenStreamTest method testWithSynonyms.

@Test
public void testWithSynonyms() throws Exception {
    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
    Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    String input = "mykeyword another keyword";
    tokenStream.setReader(new StringReader(input));
    SynonymFilter filter = new SynonymFilter(tokenStream, builder.build(), true);
    BytesRef payload = new BytesRef("payload");
    CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter, true, false, 100);
    completionTokenStream.setPayload(payload);
    PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream);
    String[] expectedOutputs = new String[2];
    CharsRefBuilder expectedOutput = new CharsRefBuilder();
    expectedOutput.append("mykeyword");
    expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL));
    expectedOutput.append("another");
    expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL));
    expectedOutput.append("keyword");
    expectedOutputs[0] = expectedOutput.toCharsRef().toString();
    expectedOutput.clear();
    expectedOutput.append("mysynonym");
    expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL));
    expectedOutput.append("another");
    expectedOutput.append(((char) CompletionAnalyzer.SEP_LABEL));
    expectedOutput.append("keyword");
    expectedOutputs[1] = expectedOutput.toCharsRef().toString();
    assertTokenStreamContents(stream, expectedOutputs, null, null, new String[] { payload.utf8ToString(), payload.utf8ToString() }, new int[] { 1, 1 }, null, null);
}
Also used : CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) SynonymFilter(org.apache.lucene.analysis.synonym.SynonymFilter) CharsRef(org.apache.lucene.util.CharsRef) SynonymMap(org.apache.lucene.analysis.synonym.SynonymMap) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) StringReader(java.io.StringReader) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) BytesRef(org.apache.lucene.util.BytesRef) Test(org.junit.Test)

Example 4 with SynonymFilter

use of org.apache.lucene.analysis.synonym.SynonymFilter in project lucene-solr by apache.

the class CompletionTokenStreamTest method testWithSynonym.

@Test
public void testWithSynonym() throws Exception {
    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("mykeyword"), new CharsRef("mysynonym"), true);
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader("mykeyword"));
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
    CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter);
    BytesRef payload = new BytesRef("payload");
    completionTokenStream.setPayload(payload);
    PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream);
    assertTokenStreamContents(stream, new String[] { "mykeyword", "mysynonym" }, null, null, new String[] { payload.utf8ToString(), payload.utf8ToString() }, new int[] { 1, 1 }, null, null);
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) StringReader(java.io.StringReader) SynonymFilter(org.apache.lucene.analysis.synonym.SynonymFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) CharsRef(org.apache.lucene.util.CharsRef) BytesRef(org.apache.lucene.util.BytesRef) SynonymMap(org.apache.lucene.analysis.synonym.SynonymMap) Test(org.junit.Test)

Example 5 with SynonymFilter

use of org.apache.lucene.analysis.synonym.SynonymFilter in project lucene-solr by apache.

the class CompletionTokenStreamTest method testValidNumberOfExpansions.

@Test
public void testValidNumberOfExpansions() throws IOException {
    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    for (int i = 0; i < 256; i++) {
        builder.add(new CharsRef("" + (i + 1)), new CharsRef("" + (1000 + (i + 1))), true);
    }
    StringBuilder valueBuilder = new StringBuilder();
    for (int i = 0; i < 8; i++) {
        valueBuilder.append(i + 1);
        valueBuilder.append(" ");
    }
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true);
    tokenizer.setReader(new StringReader(valueBuilder.toString()));
    SynonymFilter filter = new SynonymFilter(tokenizer, builder.build(), true);
    CompletionTokenStream completionTokenStream = new CompletionTokenStream(filter);
    completionTokenStream.setPayload(new BytesRef());
    PayloadAttrToTypeAttrFilter stream = new PayloadAttrToTypeAttrFilter(completionTokenStream);
    stream.reset();
    CompletionTokenStream.BytesRefBuilderTermAttribute attr = stream.addAttribute(CompletionTokenStream.BytesRefBuilderTermAttribute.class);
    PositionIncrementAttribute posAttr = stream.addAttribute(PositionIncrementAttribute.class);
    int maxPos = 0;
    int count = 0;
    while (stream.incrementToken()) {
        count++;
        assertNotNull(attr.getBytesRef());
        assertTrue(attr.getBytesRef().length > 0);
        maxPos += posAttr.getPositionIncrement();
    }
    stream.close();
    assertEquals(count, 256);
    assertEquals(count, maxPos);
}
Also used : CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) SynonymFilter(org.apache.lucene.analysis.synonym.SynonymFilter) CharsRef(org.apache.lucene.util.CharsRef) SynonymMap(org.apache.lucene.analysis.synonym.SynonymMap) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) StringReader(java.io.StringReader) BytesRef(org.apache.lucene.util.BytesRef) Test(org.junit.Test)

Aggregations

SynonymFilter (org.apache.lucene.analysis.synonym.SynonymFilter)8 SynonymMap (org.apache.lucene.analysis.synonym.SynonymMap)6 StringReader (java.io.StringReader)5 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)5 Tokenizer (org.apache.lucene.analysis.Tokenizer)5 BytesRef (org.apache.lucene.util.BytesRef)5 Analyzer (org.apache.lucene.analysis.Analyzer)4 CharsRef (org.apache.lucene.util.CharsRef)4 CharsRefBuilder (org.apache.lucene.util.CharsRefBuilder)4 TokenStream (org.apache.lucene.analysis.TokenStream)3 Test (org.junit.Test)3 IOException (java.io.IOException)2 HashMap (java.util.HashMap)2 LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)2 TokenFilter (org.apache.lucene.analysis.TokenFilter)2 WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer)2 PerFieldAnalyzerWrapper (org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper)2 ShingleFilter (org.apache.lucene.analysis.shingle.ShingleFilter)2 StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)2 SolrSynonymParser (org.apache.lucene.analysis.synonym.SolrSynonymParser)2