Search in sources :

Example 31 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

the class TestStemmerOverrideFilter method testOverride.

public void testOverride() throws IOException {
    // lets make booked stem to books
    // the override filter will convert "booked" to "books",
    // but also mark it with KeywordAttribute so Porter will not change it.
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder();
    builder.add("booked", "books");
    Tokenizer tokenizer = keywordTokenizer("booked");
    TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, builder.build()));
    assertTokenStreamContents(stream, new String[] { "books" });
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 32 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

the class TestScandinavianNormalizationFilter method testEmptyTerm.

/** check that the empty string doesn't cause issues */
public void testEmptyTerm() throws Exception {
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            return new TokenStreamComponents(tokenizer, new ScandinavianNormalizationFilter(tokenizer));
        }
    };
    checkOneTerm(a, "", "");
    a.close();
}
Also used : Analyzer(org.apache.lucene.analysis.Analyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 33 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

the class TestStemmerOverrideFilter method testRandomRealisticKeyword.

public void testRandomRealisticKeyword() throws IOException {
    Map<String, String> map = new HashMap<>();
    int numTerms = atLeast(50);
    for (int i = 0; i < numTerms; i++) {
        String randomRealisticUnicodeString = TestUtil.randomRealisticUnicodeString(random());
        if (randomRealisticUnicodeString.length() > 0) {
            String value = TestUtil.randomSimpleString(random());
            map.put(randomRealisticUnicodeString, value.isEmpty() ? "a" : value);
        }
    }
    if (map.isEmpty()) {
        map.put("booked", "books");
    }
    // This test might fail if ignoreCase is true since the map might have twice the same key, once
    // lowercased and once uppercased
    StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(false);
    Set<Entry<String, String>> entrySet = map.entrySet();
    for (Entry<String, String> entry : entrySet) {
        builder.add(entry.getKey(), entry.getValue());
    }
    StemmerOverrideMap build = builder.build();
    for (Entry<String, String> entry : entrySet) {
        if (random().nextBoolean()) {
            Tokenizer tokenizer = new KeywordTokenizer();
            tokenizer.setReader(new StringReader(entry.getKey()));
            TokenStream stream = new PorterStemFilter(new StemmerOverrideFilter(tokenizer, build));
            assertTokenStreamContents(stream, new String[] { entry.getValue() });
        }
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) HashMap(java.util.HashMap) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) StemmerOverrideMap(org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap) Entry(java.util.Map.Entry) StringReader(java.io.StringReader) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 34 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

the class NGramTokenFilterTest method testEmptyTerm.

public void testEmptyTerm() throws Exception {
    Random random = random();
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            return new TokenStreamComponents(tokenizer, new NGramTokenFilter(tokenizer, 2, 15));
        }
    };
    checkAnalysisConsistency(random, a, random.nextBoolean(), "");
    a.close();
}
Also used : Random(java.util.Random) Analyzer(org.apache.lucene.analysis.Analyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 35 with KeywordTokenizer

use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

the class NGramTokenFilterTest method testSupplementaryCharacters.

public void testSupplementaryCharacters() throws IOException {
    final String s = TestUtil.randomUnicodeString(random(), 10);
    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = TestUtil.nextInt(random(), 1, 3);
    final int maxGram = TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer();
    ((Tokenizer) tk).setReader(new StringReader(s));
    tk = new NGramTokenFilter(tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int start = 0; start < codePointCount; ++start) {
        for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
            assertTrue(tk.incrementToken());
            assertEquals(0, offsetAtt.startOffset());
            assertEquals(s.length(), offsetAtt.endOffset());
            final int startIndex = Character.offsetByCodePoints(s, 0, start);
            final int endIndex = Character.offsetByCodePoints(s, 0, end);
            assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
        }
    }
    assertFalse(tk.incrementToken());
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Aggregations

KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)95 Tokenizer (org.apache.lucene.analysis.Tokenizer)86 Analyzer (org.apache.lucene.analysis.Analyzer)75 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)64 TokenStream (org.apache.lucene.analysis.TokenStream)14 StringReader (java.io.StringReader)11 WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer)11 LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)4 PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter)4 Random (java.util.Random)3 CharArraySet (org.apache.lucene.analysis.CharArraySet)3 LetterTokenizer (org.apache.lucene.analysis.core.LetterTokenizer)3 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)3 StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)3 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)3 Transliterator (com.ibm.icu.text.Transliterator)2 UnicodeSet (com.ibm.icu.text.UnicodeSet)2 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)2 LowerCaseTokenizer (org.apache.lucene.analysis.core.LowerCaseTokenizer)2 RemoveDuplicatesTokenFilter (org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter)2