
Example 6 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in the elasticsearch project by elastic.

From the class SimpleIcuCollationTokenFilterTests, method assertCollation.

private void assertCollation(TokenFilterFactory factory, String string1, String string2, int comparison) throws IOException {
    Tokenizer tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(string1));
    TokenStream stream1 = factory.create(tokenizer);
    tokenizer = new KeywordTokenizer();
    tokenizer.setReader(new StringReader(string2));
    TokenStream stream2 = factory.create(tokenizer);
    assertCollation(stream1, stream2, comparison);
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), StringReader (java.io.StringReader), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer)
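
The assertCollation(TokenStream, TokenStream, int) overload that the last line delegates to is not part of this snippet. As a rough illustration of what such a comparison involves, here is a minimal sketch that drains each stream (a KeywordTokenizer emits exactly one token) and compares the collated terms, assuming org.apache.lucene.analysis.tokenattributes.CharTermAttribute and JUnit assertions; the helper name and the exact comparison are assumptions, not the actual Elasticsearch implementation.

private void assertSingleTokenOrder(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
    CharTermAttribute term1 = stream1.addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2.addAttribute(CharTermAttribute.class);
    stream1.reset();
    stream2.reset();
    // each stream is built on a KeywordTokenizer, so exactly one token is expected
    assertTrue(stream1.incrementToken());
    assertTrue(stream2.incrementToken());
    // ICU collation keys are encoded so that their String order matches collation order
    assertEquals(Integer.signum(comparison), Integer.signum(term1.toString().compareTo(term2.toString())));
    assertFalse(stream1.incrementToken());
    assertFalse(stream2.incrementToken());
    stream1.end();
    stream2.end();
    stream1.close();
    stream2.close();
}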

Example 7 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in the elasticsearch project by elastic.

From the class KuromojiAnalysisTests, method testBaseFormFilterFactory.

public void testBaseFormFilterFactory() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_pos");
    assertThat(tokenFilter, instanceOf(KuromojiPartOfSpeechFilterFactory.class));
    String source = "私は制限スピードを超える。";
    String[] expected = new String[] { "私", "は", "制限", "スピード", "を" };
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
Also used: StringReader (java.io.StringReader), JapaneseTokenizer (org.apache.lucene.analysis.ja.JapaneseTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer)
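
assertSimpleTSOutput is a helper defined elsewhere in KuromojiAnalysisTests and is not shown here. A minimal sketch of a helper in that spirit, assuming the usual org.apache.lucene.analysis.tokenattributes.CharTermAttribute plus JUnit assertions (the method name and messages are illustrative):

public static void assertTokens(TokenStream stream, String[] expected) throws IOException {
    CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    int i = 0;
    // drain the stream and compare each emitted term against the expected array
    while (stream.incrementToken()) {
        assertTrue("more tokens than expected", i < expected.length);
        assertEquals("token " + i, expected[i], termAttr.toString());
        i++;
    }
    stream.end();
    stream.close();
    assertEquals("token count", expected.length, i);
}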

Example 8 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in the elasticsearch project by elastic.

From the class KuromojiAnalysisTests, method testKatakanaStemFilter.

public void testKatakanaStemFilter() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("kuromoji_stemmer");
    assertThat(tokenFilter, instanceOf(KuromojiKatakanaStemmerFactory.class));
    String source = "明後日パーティーに行く予定がある。図書館で資料をコピーしました。";
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    // パーティー should be stemmed by default
    // コピー is shorter than the minimum length, so it should not be stemmed
    String[] expected_tokens_katakana = new String[] { "明後日", "パーティ", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た" };
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
    tokenFilter = analysis.tokenFilter.get("kuromoji_ks");
    assertThat(tokenFilter, instanceOf(KuromojiKatakanaStemmerFactory.class));
    tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    // パーティー should not be stemmed since min len == 6
    // コピー should not be stemmed
    expected_tokens_katakana = new String[] { "明後日", "パーティー", "に", "行く", "予定", "が", "ある", "図書館", "で", "資料", "を", "コピー", "し", "まし", "た" };
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected_tokens_katakana);
}
Also used: StringReader (java.io.StringReader), JapaneseTokenizer (org.apache.lucene.analysis.ja.JapaneseTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer)
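
Both factories wrap Lucene's katakana stemmer; the comments in the test indicate that "kuromoji_stemmer" uses the default minimum length of 4 (so パーティー is stemmed but the three-character コピー is not), while "kuromoji_ks" is configured with a minimum length of 6 (so nothing in this sentence is stemmed). A minimal sketch of the same effect using org.apache.lucene.analysis.ja.JapaneseKatakanaStemFilter directly, without the Elasticsearch factories (the method name is illustrative):

TokenStream katakanaStemWithMinLength6(String source) throws IOException {
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    // with the default minimum length of 4, パーティー would become パーティ;
    // with a minimum length of 6 it is left untouched, as in the second half of the test
    return new JapaneseKatakanaStemFilter(tokenizer, 6);
}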

Example 9 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in the elasticsearch project by elastic.

From the class KuromojiAnalysisTests, method testKuromojiUserDict.

public void testKuromojiUserDict() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenizerFactory tokenizerFactory = analysis.tokenizer.get("kuromoji_user_dict");
    String source = "私は制限スピードを超える。";
    String[] expected = new String[] { "私", "は", "制限スピード", "を", "超える" };
    Tokenizer tokenizer = tokenizerFactory.create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenizer, expected);
}
Also used: StringReader (java.io.StringReader), Tokenizer (org.apache.lucene.analysis.Tokenizer), JapaneseTokenizer (org.apache.lucene.analysis.ja.JapaneseTokenizer)
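
The "kuromoji_user_dict" tokenizer keeps 制限スピード as a single token because a user-dictionary entry overrides the default segmentation; the dictionary itself comes from the test's settings and is not shown in this snippet. A minimal sketch of the same behaviour built directly on Lucene's org.apache.lucene.analysis.ja.dict.UserDictionary, where the CSV line (surface form, segmentation, readings, part of speech) is an illustrative guess at such an entry rather than the one used by the test:

Tokenizer userDictionaryTokenizer() throws IOException {
    // one entry: treat 制限スピード as a single noun instead of 制限 + スピード
    UserDictionary userDict = UserDictionary.open(
            new StringReader("制限スピード,制限スピード,セイゲンスピード,カスタム名詞"));
    Tokenizer tokenizer = new JapaneseTokenizer(userDict, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader("私は制限スピードを超える。"));
    return tokenizer;
}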

Example 10 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in the elasticsearch project by elastic.

From the class KuromojiAnalysisTests, method testJapaneseStopFilterFactory.

public void testJapaneseStopFilterFactory() throws IOException {
    TestAnalysis analysis = createTestAnalysis();
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("ja_stop");
    assertThat(tokenFilter, instanceOf(JapaneseStopTokenFilterFactory.class));
    String source = "私は制限スピードを超える。";
    String[] expected = new String[] { "私", "制限", "超える" };
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
Also used: StringReader (java.io.StringReader), JapaneseTokenizer (org.apache.lucene.analysis.ja.JapaneseTokenizer), Tokenizer (org.apache.lucene.analysis.Tokenizer)
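
The expected output drops the particles は and を as well as スピード; the particles are covered by Lucene's bundled Japanese stopword list, while スピード is presumably added as an extra stopword in the test's "ja_stop" configuration (an assumption, since those settings are not shown here). A minimal sketch of a comparable filter chain built directly with org.apache.lucene.analysis.StopFilter, org.apache.lucene.analysis.CharArraySet and org.apache.lucene.analysis.ja.JapaneseAnalyzer (the method name is illustrative):

TokenStream japaneseStopStream() throws IOException {
    Tokenizer tokenizer = new JapaneseTokenizer(null, true, JapaneseTokenizer.Mode.SEARCH);
    tokenizer.setReader(new StringReader("私は制限スピードを超える。"));
    // start from the bundled Japanese stopword list, which contains particles such as は and を
    CharArraySet stopWords = new CharArraySet(JapaneseAnalyzer.getDefaultStopSet(), true);
    // extra stopword assumed from the test's expected output
    stopWords.add("スピード");
    return new StopFilter(tokenizer, stopWords);
}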

Aggregations

Tokenizer (org.apache.lucene.analysis.Tokenizer): 611 usages
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 288 usages
Analyzer (org.apache.lucene.analysis.Analyzer): 269 usages
StringReader (java.io.StringReader): 264 usages
TokenStream (org.apache.lucene.analysis.TokenStream): 245 usages
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 216 usages
Reader (java.io.Reader): 91 usages
WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 77 usages
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 73 usages
StopFilter (org.apache.lucene.analysis.StopFilter): 56 usages
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 55 usages
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 51 usages
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 47 usages
CharArraySet (org.apache.lucene.analysis.CharArraySet): 44 usages
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 37 usages
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 35 usages
ESTestCase (org.elasticsearch.test.ESTestCase): 30 usages
HashMap (java.util.HashMap): 24 usages
TokenFilter (org.apache.lucene.analysis.TokenFilter): 24 usages
Random (java.util.Random): 20 usages