Search in sources :

Example 31 with Tokenizer

use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.

the class BaseWordDelimiterTokenFilterFactoryTestCase method testPreserveOriginal.

/**
 * Runs the token filter registered as "my_word_delimiter" with
 * {@code preserve_original} set to "true" over a whitespace-tokenized sample
 * and checks the emitted tokens: each original token is expected to be
 * followed by the sub-parts it is split into.
 *
 * @throws IOException declared by the signature; presumably raised by the
 *                     analysis setup or while consuming the token stream
 */
public void testPreserveOriginal() throws IOException {
    // Filter type comes from the enclosing test case's `type` member.
    Settings settings = Settings.builder()
        .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
        .put("index.analysis.filter.my_word_delimiter.type", type)
        .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
        .build();
    ESTestCase.TestAnalysis testAnalysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory filterFactory = testAnalysis.tokenFilter.get("my_word_delimiter");

    String input = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expectedTokens = {
        "PowerShot", "Power", "Shot",
        "500-42", "500", "42",
        "wi-fi", "wi", "fi",
        "wi-fi-4000", "wi", "fi", "4000",
        "j2se", "j", "2", "se",
        "O'Neil's", "O", "Neil"
    };

    Tokenizer whitespaceTokenizer = new WhitespaceTokenizer();
    whitespaceTokenizer.setReader(new StringReader(input));
    assertTokenStreamContents(filterFactory.create(whitespaceTokenizer), expectedTokens);
}
Also used : ESTestCase(org.elasticsearch.test.ESTestCase) StringReader(java.io.StringReader) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer)

Example 32 with Tokenizer

use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.

the class MockRepeatAnalyzer method createComponents.

/**
 * Builds the analysis chain for this analyzer: a {@link StandardTokenizer}
 * as the source, wrapped by a {@code MockRepeatFilter}.
 *
 * @param fieldName name of the field being analyzed; not consulted here
 * @return components pairing the source tokenizer with the filtered stream
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new StandardTokenizer();
    TokenStream sink = new MockRepeatFilter(source);
    return new TokenStreamComponents(source, sink);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer)

Example 33 with Tokenizer

use of org.apache.lucene.analysis.Tokenizer in project languagetool by languagetool-org.

the class LanguageToolFilterTest method testFilter.

/**
 * Feeds the sentence "How to?" through a {@code LanguageToolFilter} built on
 * an {@code AnyCharTokenizer} and an English {@code JLanguageTool}, then
 * checks the produced token stream against the expected values below.
 *
 * @throws Exception declared by the signature; any failure from the filter
 *                   or the assertion helper propagates to the test runner
 */
public void testFilter() throws Exception {
    String sentence = "How to?";
    Tokenizer tokenizer = new AnyCharTokenizer();
    tokenizer.setReader(new StringReader(sentence));
    LanguageToolFilter filter = new LanguageToolFilter(tokenizer, new JLanguageTool(new English()), false);
    //displayTokensWithFullDetails(filter);

    String sentenceStart = "_POS_SENT_START";
    // NOTE(review): the local names below follow the parameter order of Lucene's
    // assertTokenStreamContents(ts, output, startOffsets, endOffsets, types,
    // posIncrements, finalOffset) — confirm against the helper actually in scope.
    String[] expectedTerms = {
        sentenceStart,
        "How", "_LEMMA_how", "_POS_WRB",
        "to", "_LEMMA_to", "_POS_TO", "_LEMMA_to", "_POS_IN",
        "?", "_POS_SENT_END"
    };
    int[] startOffsets = { 0, 0, 0, 0, 4, 4, 4, 4, 4, 6, 6 };
    int[] endOffsets = { 0, 3, 3, 3, 6, 6, 6, 6, 6, 7, 7 };
    String[] types = { "pos", "word", "pos", "pos", "word", "pos", "pos", "pos", "pos", "word", "pos" };
    int[] positionIncrements = { 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0 };

    assertTokenStreamContents(filter, expectedTerms, startOffsets, endOffsets, types, positionIncrements, 7);
}
Also used : English(org.languagetool.language.English) JLanguageTool(org.languagetool.JLanguageTool) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer)

Example 34 with Tokenizer

use of org.apache.lucene.analysis.Tokenizer in project languagetool by languagetool-org.

the class LanguageToolAnalyzer method createComponents.

/**
 * Builds the analysis chain for this analyzer: an {@code AnyCharTokenizer}
 * feeding a {@code LanguageToolFilter} configured from this analyzer's
 * {@code languageTool} and {@code toLowerCase} members.
 *
 * @param s name of the field being analyzed; not consulted here
 * @return components pairing the source tokenizer with the filtered stream
 */
@Override
protected TokenStreamComponents createComponents(String s) {
    Tokenizer source = new AnyCharTokenizer();
    return new TokenStreamComponents(source, new LanguageToolFilter(source, languageTool, toLowerCase));
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) Tokenizer(org.apache.lucene.analysis.Tokenizer)

Example 35 with Tokenizer

use of org.apache.lucene.analysis.Tokenizer in project che by eclipse.

the class LuceneSearcher method makeAnalyzer.

/**
 * Creates the analyzer used by this searcher: input is split by a
 * {@link WhitespaceTokenizer} and the tokens are passed through a
 * {@link LowerCaseFilter}.
 *
 * @return a new anonymous {@link Analyzer} with that tokenizer/filter chain
 */
protected Analyzer makeAnalyzer() {
    return new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new WhitespaceTokenizer();
            return new TokenStreamComponents(source, new LowerCaseFilter(source));
        }
    };
}
Also used : WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) TokenStream(org.apache.lucene.analysis.TokenStream) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Aggregations

Tokenizer (org.apache.lucene.analysis.Tokenizer): 611; MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 288; Analyzer (org.apache.lucene.analysis.Analyzer): 269; StringReader (java.io.StringReader): 264; TokenStream (org.apache.lucene.analysis.TokenStream): 245; KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 216; Reader (java.io.Reader): 91; WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 77; StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 73; StopFilter (org.apache.lucene.analysis.StopFilter): 56; SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 55; LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 51; MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 47; CharArraySet (org.apache.lucene.analysis.CharArraySet): 44; StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 37; CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 35; ESTestCase (org.elasticsearch.test.ESTestCase): 30; HashMap (java.util.HashMap): 24; TokenFilter (org.apache.lucene.analysis.TokenFilter): 24; Random (java.util.Random): 20