
Example 11 with WhitespaceTokenizer

Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project elasticsearch by elastic.

From the class BaseWordDelimiterTokenFilterFactoryTestCase, method testPreserveOriginal:

public void testPreserveOriginal() throws IOException {
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
            Settings.builder()
                    .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
                    .put("index.analysis.filter.my_word_delimiter.type", type)
                    .put("index.analysis.filter.my_word_delimiter.preserve_original", "true")
                    .build());
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_word_delimiter");
    String source = "PowerShot 500-42 wi-fi wi-fi-4000 j2se O'Neil's";
    String[] expected = new String[] {
            "PowerShot", "Power", "Shot",
            "500-42", "500", "42",
            "wi-fi", "wi", "fi",
            "wi-fi-4000", "wi", "fi", "4000",
            "j2se", "j", "2", "se",
            "O'Neil's", "O", "Neil" };
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Also used : WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) ESTestCase(org.elasticsearch.test.ESTestCase) StringReader(java.io.StringReader)
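
The same preserve_original expansion can be reproduced against Lucene directly, without the Elasticsearch test harness. A minimal standalone sketch, assuming the Lucene WordDelimiterFilter that the my_word_delimiter filter wraps; the flag set below is an assumption chosen to match the expected tokens above:

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PreserveOriginalSketch {
    public static void main(String[] args) throws Exception {
        // Assumed flags: the word_delimiter defaults plus PRESERVE_ORIGINAL.
        int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
                | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                | WordDelimiterFilter.SPLIT_ON_NUMERICS
                | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE
                | WordDelimiterFilter.PRESERVE_ORIGINAL;
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("PowerShot 500-42 wi-fi"));
        TokenStream stream = new WordDelimiterFilter(tokenizer, flags, null); // no protected words
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term); // PowerShot, Power, Shot, 500-42, 500, 42, wi-fi, wi, fi
        }
        stream.end();
        stream.close();
    }
}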

Example 12 with WhitespaceTokenizer

Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project che by eclipse.

From the class LuceneSearcher, method makeAnalyzer:

protected Analyzer makeAnalyzer() {
    return new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new WhitespaceTokenizer();
            TokenStream filter = new LowerCaseFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
}
Also used : WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) TokenStream(org.apache.lucene.analysis.TokenStream) Analyzer(org.apache.lucene.analysis.Analyzer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)
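
A usage note: an analyzer built this way is typically consumed through Analyzer.tokenStream. A minimal sketch of that pattern; the field name "content" and the input string are hypothetical, and analyzer stands for the Analyzer returned by makeAnalyzer() above:

// CharTermAttribute is org.apache.lucene.analysis.tokenattributes.CharTermAttribute.
try (TokenStream stream = analyzer.tokenStream("content", "Whitespace TOKENIZED Lower-Cased")) {
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        System.out.println(term); // whitespace, tokenized, lower-cased
    }
    stream.end();
}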

Example 13 with WhitespaceTokenizer

Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project lucene-solr by apache.

From the class ShingleFilterTest, method testReset:

public void testReset() throws Exception {
    Tokenizer wsTokenizer = new WhitespaceTokenizer();
    wsTokenizer.setReader(new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
    assertTokenStreamContents(filter,
            new String[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
            new int[] { 0, 0, 7, 7, 14, 14, 19 },
            new int[] { 6, 13, 13, 18, 18, 27, 27 },
            new String[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE },
            new int[] { 1, 0, 1, 0, 1, 0, 1 });
    // Reset the underlying tokenizer with fresh input; the filter chain must replay identically.
    wsTokenizer.setReader(new StringReader("please divide this sentence"));
    assertTokenStreamContents(filter,
            new String[] { "please", "please divide", "divide", "divide this", "this", "this sentence", "sentence" },
            new int[] { 0, 0, 7, 7, 14, 14, 19 },
            new int[] { 6, 13, 13, 18, 18, 27, 27 },
            new String[] { TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE, "shingle", TypeAttribute.DEFAULT_TYPE },
            new int[] { 1, 0, 1, 0, 1, 0, 1 });
}
Also used : WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) TokenStream(org.apache.lucene.analysis.TokenStream) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) StringReader(java.io.StringReader)
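
With maxShingleSize = 2, ShingleFilter emits each unigram plus a bigram ("shingle") stacked at the same position, which is what the position increments of 0 in the test assert. A minimal standalone sketch that prints the tokens together with their position increments:

import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class ShingleSketch {
    public static void main(String[] args) throws Exception {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("please divide this sentence"));
        ShingleFilter shingles = new ShingleFilter(tokenizer, 2); // unigrams plus bigrams
        CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posInc = shingles.addAttribute(PositionIncrementAttribute.class);
        shingles.reset();
        while (shingles.incrementToken()) {
            // posInc 0 marks a shingle stacked on the preceding unigram.
            System.out.println(term + " (posInc=" + posInc.getPositionIncrement() + ")");
        }
        shingles.end();
        shingles.close();
    }
}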

Example 14 with WhitespaceTokenizer

Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project lucene-solr by apache.

From the class SynonymFilterFactory, method inform:

@Override
public void inform(ResourceLoader loader) throws IOException {
    final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);
    Analyzer analyzer;
    if (analyzerName != null) {
        analyzer = loadAnalyzer(loader, analyzerName);
    } else {
        analyzer = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer() : factory.create();
                TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
                return new TokenStreamComponents(tokenizer, stream);
            }
        };
    }
    try (Analyzer a = analyzer) {
        String formatClass = format;
        if (format == null || format.equals("solr")) {
            formatClass = SolrSynonymParser.class.getName();
        } else if (format.equals("wordnet")) {
            formatClass = WordnetSynonymParser.class.getName();
        }
        // TODO: expose dedup as a parameter?
        map = loadSynonyms(loader, formatClass, true, a);
    } catch (ParseException e) {
        throw new IOException("Error parsing synonyms file:", e);
    }
}
Also used : WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) TokenStream(org.apache.lucene.analysis.TokenStream) Analyzer(org.apache.lucene.analysis.Analyzer) LowerCaseFilter(org.apache.lucene.analysis.LowerCaseFilter) IOException(java.io.IOException) ParseException(java.text.ParseException)

Example 15 with WhitespaceTokenizer

Use of org.apache.lucene.analysis.core.WhitespaceTokenizer in project lucene-solr by apache.

From the class SynonymGraphFilterFactory, method inform:

@Override
public void inform(ResourceLoader loader) throws IOException {
    final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);
    Analyzer analyzer;
    if (analyzerName != null) {
        analyzer = loadAnalyzer(loader, analyzerName);
    } else {
        analyzer = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer() : factory.create();
                TokenStream stream = ignoreCase ? new LowerCaseFilter(tokenizer) : tokenizer;
                return new TokenStreamComponents(tokenizer, stream);
            }
        };
    }
    try (Analyzer a = analyzer) {
        String formatClass = format;
        if (format == null || format.equals("solr")) {
            formatClass = SolrSynonymParser.class.getName();
        } else if (format.equals("wordnet")) {
            formatClass = WordnetSynonymParser.class.getName();
        }
        // TODO: expose dedup as a parameter?
        map = loadSynonyms(loader, formatClass, true, a);
    } catch (ParseException e) {
        throw new IOException("Error parsing synonyms file:", e);
    }
}
Also used : WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) TokenizerFactory(org.apache.lucene.analysis.util.TokenizerFactory) TokenStream(org.apache.lucene.analysis.TokenStream) Analyzer(org.apache.lucene.analysis.Analyzer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter) IOException(java.io.IOException) ParseException(java.text.ParseException)
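
The two factories above (Examples 14 and 15) build their SynonymMap identically; they differ only in which filter create() later wraps around the stream (SynonymFilter versus SynonymGraphFilter). A minimal standalone sketch of the same parsing path, feeding Solr-format rules from an inline string instead of a ResourceLoader; the rules and the input text are hypothetical:

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class SynonymSketch {
    public static void main(String[] args) throws Exception {
        try (Analyzer analyzer = new WhitespaceAnalyzer()) {
            // dedup = true mirrors the hard-coded dedup in loadSynonyms above; expand = true.
            SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
            parser.parse(new StringReader("i-pod, ipod\nwifi => wireless"));
            SynonymMap map = parser.build();

            Tokenizer tokenizer = new WhitespaceTokenizer();
            tokenizer.setReader(new StringReader("my ipod has wifi"));
            TokenStream stream = new SynonymGraphFilter(tokenizer, map, true); // ignoreCase = true
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term);
            }
            stream.end();
            stream.close();
        }
    }
}

With expand = true, "i-pod, ipod" maps each term to both forms, while "wifi => wireless" is a one-way replacement; this matches how SolrSynonymParser interprets Solr-format rules.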

Aggregations

WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 44
Tokenizer (org.apache.lucene.analysis.Tokenizer): 38
StringReader (java.io.StringReader): 37
ESTestCase (org.elasticsearch.test.ESTestCase): 25
TokenStream (org.apache.lucene.analysis.TokenStream): 16
Settings (org.elasticsearch.common.settings.Settings): 8
Analyzer (org.apache.lucene.analysis.Analyzer): 4
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 4
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 4
IOException (java.io.IOException): 3
HashMap (java.util.HashMap): 3
LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter): 3
PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter): 3
ParseException (java.text.ParseException): 2
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 2
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 2
StopFilter (org.apache.lucene.analysis.StopFilter): 2
TokenizerFactory (org.apache.lucene.analysis.util.TokenizerFactory): 2
SuggestStopFilter (org.apache.lucene.search.suggest.analyzing.SuggestStopFilter): 2
Version (org.elasticsearch.Version): 2