
Example 16 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.

From class MinHashFilterFactoryTests, method testDefault.

public void testDefault() throws IOException {
    int default_hash_count = 1;
    int default_bucket_size = 512;
    int default_hash_set_size = 1;
    Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");
    String source = "the quick brown fox";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    // with_rotation is true by default, and hash_set_size is 1, so even though the source doesn't
    // have enough tokens to fill all the buckets, we still expect 512 tokens.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), default_hash_count * default_bucket_size * default_hash_set_size);
}
Also used: WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer), ESTestCase (org.elasticsearch.test.ESTestCase), StringReader (java.io.StringReader), Tokenizer (org.apache.lucene.analysis.Tokenizer), Settings (org.elasticsearch.common.settings.Settings)
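
assertStreamHasNumberOfTokens is provided by Elasticsearch's test infrastructure and is not shown on this page. A minimal sketch of an equivalent token-counting helper, using only the standard Lucene TokenStream contract (reset, incrementToken, end, close):

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;

static void assertStreamHasNumberOfTokens(TokenStream stream, int expected) throws IOException {
    // A TokenStream must be reset() before the first incrementToken() call.
    stream.reset();
    int count = 0;
    while (stream.incrementToken()) {
        count++;
    }
    // end() finalizes offset state; close() releases resources.
    stream.end();
    stream.close();
    if (count != expected) {
        throw new AssertionError("expected " + expected + " tokens, got " + count);
    }
}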

Example 17 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.

From class MinHashFilterFactoryTests, method testSettings.

public void testSettings() throws IOException {
    Settings settings = Settings.builder().put("index.analysis.filter.test_min_hash.type", "min_hash").put("index.analysis.filter.test_min_hash.hash_count", "1").put("index.analysis.filter.test_min_hash.bucket_count", "2").put("index.analysis.filter.test_min_hash.hash_set_size", "1").put("index.analysis.filter.test_min_hash.with_rotation", false).put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
    ESTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(settings);
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash");
    String source = "sushi";
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    // despite the fact that bucket_count is 2 and hash_set_size is 1,
    // because with_rotation is false, we only expect 1 token here.
    assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), 1);
}
Also used: WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer), ESTestCase (org.elasticsearch.test.ESTestCase), StringReader (java.io.StringReader), Tokenizer (org.apache.lucene.analysis.Tokenizer), Settings (org.elasticsearch.common.settings.Settings)
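
The min_hash filter factory delegates to Lucene's MinHashFilter. A minimal standalone sketch of the same pipeline, assuming the five-argument MinHashFilter(input, hashCount, bucketCount, hashSetSize, withRotation) constructor that shipped with Lucene 6.2; the demo class name is ours:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.minhash.MinHashFilter;

public class MinHashDemo {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("sushi"));
        // hash_count=1, bucket_count=2, hash_set_size=1, with_rotation=false,
        // mirroring the index settings in the test above.
        TokenStream stream = new MinHashFilter(tokenizer, 1, 2, 1, false);
        stream.reset();
        int count = 0;
        while (stream.incrementToken()) {
            count++;
        }
        stream.end();
        stream.close();
        // With rotation disabled, empty buckets stay empty, so a one-token
        // input yields a single hash token.
        System.out.println(count); // expected: 1
    }
}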

Example 18 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.

From class NGramTokenizerFactoryTests, method testBackwardsCompatibilityEdgeNgramTokenFilter.

public void testBackwardsCompatibilityEdgeNgramTokenFilter() throws Exception {
    int iters = scaledRandomIntBetween(20, 100);
    for (int i = 0; i < iters; i++) {
        final Index index = new Index("test", "_na_");
        final String name = "ngr";
        Version v = randomVersion(random());
        Builder builder = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3);
        boolean reverse = random().nextBoolean();
        if (reverse) {
            builder.put("side", "back");
        }
        Settings settings = builder.build();
        Settings indexSettings = newAnalysisSettingsBuilder().put(IndexMetaData.SETTING_VERSION_CREATED, v.id).build();
        Tokenizer tokenizer = new MockTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        TokenStream edgeNGramTokenFilter = new EdgeNGramTokenFilterFactory(
                IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings)
                .create(tokenizer);
        if (reverse) {
            assertThat(edgeNGramTokenFilter, instanceOf(ReverseStringFilter.class));
        } else {
            assertThat(edgeNGramTokenFilter, instanceOf(EdgeNGramTokenFilter.class));
        }
    }
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), EdgeNGramTokenFilter (org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter), Builder (org.elasticsearch.common.settings.Settings.Builder), Index (org.elasticsearch.index.Index), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), Version (org.elasticsearch.Version), StringReader (java.io.StringReader), ReverseStringFilter (org.apache.lucene.analysis.reverse.ReverseStringFilter), Tokenizer (org.apache.lucene.analysis.Tokenizer), Settings (org.elasticsearch.common.settings.Settings), IndexSettings (org.elasticsearch.index.IndexSettings)
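
In the non-reversed branch, the factory produces Lucene's EdgeNGramTokenFilter. A minimal standalone sketch of that filter on its own, assuming the Lucene 6/7-era EdgeNGramTokenFilter(input, minGram, maxGram) constructor used by this code base (later Lucene versions add a preserveOriginal flag); the demo class name is ours:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class EdgeNGramDemo {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("foo bar"));
        // Front-edge n-grams of length 2..3 for each whitespace token.
        TokenStream stream = new EdgeNGramTokenFilter(tokenizer, 2, 3);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term); // prints: fo, foo, ba, bar
        }
        stream.end();
        stream.close();
    }
}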

Example 19 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.

From class NGramTokenizerFactoryTests, method testNoTokenChars.

public void testNoTokenChars() throws IOException {
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    final Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 4).putArray("token_chars", new String[0]).build();
    Tokenizer tokenizer = new NGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("1.34"));
    assertTokenStreamContents(tokenizer, new String[] { "1.", "1.3", "1.34", ".3", ".34", "34" });
}
Also used: StringReader (java.io.StringReader), Index (org.elasticsearch.index.Index), Tokenizer (org.apache.lucene.analysis.Tokenizer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), Settings (org.elasticsearch.common.settings.Settings), IndexSettings (org.elasticsearch.index.IndexSettings)
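
An empty token_chars array means no character-class restriction, so grams may cross punctuation, as the "1.34" expectations show. A sketch of the same behavior using Lucene's NGramTokenizer(minGram, maxGram) directly; the demo class name is ours:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class NGramDemo {
    public static void main(String[] args) throws IOException {
        // min_gram=2, max_gram=4, no token_chars restriction.
        NGramTokenizer tokenizer = new NGramTokenizer(2, 4);
        tokenizer.setReader(new StringReader("1.34"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        // Grams are emitted per start offset, shortest first:
        // 1., 1.3, 1.34, .3, .34, 34
        while (tokenizer.incrementToken()) {
            System.out.println(term);
        }
        tokenizer.end();
        tokenizer.close();
    }
}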

Example 20 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project elasticsearch by elastic.

From class NGramTokenizerFactoryTests, method testPreTokenizationEdge.

public void testPreTokenizationEdge() throws IOException {
    // Make sure that pretokenization works well and that it can be used even with token chars which are supplementary characters
    final Index index = new Index("test", "_na_");
    final String name = "ngr";
    final Settings indexSettings = newAnalysisSettingsBuilder().build();
    Settings settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit").build();
    Tokenizer tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader("Åbc déf g𐐀f "));
    assertTokenStreamContents(tokenizer, new String[] { "Åb", "Åbc", "dé", "déf", "g𐐀", "g𐐀f" });
    settings = newAnalysisSettingsBuilder().put("min_gram", 2).put("max_gram", 3).put("token_chars", "letter,digit,punctuation,whitespace,symbol").build();
    tokenizer = new EdgeNGramTokenizerFactory(IndexSettingsModule.newIndexSettings(index, indexSettings), null, name, settings).create();
    tokenizer.setReader(new StringReader(" a!$ 9"));
    assertTokenStreamContents(tokenizer, new String[] { " a", " a!" });
}
Also used: StringReader (java.io.StringReader), Index (org.elasticsearch.index.Index), Tokenizer (org.apache.lucene.analysis.Tokenizer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), Settings (org.elasticsearch.common.settings.Settings), IndexSettings (org.elasticsearch.index.IndexSettings)
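
The token_chars setting corresponds to NGramTokenizer's protected isTokenChar(int) hook, which the Elasticsearch factory wires up through an internal character matcher. A sketch of the "letter,digit" case via a direct override; note the hook receives full code points, which is why supplementary characters such as 𐐀 come through intact. The demo class name is ours:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class EdgeNGramTokenCharsDemo {
    public static void main(String[] args) throws IOException {
        // Characters for which isTokenChar returns false act as boundaries,
        // so only letter/digit runs are edge-grammed, mirroring
        // token_chars=letter,digit.
        Tokenizer tokenizer = new EdgeNGramTokenizer(2, 3) {
            @Override
            protected boolean isTokenChar(int chr) {
                // chr is a full code point, e.g. U+10400 for 𐐀.
                return Character.isLetter(chr) || Character.isDigit(chr);
            }
        };
        tokenizer.setReader(new StringReader("Åbc déf g𐐀f "));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term); // Åb, Åbc, dé, déf, g𐐀, g𐐀f
        }
        tokenizer.end();
        tokenizer.close();
    }
}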

Aggregations

Tokenizer (org.apache.lucene.analysis.Tokenizer): 611
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 288
Analyzer (org.apache.lucene.analysis.Analyzer): 269
StringReader (java.io.StringReader): 264
TokenStream (org.apache.lucene.analysis.TokenStream): 245
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 216
Reader (java.io.Reader): 91
WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 77
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 73
StopFilter (org.apache.lucene.analysis.StopFilter): 56
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 55
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 51
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 47
CharArraySet (org.apache.lucene.analysis.CharArraySet): 44
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 37
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 35
ESTestCase (org.elasticsearch.test.ESTestCase): 30
HashMap (java.util.HashMap): 24
TokenFilter (org.apache.lucene.analysis.TokenFilter): 24
Random (java.util.Random): 20