Search in sources :

Example 76 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project lucene-solr by apache.

the class HTMLStripCharFilterTest method testUTF16Surrogates.

/**
 * Runs a series of {@code assertAnalyzesTo} checks against the analyzer returned by
 * {@code newTestAnalyzer()}, grouped by the kind of UTF-16 surrogate input:
 * paired surrogates, improperly paired surrogates, unpaired high surrogates and
 * unpaired low surrogates.
 *
 * <p>Each call passes an input string and the exact token array expected back.
 * The analyzer is closed at the end of the method.
 *
 * <p>NOTE(review): several input and expected literals below appear to contain
 * U+FFFD replacement characters where the section comments suggest surrogate
 * character references were intended; this looks like an encoding/extraction
 * artifact. Verify the literals against the upstream test before relying on them.
 *
 * @throws Exception if any of the analysis checks throws
 */
public void testUTF16Surrogates() throws Exception {
    Analyzer analyzer = newTestAnalyzer();
    // Paired surrogates
    assertAnalyzesTo(analyzer, " one two ��three", new String[] { "one", "two", "𫀁three" });
    assertAnalyzesTo(analyzer, " ��", new String[] { "𫀁" });
    assertAnalyzesTo(analyzer, " ��", new String[] { "𫀁" });
    assertAnalyzesTo(analyzer, " ��", new String[] { "𫀁" });
    // Improperly paired surrogates
    assertAnalyzesTo(analyzer, " �", new String[] { "�" });
    assertAnalyzesTo(analyzer, " �", new String[] { "�" });
    assertAnalyzesTo(analyzer, " 훚�", new String[] { "훚�" });
    assertAnalyzesTo(analyzer, " 훚�", new String[] { "훚�" });
    // Unpaired high surrogates
    assertAnalyzesTo(analyzer, " �", new String[] { "�" });
    // Reference without a terminating semicolon at end of input: a single replacement token is expected.
    assertAnalyzesTo(analyzer, " &#Xd921", new String[] { "�" });
    // Unterminated reference followed by a <br> tag: the reference text itself is the expected token.
    assertAnalyzesTo(analyzer, " &#Xd921<br>", new String[] { "&#Xd921" });
    // Same three shapes (terminated, unterminated, unterminated + tag) using a decimal reference.
    assertAnalyzesTo(analyzer, " &#55528;", new String[] { "�" });
    assertAnalyzesTo(analyzer, " &#55528", new String[] { "�" });
    assertAnalyzesTo(analyzer, " &#55528<br>", new String[] { "&#55528" });
    // Unpaired low surrogates
    // Hexadecimal reference: terminated, unterminated, unterminated + tag.
    assertAnalyzesTo(analyzer, " &#xdfdb;", new String[] { "�" });
    assertAnalyzesTo(analyzer, " &#xdfdb", new String[] { "�" });
    assertAnalyzesTo(analyzer, " &#xdfdb<br>", new String[] { "&#xdfdb" });
    // Decimal reference: terminated, unterminated, unterminated + tag.
    assertAnalyzesTo(analyzer, " &#57209;", new String[] { "�" });
    assertAnalyzesTo(analyzer, " &#57209", new String[] { "�" });
    assertAnalyzesTo(analyzer, " &#57209<br>", new String[] { "&#57209" });
    analyzer.close();
}
Also used : Analyzer(org.apache.lucene.analysis.Analyzer)

Example 77 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project lucene-solr by apache.

the class TestArabicNormalizationFilter method testEmptyTerm.

/**
 * Feeds the empty string through a {@code KeywordTokenizer} wrapped in an
 * {@code ArabicNormalizationFilter} and expects the empty string back, using
 * {@code checkOneTerm}.
 *
 * <p>The analyzer is managed by try-with-resources so it is closed even when
 * the check throws (previously {@code close()} was skipped on failure).
 *
 * @throws IOException if the analysis check fails with an I/O error
 */
public void testEmptyTerm() throws IOException {
    try (Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // The same tokenizer instance is both the source and the filter's input.
            Tokenizer tokenizer = new KeywordTokenizer();
            return new TokenStreamComponents(tokenizer, new ArabicNormalizationFilter(tokenizer));
        }
    }) {
        checkOneTerm(a, "", "");
    }
}
Also used : Analyzer(org.apache.lucene.analysis.Analyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer)

Example 78 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project lucene-solr by apache.

the class TestBulgarianAnalyzer method testBasicExamples.

/**
 * Test some examples from the paper.
 *
 * <p>Each assertion analyzes a Bulgarian input with {@code BulgarianAnalyzer}
 * and compares the produced tokens with the expected array.
 *
 * <p>The analyzer is managed by try-with-resources so it is closed even when
 * an assertion throws (previously {@code close()} was skipped on failure).
 *
 * @throws IOException if an analysis check fails with an I/O error
 */
public void testBasicExamples() throws IOException {
    try (Analyzer a = new BulgarianAnalyzer()) {
        assertAnalyzesTo(a, "енергийни кризи", new String[] { "енергийн", "криз" });
        assertAnalyzesTo(a, "Атомната енергия", new String[] { "атомн", "енерг" });
        assertAnalyzesTo(a, "компютри", new String[] { "компютр" });
        assertAnalyzesTo(a, "компютър", new String[] { "компютр" });
        assertAnalyzesTo(a, "градове", new String[] { "град" });
    }
}
Also used : Analyzer(org.apache.lucene.analysis.Analyzer)

Example 79 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project lucene-solr by apache.

the class TestBrazilianAnalyzer method testReusableTokenStream.

/**
 * Runs several {@code checkReuse} checks against a single
 * {@code BrazilianAnalyzer} instance, each with an input and its expected output.
 *
 * <p>The analyzer is managed by try-with-resources so it is closed even when
 * a check throws (previously {@code close()} was skipped on failure).
 *
 * @throws Exception if any of the checks throws
 */
public void testReusableTokenStream() throws Exception {
    try (Analyzer a = new BrazilianAnalyzer()) {
        checkReuse(a, "boa", "boa");
        checkReuse(a, "boainain", "boainain");
        checkReuse(a, "boas", "boas");
        // removes diacritic: different from snowball Portuguese
        checkReuse(a, "bôas", "boas");
    }
}
Also used : Analyzer(org.apache.lucene.analysis.Analyzer)

Example 80 with Analyzer

use of org.apache.lucene.analysis.Analyzer in project lucene-solr by apache.

the class TestBrazilianAnalyzer method testEmptyTerm.

/**
 * Feeds the empty string through a {@code KeywordTokenizer} wrapped in a
 * {@code BrazilianStemFilter} and expects the empty string back, using
 * {@code checkOneTerm}.
 *
 * <p>The analyzer is managed by try-with-resources so it is closed even when
 * the check throws (previously {@code close()} was skipped on failure).
 *
 * @throws IOException if the analysis check fails with an I/O error
 */
public void testEmptyTerm() throws IOException {
    try (Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // The same tokenizer instance is both the source and the filter's input.
            Tokenizer tokenizer = new KeywordTokenizer();
            return new TokenStreamComponents(tokenizer, new BrazilianStemFilter(tokenizer));
        }
    }) {
        checkOneTerm(a, "", "");
    }
}
Also used : Analyzer(org.apache.lucene.analysis.Analyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) LowerCaseTokenizer(org.apache.lucene.analysis.core.LowerCaseTokenizer)

Aggregations

Analyzer (org.apache.lucene.analysis.Analyzer): 1020, MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 396, Tokenizer (org.apache.lucene.analysis.Tokenizer): 265, MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 228, Document (org.apache.lucene.document.Document): 207, Directory (org.apache.lucene.store.Directory): 192, KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 176, BytesRef (org.apache.lucene.util.BytesRef): 122, Test (org.junit.Test): 119, TokenStream (org.apache.lucene.analysis.TokenStream): 107, RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 92, Term (org.apache.lucene.index.Term): 92, IndexReader (org.apache.lucene.index.IndexReader): 67, InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator): 65, StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 64, Input (org.apache.lucene.search.suggest.Input): 63, CharArraySet (org.apache.lucene.analysis.CharArraySet): 58, ArrayList (java.util.ArrayList): 57, IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 57, TextField (org.apache.lucene.document.TextField): 55