Example 91 with KeywordTokenizer

Use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

From the class TestCharTokenizers, method testCustomMaxTokenLength.

/*
 * Tests the max token length passed as a constructor parameter: the tokenizer
 * splits at exactly that many characters, regardless of token boundaries.
 */
public void testCustomMaxTokenLength() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 100; i++) {
        builder.append("A");
    }
    Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 100);
    // Tricky, passing two copies of the string to the reader....
    tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT) });
    Exception e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), -1));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
    tokenizer = new LetterTokenizer(newAttributeFactory(), 100);
    tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString(), builder.toString() });
    // Let's test that we can get a token longer than 255 through.
    builder.setLength(0);
    for (int i = 0; i < 500; i++) {
        builder.append("Z");
    }
    tokenizer = new LetterTokenizer(newAttributeFactory(), 500);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
    // Edge cases: zero and over-limit max lengths are rejected below, and a token
    // longer than the I/O buffer (4096 chars) must still come through in one piece.
    builder.setLength(0);
    for (int i = 0; i < 600; i++) {
        // 600 * 8 = 4800 chars.
        builder.append("aUrOkIjq");
    }
    e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), 0));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
    e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), 10_000_000));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
    tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 4800);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT) });
    e = expectThrows(IllegalArgumentException.class, () -> new KeywordTokenizer(newAttributeFactory(), 0));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
    e = expectThrows(IllegalArgumentException.class, () -> new KeywordTokenizer(newAttributeFactory(), 10_000_000));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
    tokenizer = new KeywordTokenizer(newAttributeFactory(), 4800);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
    e = expectThrows(IllegalArgumentException.class, () -> new LetterTokenizer(newAttributeFactory(), 0));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
    e = expectThrows(IllegalArgumentException.class, () -> new LetterTokenizer(newAttributeFactory(), 2_000_000));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 2000000", e.getMessage());
    tokenizer = new LetterTokenizer(newAttributeFactory(), 4800);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
    e = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizer(newAttributeFactory(), 0));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
    e = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizer(newAttributeFactory(), 3_000_000));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 3000000", e.getMessage());
    tokenizer = new WhitespaceTokenizer(newAttributeFactory(), 4800);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
}
Also used : Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) LetterTokenizer(org.apache.lucene.analysis.core.LetterTokenizer) LowerCaseTokenizer(org.apache.lucene.analysis.core.LowerCaseTokenizer) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) StringReader(java.io.StringReader) IOException(java.io.IOException)
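
For reference, here is a minimal standalone sketch of the splitting behavior the test asserts. The demo class and input are hypothetical; it assumes a Lucene version, like the one under test, whose CharTokenizer subclasses take a maxTokenLen constructor argument:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class MaxTokenLenDemo {
    public static void main(String[] args) throws IOException {
        // With maxTokenLen = 5, a 12-letter word is emitted as 5 + 5 + 2 chars.
        Tokenizer tokenizer = new LetterTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, 5);
        tokenizer.setReader(new StringReader("abcdefghijkl"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString()); // abcde, then fghij, then kl
        }
        tokenizer.end();
        tokenizer.close();
    }
}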

Example 92 with KeywordTokenizer

Use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

From the class TestTurkishLowerCaseFilter, method testEmptyTerm.

public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // KeywordTokenizer emits the (empty) input as a single zero-length token.
            Tokenizer tokenizer = new KeywordTokenizer();
            return new TokenStreamComponents(tokenizer, new TurkishLowerCaseFilter(tokenizer));
        }
    };
    checkOneTerm(a, "", "");
    a.close();
}
Also used : Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)
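
checkOneTerm is a BaseTokenStreamTestCase helper that asserts the analyzer produces exactly one term; the point of the test is that TurkishLowerCaseFilter survives the zero-length token KeywordTokenizer emits for empty input. Outside the test framework, the same analyzer can be consumed by hand. A minimal sketch (the demo class name and input are hypothetical) showing the Turkish-specific casing, where dotted capital İ lowercases to plain i:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;

public class TurkishLowerCaseDemo {
    public static void main(String[] args) throws IOException {
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new KeywordTokenizer();
                return new TokenStreamComponents(tokenizer, new TurkishLowerCaseFilter(tokenizer));
            }
        };
        try (TokenStream ts = a.tokenStream("field", "İSTANBUL")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString()); // istanbul
            }
            ts.end();
        }
        a.close();
    }
}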

Example 93 with KeywordTokenizer

Use of org.apache.lucene.analysis.core.KeywordTokenizer in project lucene-solr by apache.

From the class TestElision, method testEmptyTerm.

public void testEmptyTerm() throws IOException {
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new KeywordTokenizer();
            return new TokenStreamComponents(tokenizer, new ElisionFilter(tokenizer, FrenchAnalyzer.DEFAULT_ARTICLES));
        }
    };
    checkOneTerm(a, "", "");
    a.close();
}
Also used : Analyzer(org.apache.lucene.analysis.Analyzer) FrenchAnalyzer(org.apache.lucene.analysis.fr.FrenchAnalyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)
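
ElisionFilter strips contracted French articles (l', d', and so on) from the front of a token. A minimal sketch of the filter doing real work (demo class and input are hypothetical; a WhitespaceTokenizer is used instead of KeywordTokenizer so the input splits into words):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.fr.FrenchAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.ElisionFilter;

public class ElisionDemo {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("l'avion d'abord"));
        // Strip the contracted articles defined by the French analyzer.
        TokenStream ts = new ElisionFilter(tokenizer, FrenchAnalyzer.DEFAULT_ARTICLES);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString()); // avion, then abord
        }
        ts.end();
        ts.close();
    }
}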

Example 94 with KeywordTokenizer

Use of org.apache.lucene.analysis.core.KeywordTokenizer in project jena by apache.

From the class LowerCaseKeywordAnalyzer, method createComponents.

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    // Emit the whole field value as a single token, then lowercase it.
    KeywordTokenizer source = new KeywordTokenizer();
    LowerCaseFilter filter = new LowerCaseFilter(source);
    return new TokenStreamComponents(source, filter);
}
Also used : KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)
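
This analyzer keeps the whole field value as one token and only lowercases it, which suits case-insensitive exact-match fields. A short usage sketch with the same components built inline (demo class, field name, and input are hypothetical):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class LowerCaseKeywordDemo {
    public static void main(String[] args) throws IOException {
        // Same components as LowerCaseKeywordAnalyzer above, built inline.
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer source = new KeywordTokenizer();
                return new TokenStreamComponents(source, new LowerCaseFilter(source));
            }
        };
        try (TokenStream ts = analyzer.tokenStream("uri", "Hello World")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println("[" + term + "]"); // [hello world] -- one token, spaces kept
            }
            ts.end();
        }
        analyzer.close();
    }
}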

Example 95 with KeywordTokenizer

Use of org.apache.lucene.analysis.core.KeywordTokenizer in project cogcomp-nlp by CogComp.

From the class WikiURLAnalyzer, method createComponents.

@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    // Keep the entire input (a URL) as one token, then normalize it.
    final Tokenizer source = new KeywordTokenizer();
    TokenStream result = new StandardFilter(source);
    // CharacterFilter is a cogcomp-nlp project class; the rest are stock Lucene filters.
    result = new CharacterFilter(result);
    // Fold diacritics to their ASCII equivalents, then lowercase.
    result = new ASCIIFoldingFilter(result);
    result = new LowerCaseFilter(result);
    return new TokenStreamComponents(source, result);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)
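
Most of this chain is stock Lucene; only CharacterFilter comes from cogcomp-nlp. A sketch of the stock part (demo class and input are hypothetical), showing diacritics folded to ASCII and the single keyword token lowercased:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class FoldingDemo {
    public static void main(String[] args) throws IOException {
        Tokenizer source = new KeywordTokenizer();
        source.setReader(new StringReader("Café_Münster"));
        // Fold diacritics to ASCII, then lowercase the single keyword token.
        TokenStream ts = new LowerCaseFilter(new ASCIIFoldingFilter(source));
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            System.out.println(term.toString()); // cafe_munster
        }
        ts.end();
        ts.close();
    }
}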

Aggregations

KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 95 uses
Tokenizer (org.apache.lucene.analysis.Tokenizer): 86 uses
Analyzer (org.apache.lucene.analysis.Analyzer): 75 uses
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 64 uses
TokenStream (org.apache.lucene.analysis.TokenStream): 14 uses
StringReader (java.io.StringReader): 11 uses
WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 11 uses
LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter): 4 uses
PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter): 4 uses
Random (java.util.Random): 3 uses
CharArraySet (org.apache.lucene.analysis.CharArraySet): 3 uses
LetterTokenizer (org.apache.lucene.analysis.core.LetterTokenizer): 3 uses
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 3 uses
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 3 uses
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 3 uses
Transliterator (com.ibm.icu.text.Transliterator): 2 uses
UnicodeSet (com.ibm.icu.text.UnicodeSet): 2 uses
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 2 uses
LowerCaseTokenizer (org.apache.lucene.analysis.core.LowerCaseTokenizer): 2 uses
RemoveDuplicatesTokenFilter (org.apache.lucene.analysis.miscellaneous.RemoveDuplicatesTokenFilter): 2 uses