
Example 1 with LowerCaseTokenizer

Use of org.apache.lucene.analysis.core.LowerCaseTokenizer in the apache/lucene-solr project.

From the class TestCharTokenizers, method testMaxWordLength.

/*
   * Tests the default max word length of 255 - the tokenizer will split at
   * the 255th character no matter what.
   */
public void testMaxWordLength() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 255; i++) {
        builder.append("A");
    }
    Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
    tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT) });
}
Also used : LowerCaseTokenizer(org.apache.lucene.analysis.core.LowerCaseTokenizer) StringReader(java.io.StringReader) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) LetterTokenizer(org.apache.lucene.analysis.core.LetterTokenizer)
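
As a side note, the Lucene javadoc describes LowerCaseTokenizer as performing the work of LetterTokenizer and LowerCaseFilter combined. A minimal sketch of that equivalence (not part of the test suite; depending on the Lucene version, LowerCaseFilter lives in org.apache.lucene.analysis or org.apache.lucene.analysis.core):

// Sketch only: LetterTokenizer + LowerCaseFilter behave like LowerCaseTokenizer.
Tokenizer letters = new LetterTokenizer(newAttributeFactory());
letters.setReader(new StringReader("Foo BAR baz"));
TokenStream equivalent = new LowerCaseFilter(letters);
// "equivalent" yields the same tokens ("foo", "bar", "baz") that
// new LowerCaseTokenizer(newAttributeFactory()) would produce.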

Example 2 with LowerCaseTokenizer

Use of org.apache.lucene.analysis.core.LowerCaseTokenizer in the apache/lucene-solr project.

From the class TestCharTokenizers, method testMaxWordLengthWithSupplementary.

/*
   * Tests the default max word length of 255 with a surrogate pair at position 255.
   */
public void testMaxWordLengthWithSupplementary() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 254; i++) {
        builder.append("A");
    }
    builder.append("𐐜");
    Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
    tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT) });
}
Also used : LowerCaseTokenizer(org.apache.lucene.analysis.core.LowerCaseTokenizer) StringReader(java.io.StringReader) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) LetterTokenizer(org.apache.lucene.analysis.core.LetterTokenizer)
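
Why the position matters: a supplementary code point such as 𐐜 (from the Deseret block) occupies two Java chars as a surrogate pair, so placing it at position 255 makes it straddle the token-length limit. A quick standalone illustration using only java.lang.String:

// Plain Java, no Lucene types needed.
String s = "𐐜";
System.out.println(s.length());                      // 2 - one surrogate pair
System.out.println(s.codePointCount(0, s.length())); // 1 - a single code point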

Example 3 with LowerCaseTokenizer

Use of org.apache.lucene.analysis.core.LowerCaseTokenizer in the apache/lucene-solr project.

From the class TestGermanAnalyzer, method testWithKeywordAttribute.

public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(1, true);
    set.add("fischen");
    final LowerCaseTokenizer in = new LowerCaseTokenizer();
    in.setReader(new StringReader("Fischen Trinken"));
    // Terms marked as keywords are protected from stemming: "fischen" passes
    // through unchanged, while "Trinken" is lowercased and stemmed to "trink".
    GermanStemFilter filter = new GermanStemFilter(new SetKeywordMarkerFilter(in, set));
    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) LowerCaseTokenizer(org.apache.lucene.analysis.core.LowerCaseTokenizer) SetKeywordMarkerFilter(org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter) StringReader(java.io.StringReader)
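
For readers unfamiliar with the assertTokenStreamContents helper, the same chain can be consumed by hand via the standard TokenStream contract (reset / incrementToken / end / close). A minimal sketch, assuming an additional import of org.apache.lucene.analysis.tokenattributes.CharTermAttribute:

// Sketch only: consume the filter from the example above directly.
CharTermAttribute term = filter.addAttribute(CharTermAttribute.class);
filter.reset();
while (filter.incrementToken()) {
    System.out.println(term); // prints "fischen", then "trink"
}
filter.end();
filter.close();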

Example 4 with LowerCaseTokenizer

Use of org.apache.lucene.analysis.core.LowerCaseTokenizer in the apache/lucene-solr project.

From the class TestCharTokenizers, method testCustomMaxTokenLength.

/*
   * Tests a max word length passed as a constructor parameter - the tokenizer
   * will split at that position no matter what.
   */
public void testCustomMaxTokenLength() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 100; i++) {
        builder.append("A");
    }
    Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 100);
    // Tricky, passing two copies of the string to the reader....
    tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT) });
    Exception e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), -1));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
    tokenizer = new LetterTokenizer(newAttributeFactory(), 100);
    tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString(), builder.toString() });
    // Let's test that we can get a token longer than 255 through.
    builder.setLength(0);
    for (int i = 0; i < 500; i++) {
        builder.append("Z");
    }
    tokenizer = new LetterTokenizer(newAttributeFactory(), 500);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
    // Edge cases: a maxTokenLen of zero or less makes no sense, and a token
    // longer than the I/O buffer (4096 chars) must still come through intact.
    builder.setLength(0);
    for (int i = 0; i < 600; i++) {
        // 600 * 8 = 4800 chars.
        builder.append("aUrOkIjq");
    }
    e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), 0));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
    e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), 10_000_000));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
    tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 4800);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT) });
    e = expectThrows(IllegalArgumentException.class, () -> new KeywordTokenizer(newAttributeFactory(), 0));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
    e = expectThrows(IllegalArgumentException.class, () -> new KeywordTokenizer(newAttributeFactory(), 10_000_000));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
    tokenizer = new KeywordTokenizer(newAttributeFactory(), 4800);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
    e = expectThrows(IllegalArgumentException.class, () -> new LetterTokenizer(newAttributeFactory(), 0));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
    e = expectThrows(IllegalArgumentException.class, () -> new LetterTokenizer(newAttributeFactory(), 2_000_000));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 2000000", e.getMessage());
    tokenizer = new LetterTokenizer(newAttributeFactory(), 4800);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
    e = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizer(newAttributeFactory(), 0));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
    e = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizer(newAttributeFactory(), 3_000_000));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 3000000", e.getMessage());
    tokenizer = new WhitespaceTokenizer(newAttributeFactory(), 4800);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
}
Also used : WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) LowerCaseTokenizer(org.apache.lucene.analysis.core.LowerCaseTokenizer) StringReader(java.io.StringReader) LetterTokenizer(org.apache.lucene.analysis.core.LetterTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) IOException(java.io.IOException)
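
The splitting semantics are easier to see with a small limit. A hedged sketch, assuming the same test utilities (newAttributeFactory and assertTokenStreamContents) are available as in the test above:

// Sketch only: with maxTokenLen = 3, one 8-letter word is chopped into 3 + 3 + 2.
Tokenizer t = new LowerCaseTokenizer(newAttributeFactory(), 3);
t.setReader(new StringReader("ABCDEFGH"));
assertTokenStreamContents(t, new String[] { "abc", "def", "gh" });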

Example 5 with LowerCaseTokenizer

Use of org.apache.lucene.analysis.core.LowerCaseTokenizer in the apache/lucene-solr project.

From the class TestCharTokenizers, method testExtendCharBuffer.

/*
   * Tests extending the TermAttribute's internal buffer. If the algorithm that
   * grows the char array extended it by only one char and the next character
   * to be filled in were a supplementary code point (occupying two chars), an
   * index-out-of-bounds exception would be triggered.
   */
public void testExtendCharBuffer() throws IOException {
    for (int i = 0; i < 40; i++) {
        StringBuilder builder = new StringBuilder();
        for (int j = 0; j < 1 + i; j++) {
            builder.append("a");
        }
        builder.append("𐐜abc");
        Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory());
        tokenizer.setReader(new StringReader(builder.toString()));
        assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT) });
    }
}
Also used : LowerCaseTokenizer(org.apache.lucene.analysis.core.LowerCaseTokenizer) StringReader(java.io.StringReader) WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) LetterTokenizer(org.apache.lucene.analysis.core.LetterTokenizer)
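
The failure mode this test guards against can be reproduced in isolation: a char array grown by exactly one slot cannot receive both chars of a surrogate pair. A hypothetical illustration in plain Java, with no Lucene types:

// Hypothetical sketch of the bug class described in the comment above.
char[] pair = Character.toChars(0x1041C); // a supplementary code point - two chars
char[] buffer = new char[1];              // a buffer grown by only one slot
// System.arraycopy(pair, 0, buffer, 0, pair.length); // would throw ArrayIndexOutOfBoundsException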

Aggregations

StringReader (java.io.StringReader): 7 uses
LowerCaseTokenizer (org.apache.lucene.analysis.core.LowerCaseTokenizer): 7 uses
Tokenizer (org.apache.lucene.analysis.Tokenizer): 6 uses
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 6 uses
LetterTokenizer (org.apache.lucene.analysis.core.LetterTokenizer): 5 uses
WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 5 uses
CharArraySet (org.apache.lucene.analysis.CharArraySet): 2 uses
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 2 uses
IOException (java.io.IOException): 1 use