
Example 1 with LetterTokenizer

Use of org.apache.lucene.analysis.core.LetterTokenizer in the lucene-solr project by Apache.

From the class TestCharTokenizers, method testCrossPlaneNormalization2.

// LUCENE-3642: normalize BMP->SMP and check that offsets are correct
public void testCrossPlaneNormalization2() throws IOException {
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()) {

                @Override
                protected int normalize(int c) {
                    if (c <= 0xffff) {
                        return 0x1043C;
                    } else {
                        return c;
                    }
                }
            };
            return new TokenStreamComponents(tokenizer, tokenizer);
        }
    };
    int num = 1000 * RANDOM_MULTIPLIER;
    for (int i = 0; i < num; i++) {
        String s = TestUtil.randomUnicodeString(random());
        try (TokenStream ts = analyzer.tokenStream("foo", s)) {
            ts.reset();
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            while (ts.incrementToken()) {
                String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
                for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
                    cp = highlightedText.codePointAt(j);
                    assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
                }
            }
            ts.end();
        }
    }
    // just for fun
    checkRandomData(random(), analyzer, num);
    analyzer.close();
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream), OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute), LetterTokenizer(org.apache.lucene.analysis.core.LetterTokenizer), Analyzer(org.apache.lucene.analysis.Analyzer), WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer), Tokenizer(org.apache.lucene.analysis.Tokenizer), KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer), LowerCaseTokenizer(org.apache.lucene.analysis.core.LowerCaseTokenizer)
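
The test above overrides LetterTokenizer.normalize(int) to push every BMP code point into the supplementary plane before checking offsets. For readers unfamiliar with the base class, here is a minimal standalone sketch of plain LetterTokenizer usage; the class name, sample text, and the expected output in the comments are illustrative assumptions, not part of the test.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class LetterTokenizerSketch {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new LetterTokenizer();
        tokenizer.setReader(new StringReader("foo123 bar, baz"));
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = tokenizer.addAttribute(OffsetAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            // LetterTokenizer emits maximal runs of letters, so this should print
            // foo [0,3], bar [7,10], baz [12,15]
            System.out.println(termAtt.toString()
                + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
        }
        tokenizer.end();
        tokenizer.close();
    }
}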

Example 2 with LetterTokenizer

Use of org.apache.lucene.analysis.core.LetterTokenizer in the lucene-solr project by Apache.

From the class EdgeNGramTokenFilterTest, method testGraphs.

public void testGraphs() throws IOException {
    TokenStream tk = new LetterTokenizer();
    ((Tokenizer) tk).setReader(new StringReader("abc d efgh ij klmno p q"));
    tk = new ShingleFilter(tk);
    tk = new EdgeNGramTokenFilter(tk, 7, 10);
    assertTokenStreamContents(tk, new String[] { "efgh ij", "ij klmn", "ij klmno", "klmno p" }, new int[] { 6, 11, 11, 14 }, new int[] { 13, 19, 19, 21 }, new int[] { 3, 1, 0, 1 }, new int[] { 2, 2, 2, 2 }, 23);
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream), ShingleFilter(org.apache.lucene.analysis.shingle.ShingleFilter), StringReader(java.io.StringReader), LetterTokenizer(org.apache.lucene.analysis.core.LetterTokenizer), WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer), Tokenizer(org.apache.lucene.analysis.Tokenizer), MockTokenizer(org.apache.lucene.analysis.MockTokenizer), KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)
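
The pipeline in testGraphs chains LetterTokenizer into ShingleFilter (which joins adjacent tokens into shingles) and then EdgeNGramTokenFilter(tk, 7, 10) (which keeps only prefixes of 7 to 10 characters). Below is a hedged sketch of inspecting that same chain without assertTokenStreamContents; the class name and println format are illustrative, and the expected terms in the comment are taken from the assertion above.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class EdgeNGramPipelineSketch {
    public static void main(String[] args) throws IOException {
        Tokenizer letters = new LetterTokenizer();
        letters.setReader(new StringReader("abc d efgh ij klmno p q"));
        // Shingles join adjacent tokens; the edge n-gram filter then keeps
        // only prefixes between 7 and 10 characters long.
        TokenStream ts = new EdgeNGramTokenFilter(new ShingleFilter(letters), 7, 10);
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // Expected terms per the test: "efgh ij", "ij klmn", "ij klmno", "klmno p"
            System.out.println(termAtt.toString() + " start=" + offsetAtt.startOffset()
                + " end=" + offsetAtt.endOffset()
                + " posInc=" + posIncAtt.getPositionIncrement());
        }
        ts.end();
        ts.close();
    }
}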

Example 3 with LetterTokenizer

Use of org.apache.lucene.analysis.core.LetterTokenizer in the lucene-solr project by Apache.

From the class TestCharTokenizers, method testCustomMaxTokenLength.

/*
   * tests the max word length passed as parameter - tokenizer will split at the passed position char no matter what happens
   */
public void testCustomMaxTokenLength() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 100; i++) {
        builder.append("A");
    }
    Tokenizer tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 100);
    // Tricky, passing two copies of the string to the reader....
    tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT) });
    Exception e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), -1));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: -1", e.getMessage());
    tokenizer = new LetterTokenizer(newAttributeFactory(), 100);
    tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString(), builder.toString() });
    // Let's test that we can get a token longer than 255 through.
    builder.setLength(0);
    for (int i = 0; i < 500; i++) {
        builder.append("Z");
    }
    tokenizer = new LetterTokenizer(newAttributeFactory(), 500);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
    // Just to be sure what is happening here, token lengths of zero make no sense, 
    // Let's try the edge cases, token > I/O buffer (4096)
    builder.setLength(0);
    for (int i = 0; i < 600; i++) {
        // 600 * 8 = 4800 chars.
        builder.append("aUrOkIjq");
    }
    e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), 0));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
    e = expectThrows(IllegalArgumentException.class, () -> new LowerCaseTokenizer(newAttributeFactory(), 10_000_000));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
    tokenizer = new LowerCaseTokenizer(newAttributeFactory(), 4800);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString().toLowerCase(Locale.ROOT) });
    e = expectThrows(IllegalArgumentException.class, () -> new KeywordTokenizer(newAttributeFactory(), 0));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
    e = expectThrows(IllegalArgumentException.class, () -> new KeywordTokenizer(newAttributeFactory(), 10_000_000));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 10000000", e.getMessage());
    tokenizer = new KeywordTokenizer(newAttributeFactory(), 4800);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
    e = expectThrows(IllegalArgumentException.class, () -> new LetterTokenizer(newAttributeFactory(), 0));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
    e = expectThrows(IllegalArgumentException.class, () -> new LetterTokenizer(newAttributeFactory(), 2_000_000));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 2000000", e.getMessage());
    tokenizer = new LetterTokenizer(newAttributeFactory(), 4800);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
    e = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizer(newAttributeFactory(), 0));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 0", e.getMessage());
    e = expectThrows(IllegalArgumentException.class, () -> new WhitespaceTokenizer(newAttributeFactory(), 3_000_000));
    assertEquals("maxTokenLen must be greater than 0 and less than 1048576 passed: 3000000", e.getMessage());
    tokenizer = new WhitespaceTokenizer(newAttributeFactory(), 4800);
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(tokenizer, new String[] { builder.toString() });
}
Also used: WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer), LowerCaseTokenizer(org.apache.lucene.analysis.core.LowerCaseTokenizer), StringReader(java.io.StringReader), LetterTokenizer(org.apache.lucene.analysis.core.LetterTokenizer), KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer), Tokenizer(org.apache.lucene.analysis.Tokenizer), IOException(java.io.IOException)
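
The assertions above pin down the constructor contract for the maxTokenLen parameter: it must be greater than 0 and less than 1048576, and any run of characters longer than the limit is split at exactly that many characters. A minimal standalone sketch of that splitting behavior follows; the class name, the limit of 3, and the sample input are illustrative assumptions, not values from the test.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LetterTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.AttributeFactory;

public class MaxTokenLenSketch {
    public static void main(String[] args) throws IOException {
        // maxTokenLen = 3: a run of 8 letters should come out as "abc", "def", "gh"
        Tokenizer tokenizer =
            new LetterTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, 3);
        tokenizer.setReader(new StringReader("abcdefgh"));
        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(termAtt.toString());
        }
        tokenizer.end();
        tokenizer.close();
    }
}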

Example 4 with LetterTokenizer

Use of org.apache.lucene.analysis.core.LetterTokenizer in the lucene-solr project by Apache.

From the class TestCharTokenizers, method testCrossPlaneNormalization.

// LUCENE-3642: normalize SMP->BMP and check that offsets are correct
public void testCrossPlaneNormalization() throws IOException {
    Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory()) {

                @Override
                protected int normalize(int c) {
                    if (c > 0xffff) {
                        return 'δ';
                    } else {
                        return c;
                    }
                }
            };
            return new TokenStreamComponents(tokenizer, tokenizer);
        }
    };
    int num = 1000 * RANDOM_MULTIPLIER;
    for (int i = 0; i < num; i++) {
        String s = TestUtil.randomUnicodeString(random());
        try (TokenStream ts = analyzer.tokenStream("foo", s)) {
            ts.reset();
            OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
            while (ts.incrementToken()) {
                String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
                for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
                    cp = highlightedText.codePointAt(j);
                    assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
                }
            }
            ts.end();
        }
    }
    // just for fun
    checkRandomData(random(), analyzer, num);
    analyzer.close();
}
Also used: TokenStream(org.apache.lucene.analysis.TokenStream), OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute), LetterTokenizer(org.apache.lucene.analysis.core.LetterTokenizer), Analyzer(org.apache.lucene.analysis.Analyzer), WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer), Tokenizer(org.apache.lucene.analysis.Tokenizer), KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer), LowerCaseTokenizer(org.apache.lucene.analysis.core.LowerCaseTokenizer)
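
Both cross-plane tests verify offsets by walking the highlighted substring code point by code point rather than char by char, so that surrogate pairs for supplementary-plane letters are checked as single units. A standalone sketch of that iteration pattern is below; the class name and sample string are illustrative, using U+1043C, the same supplementary-plane letter that testCrossPlaneNormalization2 normalizes to.

public class CodePointWalkSketch {
    public static void main(String[] args) {
        String s = "a\uD801\uDC3Cb"; // 'a', U+1043C (a Deseret small letter), 'b'
        for (int i = 0, cp = 0; i < s.length(); i += Character.charCount(cp)) {
            cp = s.codePointAt(i);
            // Stepping by Character.charCount(cp) keeps the surrogate pair together,
            // so U+1043C is seen as one code point and Character.isLetter(cp) is true.
            System.out.println(Integer.toHexString(cp) + " letter=" + Character.isLetter(cp));
        }
    }
}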

Aggregations

Tokenizer (org.apache.lucene.analysis.Tokenizer): 4 uses
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 4 uses
LetterTokenizer (org.apache.lucene.analysis.core.LetterTokenizer): 4 uses
WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 4 uses
TokenStream (org.apache.lucene.analysis.TokenStream): 3 uses
LowerCaseTokenizer (org.apache.lucene.analysis.core.LowerCaseTokenizer): 3 uses
StringReader (java.io.StringReader): 2 uses
Analyzer (org.apache.lucene.analysis.Analyzer): 2 uses
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 2 uses
IOException (java.io.IOException): 1 use
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 1 use
ShingleFilter (org.apache.lucene.analysis.shingle.ShingleFilter): 1 use