Search in sources :

Example 6 with WordDelimiterGraphFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in the project lucene-solr by Apache.

The following snippet is from the class TestWordDelimiterGraphFilter, method testEmptyString.

/**
 * Verifies that a zero-length input token is passed through (because of
 * PRESERVE_ORIGINAL) exactly once, after which the stream is exhausted.
 */
public void testEmptyString() throws Exception {
    // try-with-resources guarantees close() even if an assertion fails,
    // which the original code did not (the filter leaked on test failure).
    try (WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(
            new CannedTokenStream(new Token("", 0, 0)),
            DEFAULT_WORD_DELIM_TABLE,
            GENERATE_WORD_PARTS | CATENATE_ALL | PRESERVE_ORIGINAL,
            null)) {
        wdf.reset();
        // the empty token is preserved as-is
        assertTrue(wdf.incrementToken());
        // nothing further should be emitted
        assertFalse(wdf.incrementToken());
        wdf.end();
    }
}
Also used : WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter)

Example 7 with WordDelimiterGraphFilter

use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by apache.

The following snippet is from the class TestWordDelimiterGraphFilter, method testRandomHugeStrings.

/** blast some enormous random strings through the analyzer */
/** blast some enormous random strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
    int iterations = atLeast(5);
    for (int iter = 0; iter < iterations; iter++) {
        // NOTE: the random() calls below must stay in this order so a given
        // test seed reproduces the same flags/protected-words combination.
        final int flags = random().nextInt(512);
        final CharArraySet protectedWords = random().nextBoolean()
            ? new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false)
            : null;
        Analyzer analyzer = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
                TokenStream sink = new WordDelimiterGraphFilter(source, flags, protectedWords);
                return new TokenStreamComponents(source, sink);
            }
        };
        // TODO: properly support positionLengthAttribute
        checkRandomData(random(), analyzer, 20 * RANDOM_MULTIPLIER, 8192, false, false);
        analyzer.close();
    }
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 8 with WordDelimiterGraphFilter

use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by apache.

The following snippet is from the class TestWordDelimiterGraphFilter, method testOffsets.

/**
 * Checks that subwords and the catenated token report offsets derived from
 * the original token, and that offsets are clamped when the incoming token's
 * offsets are too narrow to cover its text.
 */
public void testOffsets() throws IOException {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    // "foo-bar" at offsets [5,12): subwords get offsets within that span
    WordDelimiterGraphFilter filter = new WordDelimiterGraphFilter(
            new CannedTokenStream(new Token("foo-bar", 5, 12)),
            DEFAULT_WORD_DELIM_TABLE, flags, null);
    assertTokenStreamContents(filter,
            new String[] { "foobar", "foo", "bar" },
            new int[] { 5, 5, 9 },
            new int[] { 12, 8, 12 });
    // illegal offsets [5,6) (narrower than the text): every token is clamped
    filter = new WordDelimiterGraphFilter(
            new CannedTokenStream(new Token("foo-bar", 5, 6)),
            DEFAULT_WORD_DELIM_TABLE, flags, null);
    assertTokenStreamContents(filter,
            new String[] { "foobar", "foo", "bar" },
            new int[] { 5, 5, 5 },
            new int[] { 6, 6, 6 });
}
Also used : WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter)

Example 9 with WordDelimiterGraphFilter

use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by apache.

The following snippet is from the class TestWordDelimiterGraphFilter, method testLotsOfConcatenating.

/** concat numbers + words + all */
public void testLotsOfConcatenating() throws Exception {
    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    /* analyzer that uses whitespace + wdf */
    Analyzer a = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, null));
        }
    };
    assertAnalyzesTo(a, "abc-def-123-456", new String[] { "abcdef123456", "abcdef", "abc", "def", "123456", "123", "456" }, new int[] { 0, 0, 0, 4, 8, 8, 12 }, new int[] { 15, 7, 3, 7, 15, 11, 15 }, null, new int[] { 1, 0, 0, 1, 1, 0, 1 }, null, false);
    a.close();
}
Also used : WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 10 with WordDelimiterGraphFilter

use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in project lucene-solr by apache.

The following snippet is from the class TestWordDelimiterGraphFilter, method testOffsetChange4.

/**
 * Checks offsets when the delimiters being stripped are leading/trailing
 * punctuation: "(foo,bar)" at [7,16) yields subwords offset past the paren.
 */
public void testOffsetChange4() throws Exception {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    CannedTokenStream input = new CannedTokenStream(new Token("(foo,bar)", 7, 16));
    WordDelimiterGraphFilter filter = new WordDelimiterGraphFilter(input, DEFAULT_WORD_DELIM_TABLE, flags, null);
    assertTokenStreamContents(filter,
            new String[] { "foobar", "foo", "bar" },
            new int[] { 8, 8, 12 },
            new int[] { 15, 11, 15 });
}
Also used : WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter)

Aggregations

WordDelimiterGraphFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter)16 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)6 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)6 CharArraySet (org.apache.lucene.analysis.CharArraySet)5 StopFilter (org.apache.lucene.analysis.StopFilter)1