Example 11 with WordDelimiterGraphFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in the lucene-solr project by Apache.

From the class TestWordDelimiterGraphFilter, method testEmptyTerm.

public void testEmptyTerm() throws IOException {
    Random random = random();
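    // i enumerates all 512 combinations of the filter's configuration flag bits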
    for (int i = 0; i < 512; i++) {
        final int flags = i;
        final CharArraySet protectedWords;
        if (random.nextBoolean()) {
            protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new KeywordTokenizer();
                return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protectedWords));
            }
        };
        // depending upon options, this thing may or may not preserve the empty term
        checkAnalysisConsistency(random, a, random.nextBoolean(), "");
        a.close();
    }
}
Also used: CharArraySet(org.apache.lucene.analysis.CharArraySet) WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 12 with WordDelimiterGraphFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in the lucene-solr project by Apache.

From the class TestWordDelimiterGraphFilter, method testRandomStrings.

/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
    int numIterations = atLeast(5);
    for (int i = 0; i < numIterations; i++) {
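        // pick a random combination of the configuration flag bits for this iteration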
        final int flags = random().nextInt(512);
        final CharArraySet protectedWords;
        if (random().nextBoolean()) {
            protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
                return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, protectedWords));
            }
        };
        // TODO: properly support positionLengthAttribute
        checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER, 20, false, false);
        a.close();
    }
}
Also used: CharArraySet(org.apache.lucene.analysis.CharArraySet) WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 13 with WordDelimiterGraphFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in the lucene-solr project by Apache.

From the class TestWordDelimiterGraphFilter, method testProtectedWords.

public void testProtectedWords() throws Exception {
    TokenStream tokens = new CannedTokenStream(new Token("foo17-bar", 0, 9), new Token("foo-bar", 0, 7));
    CharArraySet protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("foo17-BAR")), true);
    WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(tokens, DEFAULT_WORD_DELIM_TABLE, GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | CATENATE_ALL, protectedWords);
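    // "foo17-bar" matches the protected set (case-insensitively) and is never split;
    // "foo-bar" contributes the preserved original, the split parts, and the catenation as alternative graph paths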
    assertGraphStrings(wdf, "foo17-bar foo bar", "foo17-bar foo-bar", "foo17-bar foobar");
}
Also used: CharArraySet(org.apache.lucene.analysis.CharArraySet) WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter)
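For context, here is a minimal standalone sketch (not taken from the project) of consuming a WordDelimiterGraphFilter directly through the TokenStream API; the class name, input text, flag choice, and protected word are illustrative assumptions.

import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WdgfProtectedWordsSketch {

    public static void main(String[] args) throws Exception {
        // flags, input, and protected word are illustrative choices
        int flags = WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.CATENATE_ALL;
        CharArraySet protectedWords = new CharArraySet(Arrays.asList("wi-fi"), true);
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("wi-fi foo-bar"));
        TokenStream ts = new WordDelimiterGraphFilter(tokenizer, flags, protectedWords);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // "wi-fi" is protected and passes through unsplit;
            // "foo-bar" yields the catenation "foobar" plus the parts "foo" and "bar"
            System.out.println(term.toString());
        }
        ts.end();
        ts.close();
    }
}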

Example 14 with WordDelimiterGraphFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in the lucene-solr project by Apache.

From the class TestWordDelimiterGraphFilter, method testLotsOfConcatenating2.

/** concat numbers + words + all + preserve original */
public void testLotsOfConcatenating2() throws Exception {
    final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    /* analyzer that uses whitespace + wdf */
    Analyzer a = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new WordDelimiterGraphFilter(tokenizer, flags, null));
        }
    };
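    // with every generate/catenate option enabled, the output contains the preserved
    // original, the catenated forms, and each individual word and number part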
    assertAnalyzesTo(a, "abc-def-123-456",
        new String[] { "abcdef123456", "abc-def-123-456", "abcdef", "abc", "def", "123456", "123", "456" },
        new int[] { 0, 0, 0, 0, 4, 8, 8, 12 },
        new int[] { 15, 15, 7, 3, 7, 15, 11, 15 },
        null,
        new int[] { 1, 0, 0, 0, 1, 1, 0, 1 },
        null,
        false);
    a.close();
}
Also used: WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 15 with WordDelimiterGraphFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter in the lucene-solr project by Apache.

From the class TestWordDelimiterGraphFilter, method doSplit; a hypothetical invocation of this helper is shown after the example below.

public void doSplit(final String input, String... output) throws Exception {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(keywordMockTokenizer(input), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
    assertTokenStreamContents(wdf, output);
}
Also used: WordDelimiterGraphFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter)
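A hypothetical invocation of this helper (not one of the project's own test cases), showing what the split-only flags produce: case changes and letter/digit transitions become token boundaries.

doSplit("PowerShot500", "Power", "Shot", "500");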

Aggregations

WordDelimiterGraphFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter): 16 uses
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 6 uses
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 6 uses
CharArraySet (org.apache.lucene.analysis.CharArraySet): 5 uses
StopFilter (org.apache.lucene.analysis.StopFilter): 1 use