
Example 16 with WordDelimiterFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in the lucene-solr project by apache.

From the class TestWordDelimiterFilter, method testRandomHugeStrings.

/** blast some enormous random strings through the analyzer */
public void testRandomHugeStrings() throws Exception {
    int numIterations = atLeast(5);
    for (int i = 0; i < numIterations; i++) {
        final int flags = random().nextInt(512);
        final CharArraySet protectedWords;
        if (random().nextBoolean()) {
            protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
                return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
            }
        };
        // TODO: properly support positionLengthAttribute
        checkRandomData(random(), a, 20 * RANDOM_MULTIPLIER, 8192, false, false);
        a.close();
    }
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)
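
For reference, here is a minimal standalone sketch of the same Analyzer-plus-WordDelimiterFilter pattern, with a fixed flag combination instead of the randomly drawn one used by the fuzz test. The class name WdfAnalyzerDemo, the input text, the chosen flags, and the protected word "Wi-Fi" are illustrative assumptions, not part of the test above.

import java.io.IOException;
import java.util.Arrays;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WdfAnalyzerDemo {

    public static void main(String[] args) throws IOException {
        // Fixed, illustrative flag combination; the fuzz test above draws flags at random.
        final int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
                | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                | WordDelimiterFilter.SPLIT_ON_NUMERICS;
        // Tokens in this set are passed through unsplit, like the test's protectedWords set.
        final CharArraySet protectedWords = new CharArraySet(Arrays.asList("Wi-Fi"), true);
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new WhitespaceTokenizer();
                return new TokenStreamComponents(tokenizer,
                        new WordDelimiterFilter(tokenizer, flags, protectedWords));
            }
        };
        try (TokenStream ts = a.tokenStream("field", "Wi-Fi powerShot500")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // Prints: Wi-Fi, power, Shot, 500
                System.out.println(term);
            }
            ts.end();
        }
        a.close();
    }
}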

Example 17 with WordDelimiterFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in the lucene-solr project by apache.

From the class TestWordDelimiterFilter, method doSplitPossessive.

public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
    flags |= (stemPossessive == 1) ? STEM_ENGLISH_POSSESSIVE : 0;
    WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input), flags, null);
    assertTokenStreamContents(wdf, output);
}
Also used : WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter)
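
doSplitPossessive toggles STEM_ENGLISH_POSSESSIVE on top of the basic splitting flags: with it set, a trailing "'s" is stripped from subwords; without it, the "s" is emitted as its own part. Below is a minimal sketch of that difference; the class name PossessiveDemo and the input "O'Neil's" are assumptions for illustration, and a real KeywordTokenizer stands in for the test's keywordMockTokenizer helper.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PossessiveDemo {

    // Prints the tokens WordDelimiterFilter emits for one input under the given flags.
    static void print(String input, int flags) throws IOException {
        Tokenizer tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader(input));
        try (WordDelimiterFilter wdf = new WordDelimiterFilter(tokenizer, flags, null)) {
            CharTermAttribute term = wdf.addAttribute(CharTermAttribute.class);
            wdf.reset();
            while (wdf.incrementToken()) {
                System.out.print(term + " ");
            }
            wdf.end();
        }
        System.out.println();
    }

    public static void main(String[] args) throws IOException {
        int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
                | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                | WordDelimiterFilter.SPLIT_ON_NUMERICS;
        // Without possessive stemming: O Neil s
        print("O'Neil's", flags);
        // With possessive stemming: O Neil
        print("O'Neil's", flags | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE);
    }
}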

Example 18 with WordDelimiterFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in the lucene-solr project by apache.

From the class TestWordDelimiterFilter, method testPositionIncrements.

@Test
public void testPositionIncrements() throws Exception {
    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("NUTCH")), false);
    /* analyzer that uses whitespace + wdf */
    Analyzer a = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protWords));
        }
    };
    /* in this case, works as expected. */
    assertAnalyzesTo(a, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 }, new int[] { 6, 13 }, null, new int[] { 1, 1 }, null, false);
    /* only in this case, posInc of 2 ?! */
    assertAnalyzesTo(a, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" }, new int[] { 0, 9, 9, 12 }, new int[] { 6, 12, 13, 13 }, null, new int[] { 1, 1, 0, 1 }, null, false);
    assertAnalyzesTo(a, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, null, new int[] { 1, 1, 1 }, null, false);
    /* analyzer that will consume tokens with large position increments */
    Analyzer a2 = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(new LargePosIncTokenFilter(tokenizer), flags, protWords));
        }
    };
    /* increment of "largegap" is preserved */
    assertAnalyzesTo(a2, "LUCENE largegap SOLR", new String[] { "LUCENE", "largegap", "SOLR" }, new int[] { 0, 7, 16 }, new int[] { 6, 15, 20 }, null, new int[] { 1, 10, 1 }, null, false);
    /* the "/" had a position increment of 10, where did it go?!?!! */
    assertAnalyzesTo(a2, "LUCENE / SOLR", new String[] { "LUCENE", "SOLR" }, new int[] { 0, 9 }, new int[] { 6, 13 }, null, new int[] { 1, 11 }, null, false);
    /* in this case, the increment of 10 from the "/" is carried over */
    assertAnalyzesTo(a2, "LUCENE / solR", new String[] { "LUCENE", "sol", "solR", "R" }, new int[] { 0, 9, 9, 12 }, new int[] { 6, 12, 13, 13 }, null, new int[] { 1, 11, 0, 1 }, null, false);
    assertAnalyzesTo(a2, "LUCENE / NUTCH SOLR", new String[] { "LUCENE", "NUTCH", "SOLR" }, new int[] { 0, 9, 15 }, new int[] { 6, 14, 19 }, null, new int[] { 1, 11, 1 }, null, false);
    Analyzer a3 = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            StopFilter filter = new StopFilter(tokenizer, StandardAnalyzer.STOP_WORDS_SET);
            return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(filter, flags, protWords));
        }
    };
    assertAnalyzesTo(a3, "lucene.solr", new String[] { "lucene", "lucenesolr", "solr" }, new int[] { 0, 0, 7 }, new int[] { 6, 11, 11 }, null, new int[] { 1, 0, 1 }, null, false);
    /* the stopword should add a gap here */
    assertAnalyzesTo(a3, "the lucene.solr", new String[] { "lucene", "lucenesolr", "solr" }, new int[] { 4, 4, 11 }, new int[] { 10, 15, 15 }, null, new int[] { 2, 0, 1 }, null, false);
    IOUtils.close(a, a2, a3);
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) StopFilter(org.apache.lucene.analysis.StopFilter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) Test(org.junit.Test)
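
The assertions above check terms, start/end offsets, and position increments in one call. Below is a minimal sketch of reading those attributes from the stream directly, using the same flags as the test; the class name PosIncDemo and the use of a plain WhitespaceTokenizer in place of MockTokenizer are assumptions for illustration.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class PosIncDemo {

    public static void main(String[] args) throws IOException {
        // Same flag combination as testPositionIncrements above.
        final int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
                | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                | WordDelimiterFilter.CATENATE_ALL
                | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                | WordDelimiterFilter.SPLIT_ON_NUMERICS
                | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
        Analyzer a = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new WhitespaceTokenizer();
                return new TokenStreamComponents(tokenizer,
                        new WordDelimiterFilter(tokenizer, flags, null));
            }
        };
        try (TokenStream ts = a.tokenStream("field", "LUCENE / solR")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
            OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // e.g. "sol" posInc=1 [9,12], then the catenated "solR" posInc=0 [9,13]
                System.out.printf("%-8s posInc=%d [%d,%d]%n",
                        term, posInc.getPositionIncrement(), offset.startOffset(), offset.endOffset());
            }
            ts.end();
        }
        a.close();
    }
}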

Example 19 with WordDelimiterFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in the lucene-solr project by apache.

From the class TestWordDelimiterFilter, method doSplit.

public void doSplit(final String input, String... output) throws Exception {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    WordDelimiterFilter wdf = new WordDelimiterFilter(keywordMockTokenizer(input), WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null);
    assertTokenStreamContents(wdf, output);
}
Also used : WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter)
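
doSplit uses the four-argument constructor, which also takes the character type table that classifies each character as letter, digit, or delimiter; passing WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE gives the default classification, while a custom byte[] could reclassify characters. A minimal sketch follows, assuming DEFAULT_WORD_DELIM_TABLE is accessible from the calling code as it is from the test; the class name DefaultTableDemo and the input string are assumptions for illustration.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterIterator;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class DefaultTableDemo {

    public static void main(String[] args) throws IOException {
        // Same splitting flags as doSplit above.
        int flags = WordDelimiterFilter.GENERATE_WORD_PARTS
                | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                | WordDelimiterFilter.SPLIT_ON_NUMERICS
                | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE;
        Tokenizer tokenizer = new KeywordTokenizer();
        tokenizer.setReader(new StringReader("PowerShot500-42"));
        // Explicitly passing the default character type table; equivalent to the
        // three-argument constructor, but a custom table could be supplied instead.
        try (WordDelimiterFilter wdf = new WordDelimiterFilter(
                tokenizer, WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, null)) {
            CharTermAttribute term = wdf.addAttribute(CharTermAttribute.class);
            wdf.reset();
            while (wdf.incrementToken()) {
                // Prints: Power, Shot, 500, 42
                System.out.println(term);
            }
            wdf.end();
        }
    }
}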

Aggregations

WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter): 19 uses
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 8 uses
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 8 uses
Test (org.junit.Test): 6 uses
CharArraySet (org.apache.lucene.analysis.CharArraySet): 5 uses
TokenStream (org.apache.lucene.analysis.TokenStream): 4 uses
Tokenizer (org.apache.lucene.analysis.Tokenizer): 3 uses
LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter): 3 uses
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 3 uses
StopFilter (org.apache.lucene.analysis.core.StopFilter): 2 uses
EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter): 2 uses
PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter): 2 uses
ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter): 2 uses
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 2 uses
Analyzer (org.apache.lucene.analysis.Analyzer): 1 use
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 1 use
StopFilter (org.apache.lucene.analysis.StopFilter): 1 use
EdgeNGramTokenizer (org.apache.lucene.analysis.ngram.EdgeNGramTokenizer): 1 use
WikipediaTokenizer (org.apache.lucene.analysis.wikipedia.WikipediaTokenizer): 1 use