Search in sources :

Example 1 with WordDelimiterFilter

use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.

In class TestWordDelimiterFilter, method testLotsOfConcatenating2.

/**
 * Concat numbers + words + all + preserve original: with every CATENATE_* flag plus
 * PRESERVE_ORIGINAL enabled, the filter must emit the untouched original token, each
 * individual subword, each catenated run, and the fully catenated form — all with
 * offsets that map back into the original text and with position increment 0 for
 * tokens stacked on an earlier one.
 */
public void testLotsOfConcatenating2() throws Exception {
    final int flags = PRESERVE_ORIGINAL | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    /* analyzer that uses whitespace + wdf */
    Analyzer a = new Analyzer() {

        // protected (not public) to match Analyzer's declaration and the sibling tests in this class.
        @Override
        protected TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
        }
    };
    // "abc-def-123-456" → original, word parts, catenated words, catenate-all,
    // number parts, catenated numbers; posIncr 0 marks tokens at the same position.
    assertAnalyzesTo(a, "abc-def-123-456", new String[] { "abc-def-123-456", "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" }, new int[] { 0, 0, 0, 0, 4, 8, 8, 12 }, new int[] { 15, 3, 7, 15, 7, 11, 15, 15 }, null, new int[] { 1, 0, 0, 0, 1, 1, 0, 1 }, null, false);
    a.close();
}
Also used : WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 2 with WordDelimiterFilter

use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.

In class TestWordDelimiterFilter, method testOffsets.

/*
  public void testPerformance() throws IOException {
    String s = "now is the time-for all good men to come to-the aid of their country.";
    Token tok = new Token();
    long start = System.currentTimeMillis();
    int ret=0;
    for (int i=0; i<1000000; i++) {
      StringReader r = new StringReader(s);
      TokenStream ts = new WhitespaceTokenizer(r);
      ts = new WordDelimiterFilter(ts, 1,1,1,1,0);

      while (ts.next(tok) != null) ret++;
    }

    System.out.println("ret="+ret+" time="+(System.currentTimeMillis()-start));
  }
  ***/
@Test
public void testOffsets() throws IOException {
    // Subwords and the catenated form must carry offsets derived from the original token.
    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    // "foo-bar" spanning offsets [5,12): each part is mapped onto its slice of the original.
    WordDelimiterFilter filter = new WordDelimiterFilter(new CannedTokenStream(new Token("foo-bar", 5, 12)), DEFAULT_WORD_DELIM_TABLE, flags, null);
    assertTokenStreamContents(filter, new String[] { "foo", "foobar", "bar" }, new int[] { 5, 5, 9 }, new int[] { 8, 12, 12 });
    // When the token's offset span [5,6) is narrower than its text, every emitted
    // token falls back to the original token's own offsets.
    filter = new WordDelimiterFilter(new CannedTokenStream(new Token("foo-bar", 5, 6)), DEFAULT_WORD_DELIM_TABLE, flags, null);
    assertTokenStreamContents(filter, new String[] { "foo", "bar", "foobar" }, new int[] { 5, 5, 5 }, new int[] { 6, 6, 6 });
}
Also used : WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) Test(org.junit.Test)

Example 3 with WordDelimiterFilter

use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.

In class TestWordDelimiterFilter, method testEmptyTerm.

public void testEmptyTerm() throws IOException {
    // Sweep all 512 (= 2^9) flag combinations: whatever options are active, and whether
    // or not a protected-word set is supplied, analyzing the empty string must stay
    // consistent and never throw.
    final Random random = random();
    for (int combo = 0; combo < 512; combo++) {
        final int flags = combo;
        // Randomly exercise both the protected-words path and the null path.
        final CharArraySet protectedWords = random.nextBoolean()
                ? new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false)
                : null;
        Analyzer analyzer = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new KeywordTokenizer();
                return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
            }
        };
        // depending upon options, this thing may or may not preserve the empty term
        checkAnalysisConsistency(random, analyzer, random.nextBoolean(), "");
        analyzer.close();
    }
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 4 with WordDelimiterFilter

use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.

In class TestWordDelimiterFilter, method testNumberPunct.

/**
 * A number followed by trailing punctuation ("6-") must yield only the number part,
 * with offsets covering just the digit and the delimiter dropped.
 */
public void testNumberPunct() throws Exception {
    int flags = GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
        }
    };
    assertAnalyzesTo(a, "6-", new String[] { "6" }, new int[] { 0 }, new int[] { 1 }, null, new int[] { 1 }, null, false);
    // Analyzer is Closeable; release it like every sibling test in this class does
    // (previously leaked — the analyzer was never closed).
    a.close();
}
Also used : WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 5 with WordDelimiterFilter

use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.

In class TestWordDelimiterFilter, method testOffsetChange4.

@Test
public void testOffsetChange4() throws Exception {
    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    // "(foo,bar)" with offsets [7,16): the surrounding punctuation is stripped, so
    // "foo" starts at 8, "bar" at 12, and the catenated "foobar" spans 8..15.
    WordDelimiterFilter filter = new WordDelimiterFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
    assertTokenStreamContents(filter, new String[] { "foo", "foobar", "bar" }, new int[] { 8, 8, 12 }, new int[] { 11, 15, 15 });
}
Also used : WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) Test(org.junit.Test)

Aggregations

WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter)20 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)9 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)9 CharArraySet (org.apache.lucene.analysis.CharArraySet)6 Test (org.junit.Test)6 TokenStream (org.apache.lucene.analysis.TokenStream)5 Tokenizer (org.apache.lucene.analysis.Tokenizer)3 LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)3 StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)3 Analyzer (org.apache.lucene.analysis.Analyzer)2 StopFilter (org.apache.lucene.analysis.StopFilter)2 StopFilter (org.apache.lucene.analysis.core.StopFilter)2 EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter)2 PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter)2 ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter)2 EdgeNGramTokenizer (org.apache.lucene.analysis.ngram.EdgeNGramTokenizer)2 StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)2 ArrayList (java.util.ArrayList)1 Collection (java.util.Collection)1 Collections (java.util.Collections)1