Example 6 with WordDelimiterFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.

From the class TestWordDelimiterFilter, method testOffsetChange4.

@Test
public void testOffsetChange4() throws Exception {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    // The incoming token "(foo,bar)" spans offsets 7-16, so subword offsets are shifted
    // by the token's start: "foo" -> 8-11, "bar" -> 12-15, and the CATENATE_ALL token
    // "foobar" spans 8-15.
    WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
    assertTokenStreamContents(wdf, new String[] { "foo", "foobar", "bar" }, new int[] { 8, 8, 12 }, new int[] { 11, 15, 15 });
}
Also used: WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter), Test (org.junit.Test)
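
For context, assertTokenStreamContents drains the stream and compares terms, start offsets, and end offsets in order. A minimal standalone sketch (not from the test suite; it assumes the same static imports as the test, i.e. the flag constants and DEFAULT_WORD_DELIM_TABLE, plus CharTermAttribute and OffsetAttribute from org.apache.lucene.analysis.tokenattributes) that consumes the same filter through the attribute API:

// Sketch: consume the filter directly and print each term with its offsets,
// mirroring what assertTokenStreamContents verifies above.
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
TokenStream ts = new WordDelimiterFilter(new CannedTokenStream(new Token("(foo,bar)", 7, 16)), DEFAULT_WORD_DELIM_TABLE, flags, null);
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
ts.reset();
while (ts.incrementToken()) {
    // Expected per the assertion above: foo [8,11], foobar [8,15], bar [12,15]
    System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
}
ts.end();
ts.close();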

Example 7 with WordDelimiterFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.

From the class TestWordDelimiterFilter, method testLotsOfConcatenating.

/** concat numbers + words + all */
public void testLotsOfConcatenating() throws Exception {
    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_WORDS | CATENATE_NUMBERS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    /* analyzer that uses whitespace + wdf */
    Analyzer a = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
        }
    };
    // Arguments after the input string: expected terms, start offsets, end offsets,
    // types (null), position increments, position lengths (null), offsetsAreCorrect=false.
    assertAnalyzesTo(a, "abc-def-123-456", new String[] { "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" }, new int[] { 0, 0, 0, 4, 8, 8, 12 }, new int[] { 3, 7, 15, 7, 11, 15, 15 }, null, new int[] { 1, 0, 0, 1, 1, 0, 1 }, null, false);
    a.close();
}
Also used: WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter), StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)
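
The int array after the null types argument holds the expected position increments: an increment of 0 stacks a token on the previous token's position, so the catenated forms "abcdef" and "abcdef123456" share a position with "abc", and "123456" shares one with "123". A short sketch (not part of the test; it reuses the analyzer a before a.close() and assumes PositionIncrementAttribute from org.apache.lucene.analysis.tokenattributes) that prints the increments directly:

// Sketch: print each token with its position increment; +0 means the token
// is stacked on the same position as the token before it.
try (TokenStream ts = a.tokenStream("field", "abc-def-123-456")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        // Expected: abc +1, abcdef +0, abcdef123456 +0, def +1, 123 +1, 123456 +0, 456 +1
        System.out.println(term + " +" + posInc.getPositionIncrement());
    }
    ts.end();
}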

Example 8 with WordDelimiterFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.

From the class TestWordDelimiterFilter, method testOnlyNumbers.

/*
  public void testToDot() throws Exception {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE | PRESERVE_ORIGINAL | CATENATE_WORDS | CATENATE_NUMBERS;
    String text = "PowerSystem2000-5-Shot's";
    WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token(text, 0, text.length())), DEFAULT_WORD_DELIM_TABLE, flags, null);
    //StringWriter sw = new StringWriter();
    // TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, new PrintWriter(sw));
    PrintWriter pw = new PrintWriter("/x/tmp/before.dot");
    TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, pw);
    toDot.toDot();
    pw.close();
    System.out.println("TEST DONE");
    //System.out.println("DOT:\n" + sw.toString());
  }
  */
public void testOnlyNumbers() throws Exception {
    int flags = GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
        }
    };
    // Neither GENERATE_NUMBER_PARTS nor any CATENATE_* flag is set, so the purely
    // numeric subwords of "7-586" are computed but never emitted; the stream is empty.
    assertAnalyzesTo(a, "7-586", new String[] {}, new int[] {}, new int[] {}, null, new int[] {}, null, false);
    a.close();
}
Also used: WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter), StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer), KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)
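
The empty expected arrays are the point of this test: the flags enable GENERATE_WORD_PARTS but neither GENERATE_NUMBER_PARTS nor any CATENATE_* option, and "7-586" breaks into purely numeric subwords, so nothing is emitted at all. As a hedged counterpart (an assumption, not part of the test suite; the name withNumbers is made up), adding GENERATE_NUMBER_PARTS should let the numeric parts through:

// Sketch: the same analyzer shape, but with GENERATE_NUMBER_PARTS added the
// numeric subwords of "7-586" are expected to survive.
final int numFlags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
Analyzer withNumbers = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, numFlags, null));
    }
};
assertAnalyzesTo(withNumbers, "7-586", new String[] { "7", "586" });
withNumbers.close();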

Example 9 with WordDelimiterFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.

From the class TestWordDelimiterFilter, method testOffsetChange2.

@Test
public void testOffsetChange2() throws Exception {
    int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token("(übelkeit", 7, 17)), DEFAULT_WORD_DELIM_TABLE, flags, null);
    assertTokenStreamContents(wdf, new String[] { "übelkeit" }, new int[] { 8 }, new int[] { 17 });
}
Also used: WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter), Test (org.junit.Test)
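
Note the offset arithmetic here: "(übelkeit" is only nine characters long, yet the incoming token claims offsets 7-17, a mismatch that can arise when an upstream char filter has changed the text length. The assertion shows the filter computing the subword's start offset from its character position (7 + 1 = 8), while a subword that reaches the end of the token appears to inherit the original end offset (17) rather than having it recomputed from the term length.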

Example 10 with WordDelimiterFilter

Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project cogcomp-nlp by CogComp.

From the class ASCIIEnglishAnalyzer, method createComponents.

@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    // Chain: fold diacritics to ASCII, strip English possessives, split on
    // intra-word boundaries, lowercase, drop English stopwords, then Porter-stem.
    result = new ASCIIFoldingFilter(result);
    result = new EnglishPossessiveFilter(result);
    result = new WordDelimiterFilter(result, WordDelimiterFilter.ALPHA, null);
    result = new LowerCaseFilter(result);
    result = new StopFilter(result, EnglishAnalyzer.getDefaultStopSet());
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter), WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter), StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer), StopFilter (org.apache.lucene.analysis.core.StopFilter), StandardFilter (org.apache.lucene.analysis.standard.StandardFilter), PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter), ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter), Tokenizer (org.apache.lucene.analysis.Tokenizer), LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)
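
One detail worth flagging: WordDelimiterFilter.ALPHA is a character-type constant (LOWER | UPPER), not a configuration flag; its numeric value happens to equal GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, which may well be the behavior intended here. A hypothetical usage sketch (not part of the cogcomp-nlp project; it assumes ASCIIEnglishAnalyzer has a default constructor and uses CharTermAttribute from org.apache.lucene.analysis.tokenattributes):

// Sketch: run the analyzer over a sample string and print the folded,
// lowercased, stemmed terms it produces.
Analyzer analyzer = new ASCIIEnglishAnalyzer();
try (TokenStream ts = analyzer.tokenStream("text", "The Naïve Indexer's FooBar setup")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term);
    }
    ts.end();
}
analyzer.close();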

Aggregations

WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter): 19 uses
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 8 uses
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 8 uses
Test (org.junit.Test): 6 uses
CharArraySet (org.apache.lucene.analysis.CharArraySet): 5 uses
TokenStream (org.apache.lucene.analysis.TokenStream): 4 uses
Tokenizer (org.apache.lucene.analysis.Tokenizer): 3 uses
LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter): 3 uses
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 3 uses
StopFilter (org.apache.lucene.analysis.core.StopFilter): 2 uses
EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter): 2 uses
PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter): 2 uses
ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter): 2 uses
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 2 uses
Analyzer (org.apache.lucene.analysis.Analyzer): 1 use
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 1 use
StopFilter (org.apache.lucene.analysis.StopFilter): 1 use
EdgeNGramTokenizer (org.apache.lucene.analysis.ngram.EdgeNGramTokenizer): 1 use
WikipediaTokenizer (org.apache.lucene.analysis.wikipedia.WikipediaTokenizer): 1 use