Search in sources :

Example 11 with WordDelimiterFilter

use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project cogcomp-nlp by CogComp.

From the class MinimalAnalyzer, the method createComponents.

/**
 * Builds the analysis chain for this analyzer: standard tokenization followed by
 * ASCII folding, lower-casing, possessive stripping, stop-word removal, word
 * delimiting, and Porter stemming, in that order.
 *
 * @param fieldName the field being analyzed (unused by this chain)
 * @return the tokenizer plus filter chain wrapped as TokenStreamComponents
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    TokenStream result = new StandardFilter(source);
    result = new ASCIIFoldingFilter(result); // fold accented chars to their ASCII equivalents
    result = new LowerCaseFilter(result);
    result = new EnglishPossessiveFilter(result); // strip trailing 's
    result = new StopFilter(result, stopwords); // 'stopwords' is a field of the enclosing analyzer
    // NOTE(review): WordDelimiterFilter.ALPHA is a word-*type* constant
    // (LOWER|UPPER == 3), not a configuration flag; numerically it equals
    // GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS. Confirm this value is the
    // intended flag set rather than a mistaken constant.
    result = new WordDelimiterFilter(result, WordDelimiterFilter.ALPHA, null);
    result = new PorterStemFilter(result);
    return new TokenStreamComponents(source, result);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) EnglishPossessiveFilter(org.apache.lucene.analysis.en.EnglishPossessiveFilter) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) StopFilter(org.apache.lucene.analysis.core.StopFilter) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) PorterStemFilter(org.apache.lucene.analysis.en.PorterStemFilter) ASCIIFoldingFilter(org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter) Tokenizer(org.apache.lucene.analysis.Tokenizer) StandardTokenizer(org.apache.lucene.analysis.standard.StandardTokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)

Example 12 with WordDelimiterFilter

use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.

From the class TestBugInSomething, the method testCuriousWikipediaString.

/**
 * Regression test: runs an analysis-consistency check over a Wikipedia-style
 * string (including an astral-plane character) through a WordDelimiterFilter
 * configured with a custom char-type table and a negative flags value.
 */
public void testCuriousWikipediaString() throws Exception {
    // Words that WordDelimiterFilter must pass through unsplit.
    final CharArraySet protWords = new CharArraySet(new HashSet<>(Arrays.asList("rrdpafa", "pupmmlu", "xlq", "dyy", "zqrxrrck", "o", "hsrlfvcha")), false);
    // Custom character-type table for WordDelimiterFilter; values are raw
    // (signed) bytes, presumably chosen to reproduce the original bug — TODO confirm.
    final byte[] table = new byte[] { -57, 26, 1, 48, 63, -23, 55, -84, 18, 120, -97, 103, 58, 13, 84, 89, 57, -13, -63, 5, 28, 97, -54, -94, 102, -108, -5, 5, 46, 40, 43, 78, 43, -72, 36, 29, 124, -106, -22, -51, 65, 5, 31, -42, 6, -99, 97, 14, 81, -128, 74, 100, 54, -55, -25, 53, -71, -98, 44, 33, 86, 106, -42, 47, 115, -89, -18, -26, 22, -95, -43, 83, -125, 105, -104, -24, 106, -16, 126, 115, -105, 97, 65, -33, 57, 44, -1, 123, -68, 100, 13, -41, -64, -119, 0, 92, 94, -36, 53, -9, -102, -18, 90, 94, -26, 31, 71, -20 };
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new WikipediaTokenizer();
            // SopTokenFilter wraps the stream both before and after the
            // delimiter filter (a debugging/pass-through filter in this test suite).
            TokenStream stream = new SopTokenFilter(tokenizer);
            // Note: -50 is passed as the flags argument (a negative bit set).
            stream = new WordDelimiterFilter(stream, table, -50, protWords);
            stream = new SopTokenFilter(stream);
            return new TokenStreamComponents(tokenizer, stream);
        }
    };
    checkAnalysisConsistency(random(), a, false, "B⣃[ 𐏂 </p> jb");
    a.close();
}
Also used : WikipediaTokenizer(org.apache.lucene.analysis.wikipedia.WikipediaTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) Analyzer(org.apache.lucene.analysis.Analyzer) WikipediaTokenizer(org.apache.lucene.analysis.wikipedia.WikipediaTokenizer) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) EdgeNGramTokenizer(org.apache.lucene.analysis.ngram.EdgeNGramTokenizer)

Example 13 with WordDelimiterFilter

use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.

From the class TestWordDelimiterFilter, the method testOffsetChange3.

@Test
public void testOffsetChange3() throws Exception {
    // Flag set shared by the offset-change tests in this class.
    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    final Token input = new Token("(übelkeit", 7, 16);
    final WordDelimiterFilter filter = new WordDelimiterFilter(new CannedTokenStream(input), DEFAULT_WORD_DELIM_TABLE, flags, null);
    // The leading '(' is a delimiter, so the surviving token's start offset
    // moves from 7 to 8 while the end offset stays at 16.
    assertTokenStreamContents(filter, new String[] { "übelkeit" }, new int[] { 8 }, new int[] { 16 });
}
Also used : WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) Test(org.junit.Test)

Example 14 with WordDelimiterFilter

use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.

From the class TestWordDelimiterFilter, the method testRandomStrings.

/** blast some random strings through the analyzer */
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
    final int numIterations = atLeast(5);
    for (int iter = 0; iter < numIterations; iter++) {
        // Pick a random flag combination (9 flag bits => 512 combinations),
        // then randomly decide whether to protect a few words from splitting.
        final int flags = random().nextInt(512);
        final CharArraySet protectedWords = random().nextBoolean()
                ? new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false)
                : null;
        Analyzer analyzer = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tok = new MockTokenizer(MockTokenizer.WHITESPACE, false);
                return new TokenStreamComponents(tok, new WordDelimiterFilter(tok, flags, protectedWords));
            }
        };
        // TODO: properly support positionLengthAttribute
        checkRandomData(random(), analyzer, 200 * RANDOM_MULTIPLIER, 20, false, false);
        analyzer.close();
    }
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 15 with WordDelimiterFilter

use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.

From the class TestWordDelimiterFilter, the method testOffsetChange.

@Test
public void testOffsetChange() throws Exception {
    // Flag set shared by the offset-change tests in this class.
    final int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
    final Token input = new Token("übelkeit)", 7, 16);
    final WordDelimiterFilter filter = new WordDelimiterFilter(new CannedTokenStream(input), DEFAULT_WORD_DELIM_TABLE, flags, null);
    // The trailing ')' is a delimiter, so the surviving token's end offset
    // moves from 16 to 15 while the start offset stays at 7.
    assertTokenStreamContents(filter, new String[] { "übelkeit" }, new int[] { 7 }, new int[] { 15 });
}
Also used : WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) Test(org.junit.Test)

Aggregations

WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter)19 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)8 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)8 Test (org.junit.Test)6 CharArraySet (org.apache.lucene.analysis.CharArraySet)5 TokenStream (org.apache.lucene.analysis.TokenStream)4 Tokenizer (org.apache.lucene.analysis.Tokenizer)3 LowerCaseFilter (org.apache.lucene.analysis.core.LowerCaseFilter)3 StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)3 StopFilter (org.apache.lucene.analysis.core.StopFilter)2 EnglishPossessiveFilter (org.apache.lucene.analysis.en.EnglishPossessiveFilter)2 PorterStemFilter (org.apache.lucene.analysis.en.PorterStemFilter)2 ASCIIFoldingFilter (org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter)2 StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)2 Analyzer (org.apache.lucene.analysis.Analyzer)1 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)1 StopFilter (org.apache.lucene.analysis.StopFilter)1 EdgeNGramTokenizer (org.apache.lucene.analysis.ngram.EdgeNGramTokenizer)1 WikipediaTokenizer (org.apache.lucene.analysis.wikipedia.WikipediaTokenizer)1