Search in sources:

Example 71 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestCompoundWordTokenFilter method testRandomStrings.

/**
 * Blasts random strings through two compound-word analyzers: one built on
 * {@code DictionaryCompoundWordTokenFilter}, then one built on
 * {@code HyphenationCompoundWordTokenFilter}.
 *
 * <p>Each analyzer lives in a try-with-resources block so that it is closed even when
 * {@code checkRandomData} throws; previously a failure skipped the explicit
 * {@code close()} call and leaked the analyzer.
 *
 * @throws Exception if loading the hyphenation grammar or the random-data check fails
 */
public void testRandomStrings() throws Exception {
    final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
    try (Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(tokenizer, dict));
        }
    }) {
        checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
    }

    // The hyphenation grammar is resolved relative to this test class.
    InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
    final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
    try (Analyzer b = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            TokenFilter filter = new HyphenationCompoundWordTokenFilter(tokenizer, hyphenator);
            return new TokenStreamComponents(tokenizer, filter);
        }
    }) {
        checkRandomData(random(), b, 1000 * RANDOM_MULTIPLIER);
    }
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) InputSource(org.xml.sax.InputSource) HyphenationTree(org.apache.lucene.analysis.compound.hyphenation.HyphenationTree) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) TokenFilter(org.apache.lucene.analysis.TokenFilter)

Example 72 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestSoraniAnalyzer method testWithStemExclusionSet.

/**
 * Checks that a term added to the set passed as {@code SoraniAnalyzer}'s second
 * constructor argument is emitted unchanged by the analyzer.
 *
 * <p>The analyzer is opened in a try-with-resources block so that it is closed even
 * when the assertion fails.
 *
 * @throws IOException if analysis of the input fails
 */
public void testWithStemExclusionSet() throws IOException {
    CharArraySet set = new CharArraySet(1, true);
    set.add("پیاوە");
    try (Analyzer a = new SoraniAnalyzer(CharArraySet.EMPTY_SET, set)) {
        assertAnalyzesTo(a, "پیاوە", new String[] { "پیاوە" });
    }
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) Analyzer(org.apache.lucene.analysis.Analyzer)

Example 73 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestNorwegianLightStemFilter method testKeyword.

/**
 * Checks that a term listed in the exclusion set handed to {@code SetKeywordMarkerFilter}
 * passes through {@code NorwegianLightStemFilter} unchanged.
 *
 * <p>The analyzer is opened in a try-with-resources block so that it is closed even
 * when {@code checkOneTerm} fails.
 *
 * @throws IOException if analysis of the input fails
 */
public void testKeyword() throws IOException {
    final CharArraySet exclusionSet = new CharArraySet(asSet("sekretæren"), false);
    try (Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            // Mark excluded terms before they reach the stemmer.
            TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
            return new TokenStreamComponents(source, new NorwegianLightStemFilter(sink));
        }
    }) {
        checkOneTerm(a, "sekretæren", "sekretæren");
    }
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) SetKeywordMarkerFilter(org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 74 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project jena by apache.

the class StandardAnalyzerAssembler method analyzerWithStopWords.

/**
 * Builds a {@code StandardAnalyzer} from the value of the {@code TextVocab.pStopWords}
 * property on {@code root}.
 *
 * @param root resource from which the {@code TextVocab.pStopWords} property is read
 * @return a {@code StandardAnalyzer} constructed with the converted stop-word set
 * @throws TextIndexException if the property's value is not a resource
 */
private Analyzer analyzerWithStopWords(Resource root) {
    final RDFNode stopWordsNode = root.getProperty(TextVocab.pStopWords).getObject();
    if (stopWordsNode.isResource()) {
        return new StandardAnalyzer(toCharArraySet((Resource) stopWordsNode));
    }
    // Anything other than a resource cannot be converted to a stop-word set.
    throw new TextIndexException("text:stopWords property takes a list as a value : " + stopWordsNode);
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) TextIndexException(org.apache.jena.query.text.TextIndexException) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) RDFNode(org.apache.jena.rdf.model.RDFNode)

Example 75 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestGermanAnalyzer method testWithKeywordAttribute.

/**
 * Feeds "Fischen Trinken" through a lower-casing tokenizer, a keyword marker seeded with
 * "fischen", and {@code GermanStemFilter}, and expects the terms "fischen" and "trink".
 *
 * @throws IOException if consuming the token stream fails
 */
public void testWithKeywordAttribute() throws IOException {
    final CharArraySet keywords = new CharArraySet(1, true);
    keywords.add("fischen");

    final LowerCaseTokenizer tokenizer = new LowerCaseTokenizer();
    tokenizer.setReader(new StringReader("Fischen Trinken"));

    final SetKeywordMarkerFilter marked = new SetKeywordMarkerFilter(tokenizer, keywords);
    final String[] expectedTerms = { "fischen", "trink" };
    assertTokenStreamContents(new GermanStemFilter(marked), expectedTerms);
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) LowerCaseTokenizer(org.apache.lucene.analysis.core.LowerCaseTokenizer) SetKeywordMarkerFilter(org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter) StringReader(java.io.StringReader)

Aggregations

CharArraySet (org.apache.lucene.analysis.CharArraySet)137 Analyzer (org.apache.lucene.analysis.Analyzer)54 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)46 Tokenizer (org.apache.lucene.analysis.Tokenizer)43 TokenStream (org.apache.lucene.analysis.TokenStream)37 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)34 SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter)26 StringReader (java.io.StringReader)23 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)9 StopFilter (org.apache.lucene.analysis.StopFilter)7 TokenFilter (org.apache.lucene.analysis.TokenFilter)6 WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter)5 WordDelimiterGraphFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter)5 ClasspathResourceLoader (org.apache.lucene.analysis.util.ClasspathResourceLoader)5 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)4 HyphenationTree (org.apache.lucene.analysis.compound.hyphenation.HyphenationTree)4 ResourceLoader (org.apache.lucene.analysis.util.ResourceLoader)4 InputSource (org.xml.sax.InputSource)4 Reader (java.io.Reader)3 ArrayList (java.util.ArrayList)3