Search in sources :

Example 31 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class AbstractAnalysisFactory method getWordSet.

/**
   * Builds a single {@link CharArraySet} from the lines of one or more word files.
   *
   * @param loader     handed to {@code getLines} to read each file
   * @param wordFiles  file names, split by {@code splitFileNames}; may be a
   *                   comma-separated list of filenames
   * @param ignoreCase case flag passed both to the resulting set and to
   *                   {@code StopFilter.makeStopSet}
   * @return the merged set, or {@code null} when {@code wordFiles} yields no file names
   * @throws IOException presumably raised by {@code getLines} when a file cannot be read
   */
protected final CharArraySet getWordSet(ResourceLoader loader, String wordFiles, boolean ignoreCase) throws IOException {
    final List<String> fileNames = splitFileNames(wordFiles);
    if (fileNames.isEmpty()) {
        // No files named: the result is null, not an empty set.
        return null;
    }
    // The default stopwords list has 35 or so words, but start smaller than
    // that and scale the initial capacity with the number of files.
    final int initialCapacity = fileNames.size() * 10;
    final CharArraySet merged = new CharArraySet(initialCapacity, ignoreCase);
    for (String fileName : fileNames) {
        final String trimmedName = fileName.trim();
        final List<String> lines = getLines(loader, trimmedName);
        merged.addAll(StopFilter.makeStopSet(lines, ignoreCase));
    }
    return merged;
}
Also used: CharArraySet (org.apache.lucene.analysis.CharArraySet)

Example 32 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestCatalanAnalyzer method testExclude.

/**
 * Tests the exclusion set: a term placed in the set must come back unchanged,
 * while a term outside it is still reduced ("llengua" becomes "llengu").
 */
public void testExclude() throws IOException {
    final String excludedTerm = "llengües";
    CharArraySet keepAsIs = new CharArraySet(asSet(excludedTerm), false);
    Analyzer catalan = new CatalanAnalyzer(CatalanAnalyzer.getDefaultStopSet(), keepAsIs);

    // Term listed in the exclusion set: expected output equals the input.
    checkOneTerm(catalan, excludedTerm, excludedTerm);
    // Term not listed: expected output is the shortened form.
    checkOneTerm(catalan, "llengua", "llengu");

    catalan.close();
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) Analyzer(org.apache.lucene.analysis.Analyzer)

Example 33 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestArabicAnalyzer method testWithStemExclusionSet.

/**
 * Analyzes the same text first with a stem-exclusion set holding the last word and
 * then with an empty exclusion set, checking the expected tokens in each case.
 */
public void testWithStemExclusionSet() throws IOException {
    final String text = "كبيرة the quick ساهدهات";

    CharArraySet excluded = new CharArraySet(asSet("ساهدهات"), false);
    ArabicAnalyzer analyzer = new ArabicAnalyzer(CharArraySet.EMPTY_SET, excluded);
    // Asserted twice on the same analyzer instance -- presumably to cover reuse; TODO confirm.
    for (int pass = 0; pass < 2; pass++) {
        assertAnalyzesTo(analyzer, text, new String[] { "كبير", "the", "quick", "ساهدهات" });
    }
    analyzer.close();

    // Same input with nothing excluded: the last token is expected in its shortened form.
    analyzer = new ArabicAnalyzer(CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);
    for (int pass = 0; pass < 2; pass++) {
        assertAnalyzesTo(analyzer, text, new String[] { "كبير", "the", "quick", "ساهد" });
    }
    analyzer.close();
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet)

Example 34 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestArabicAnalyzer method testCustomStopwords.

/**
   * Checks that a caller-supplied stopword set is honoured and that matching is
   * not case-sensitive: the input starts with "The" while the set holds "the".
   */
public void testCustomStopwords() throws Exception {
    CharArraySet stopWords = new CharArraySet(asSet("the", "and", "a"), false);
    ArabicAnalyzer analyzer = new ArabicAnalyzer(stopWords);

    // Only the non-stopword tokens are expected to remain.
    String[] expectedTokens = { "quick", "brown", "fox" };
    assertAnalyzesTo(analyzer, "The quick brown fox.", expectedTokens);

    analyzer.close();
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet)

Example 35 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestKeepFilterFactory method testInform.

/**
 * Creates the "KeepWord" token filter factory from one word file and then from two,
 * asserting that the loaded word set is non-null and holds 2 and 4 entries respectively.
 */
public void testInform() throws Exception {
    ResourceLoader resources = new ClasspathResourceLoader(getClass());
    assertTrue("loader is null and it shouldn't be", null != resources);

    // Single word file.
    KeepWordFilterFactory keepFactory = (KeepWordFilterFactory) tokenFilterFactory("KeepWord", "words", "keep-1.txt", "ignoreCase", "true");
    CharArraySet loaded = keepFactory.getWords();
    assertTrue("words is null and it shouldn't be", null != loaded);
    assertTrue("words Size: " + loaded.size() + " is not: " + 2, 2 == loaded.size());

    // Two word files given as one comma-separated argument.
    keepFactory = (KeepWordFilterFactory) tokenFilterFactory("KeepWord", "words", "keep-1.txt, keep-2.txt", "ignoreCase", "true");
    loaded = keepFactory.getWords();
    assertTrue("words is null and it shouldn't be", null != loaded);
    assertTrue("words Size: " + loaded.size() + " is not: " + 4, 4 == loaded.size());
}
Also used : ClasspathResourceLoader(org.apache.lucene.analysis.util.ClasspathResourceLoader) ResourceLoader(org.apache.lucene.analysis.util.ResourceLoader) CharArraySet(org.apache.lucene.analysis.CharArraySet)

Aggregations

CharArraySet (org.apache.lucene.analysis.CharArraySet)137 Analyzer (org.apache.lucene.analysis.Analyzer)54 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)46 Tokenizer (org.apache.lucene.analysis.Tokenizer)43 TokenStream (org.apache.lucene.analysis.TokenStream)37 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)34 SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter)26 StringReader (java.io.StringReader)23 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)9 StopFilter (org.apache.lucene.analysis.StopFilter)7 TokenFilter (org.apache.lucene.analysis.TokenFilter)6 WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter)5 WordDelimiterGraphFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter)5 ClasspathResourceLoader (org.apache.lucene.analysis.util.ClasspathResourceLoader)5 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)4 HyphenationTree (org.apache.lucene.analysis.compound.hyphenation.HyphenationTree)4 ResourceLoader (org.apache.lucene.analysis.util.ResourceLoader)4 InputSource (org.xml.sax.InputSource)4 Reader (java.io.Reader)3 ArrayList (java.util.ArrayList)3