Search in sources :

Example 6 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestKeepFilterFactory method testInform.

/**
 * Verifies the word set exposed by a KeepWord factory: 2 entries when configured with
 * "keep-1.txt", and 4 entries when configured with "keep-1.txt, keep-2.txt".
 */
public void testInform() throws Exception {
    ResourceLoader loader = new ClasspathResourceLoader(getClass());
    assertTrue("loader is null and it shouldn't be", loader != null);

    // Scenario 1: a single word file.
    KeepWordFilterFactory singleFileFactory = (KeepWordFilterFactory) tokenFilterFactory("KeepWord", "words", "keep-1.txt", "ignoreCase", "true");
    CharArraySet singleFileWords = singleFileFactory.getWords();
    assertTrue("words is null and it shouldn't be", singleFileWords != null);
    int singleFileCount = singleFileWords.size();
    assertTrue("words Size: " + singleFileCount + " is not: " + 2, singleFileCount == 2);

    // Scenario 2: two word files given as one comma-separated argument.
    KeepWordFilterFactory twoFileFactory = (KeepWordFilterFactory) tokenFilterFactory("KeepWord", "words", "keep-1.txt, keep-2.txt", "ignoreCase", "true");
    CharArraySet twoFileWords = twoFileFactory.getWords();
    assertTrue("words is null and it shouldn't be", twoFileWords != null);
    int twoFileCount = twoFileWords.size();
    assertTrue("words Size: " + twoFileCount + " is not: " + 4, twoFileCount == 4);
}
Also used : ClasspathResourceLoader(org.apache.lucene.analysis.util.ClasspathResourceLoader) ResourceLoader(org.apache.lucene.analysis.util.ResourceLoader) CharArraySet(org.apache.lucene.analysis.CharArraySet)

Example 7 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestKeepWordFilter method testStopAndGo.

/**
 * Runs the same whitespace-tokenized input through KeepWordFilter twice, with the keep
 * set {"aaa", "bbb"} built first with the boolean flag {@code true} and then {@code false}.
 * With {@code true} both "aaa" and "BBB" are expected to survive; with {@code false} only
 * "aaa" is expected.
 */
public void testStopAndGo() throws Exception {
    Set<String> keepWords = new HashSet<>();
    keepWords.add("aaa");
    keepWords.add("bbb");
    String text = "xxx yyy aaa zzz BBB ccc ddd EEE";

    // Flag true: "BBB" is kept even though the set holds the lower-case "bbb".
    // NOTE(review): the int[] is presumably the expected position increments - confirm
    // against assertTokenStreamContents.
    TokenStream lenientStream = new KeepWordFilter(whitespaceMockTokenizer(text), new CharArraySet(keepWords, true));
    assertTokenStreamContents(lenientStream, new String[] { "aaa", "BBB" }, new int[] { 3, 2 });

    // Flag false: only the exact-case match "aaa" is kept.
    TokenStream strictStream = new KeepWordFilter(whitespaceMockTokenizer(text), new CharArraySet(keepWords, false));
    assertTokenStreamContents(strictStream, new String[] { "aaa" }, new int[] { 3 });
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) HashSet(java.util.HashSet)

Example 8 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestKeepWordFilter method testRandomStrings.

/**
 * Blasts random strings through an analyzer that chains a whitespace MockTokenizer into a
 * KeepWordFilter whose keep set is {"a", "b"}.
 *
 * <p>The analyzer is managed by try-with-resources so it is closed even when
 * {@code checkRandomData} throws.
 */
public void testRandomStrings() throws Exception {
    final Set<String> words = new HashSet<>();
    words.add("a");
    words.add("b");
    try (Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            TokenStream stream = new KeepWordFilter(tokenizer, new CharArraySet(words, true));
            return new TokenStreamComponents(tokenizer, stream);
        }
    }) {
        checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
    }
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) HashSet(java.util.HashSet)

Example 9 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestCapitalizationFilter method testCapitalization.

/**
 * Drives the assertCapitalizesTo / assertCapitalizesToKeyword helpers over a series of
 * inputs, all sharing one keep set holding "and", "the", "it" and "BIG".
 */
public void testCapitalization() throws Exception {
    CharArraySet keepWords = new CharArraySet(Arrays.asList("and", "the", "it", "BIG"), false);
    String sentence = "Hello thEre my Name is Ryan";

    // Single-word inputs.
    assertCapitalizesTo("kiTTEN", new String[] { "Kitten" }, true, keepWords, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
    assertCapitalizesTo("and", new String[] { "And" }, true, keepWords, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
    assertCapitalizesTo("AnD", new String[] { "And" }, true, keepWords, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
    // The first letter is not forced here, but "AnD" is not a keep word either.
    assertCapitalizesTo("AnD", new String[] { "And" }, true, keepWords, false, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
    assertCapitalizesTo("big", new String[] { "Big" }, true, keepWords, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
    assertCapitalizesTo("BIG", new String[] { "BIG" }, true, keepWords, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    // The whole sentence handled through the keyword variant of the helper.
    assertCapitalizesToKeyword(sentence, "Hello there my name is ryan", true, keepWords, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
    // Now each token individually.
    assertCapitalizesTo(sentence, new String[] { "Hello", "There", "My", "Name", "Is", "Ryan" }, false, keepWords, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
    // Now only the long words.
    assertCapitalizesTo(sentence, new String[] { "Hello", "There", "my", "Name", "is", "Ryan" }, false, keepWords, true, null, 3, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    // Without a prefix list the inner capital of "McKinley" is lost.
    assertCapitalizesTo("McKinley", new String[] { "Mckinley" }, true, keepWords, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
    // With "McK" supplied as a prefix the input is expected to come back unchanged.
    List<char[]> allowedPrefixes = new ArrayList<>();
    allowedPrefixes.add("McK".toCharArray());
    assertCapitalizesTo("McKinley", new String[] { "McKinley" }, true, keepWords, true, allowedPrefixes, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);

    // Tokens that start with digits.
    assertCapitalizesTo("1st 2nd third", new String[] { "1st", "2nd", "Third" }, false, keepWords, false, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
    assertCapitalizesToKeyword("the The the", "The The the", false, keepWords, true, null, 0, DEFAULT_MAX_WORD_COUNT, DEFAULT_MAX_TOKEN_LENGTH);
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) ArrayList(java.util.ArrayList)

Example 10 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestTurkishAnalyzer method testExclude.

/**
 * Tests use of the exclusion set: a TurkishAnalyzer built with the default stop set and an
 * exclusion set containing "ağacı" is expected to emit "ağacı" unchanged, and likewise to
 * emit "ağaç" for the input "ağaç".
 *
 * <p>The analyzer is managed by try-with-resources so it is closed even when one of the
 * {@code checkOneTerm} assertions fails.
 *
 * @throws IOException declared by the original test; propagated from the analysis helpers
 */
public void testExclude() throws IOException {
    CharArraySet exclusionSet = new CharArraySet(asSet("ağacı"), false);
    try (Analyzer a = new TurkishAnalyzer(TurkishAnalyzer.getDefaultStopSet(), exclusionSet)) {
        checkOneTerm(a, "ağacı", "ağacı");
        checkOneTerm(a, "ağaç", "ağaç");
    }
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) Analyzer(org.apache.lucene.analysis.Analyzer)

Aggregations

CharArraySet (org.apache.lucene.analysis.CharArraySet) 137, Analyzer (org.apache.lucene.analysis.Analyzer) 54, MockTokenizer (org.apache.lucene.analysis.MockTokenizer) 46, Tokenizer (org.apache.lucene.analysis.Tokenizer) 43, TokenStream (org.apache.lucene.analysis.TokenStream) 37, KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer) 34, SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter) 26, StringReader (java.io.StringReader) 23, StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer) 9, StopFilter (org.apache.lucene.analysis.StopFilter) 7, TokenFilter (org.apache.lucene.analysis.TokenFilter) 6, WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) 5, WordDelimiterGraphFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter) 5, ClasspathResourceLoader (org.apache.lucene.analysis.util.ClasspathResourceLoader) 5, MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer) 4, HyphenationTree (org.apache.lucene.analysis.compound.hyphenation.HyphenationTree) 4, ResourceLoader (org.apache.lucene.analysis.util.ResourceLoader) 4, InputSource (org.xml.sax.InputSource) 4, Reader (java.io.Reader) 3, ArrayList (java.util.ArrayList) 3