Search in sources :

Example 66 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestCompoundWordTokenFilter method testDumbCompoundWordsSE.

public void testDumbCompoundWordsSE() throws Exception {
    CharArraySet dict = makeDictionary("Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar", "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll", "Sko", "Vind", "Rute", "Torkare", "Blad");
    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(whitespaceMockTokenizer("Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"), dict);
    assertTokenStreamContents(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor", "Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr", "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr", "Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas", "fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol", "fodral", "makare", "gesäll", "Skomakare", "Sko", "makare", "Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad", "Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 0, 8, 8, 8, 17, 17, 17, 24, 24, 24, 33, 33, 33, 44, 44, 44, 54, 54, 54, 54, 69, 69, 69, 69, 84, 84, 84, 84, 84, 84, 111, 111, 111, 121, 121, 121, 121, 137, 137, 137, 137, 156 }, new int[] { 7, 7, 7, 16, 16, 16, 23, 23, 23, 32, 32, 32, 43, 43, 43, 53, 53, 53, 68, 68, 68, 68, 83, 83, 83, 83, 110, 110, 110, 110, 110, 110, 120, 120, 120, 136, 136, 136, 136, 155, 155, 155, 155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1 });
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet)

Example 67 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestCompoundWordTokenFilter method testWordComponentWithLessThanMinimumLength.

public void testWordComponentWithLessThanMinimumLength() throws Exception {
    CharArraySet dict = makeDictionary("abc", "d", "efg");
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(new StringReader("abcdefg"));
    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(tokenizer, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
    // since "d" is shorter than the minimum subword size, it should not be added to the token stream
    assertTokenStreamContents(tf, new String[] { "abcdefg", "abc", "efg" }, new int[] { 0, 0, 0 }, new int[] { 7, 7, 7 }, new int[] { 1, 0, 0 });
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 68 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestCompoundWordTokenFilter method testTokenEndingWithWordComponentOfMinimumLength.

public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
    CharArraySet dict = makeDictionary("ab", "cd", "ef");
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(new StringReader("abcdef"));
    DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(tokenizer, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
    assertTokenStreamContents(tf, new String[] { "abcdef", "ab", "cd", "ef" }, new int[] { 0, 0, 0, 0 }, new int[] { 6, 6, 6, 6 }, new int[] { 1, 0, 0, 0 });
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Example 69 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestCompoundWordTokenFilter method testHyphenationCompoundWordsDA.

public void testHyphenationCompoundWordsDA() throws Exception {
    CharArraySet dict = makeDictionary("læse", "hest");
    InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
    HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
    HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(whitespaceMockTokenizer("min veninde som er lidt af en læsehest"), hyphenator, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
    assertTokenStreamContents(tf, new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" }, new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 });
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) InputSource(org.xml.sax.InputSource) HyphenationTree(org.apache.lucene.analysis.compound.hyphenation.HyphenationTree)

Example 70 with CharArraySet

use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache.

the class TestCompoundWordTokenFilter method testRetainMockAttribute.

public void testRetainMockAttribute() throws Exception {
    CharArraySet dict = makeDictionary("abc", "d", "efg");
    Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(new StringReader("abcdefg"));
    TokenStream stream = new MockRetainAttributeFilter(tokenizer);
    stream = new DictionaryCompoundWordTokenFilter(stream, dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
    MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        assertTrue("Custom attribute value was lost", retAtt.getRetain());
    }
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)

Aggregations

CharArraySet (org.apache.lucene.analysis.CharArraySet)137 Analyzer (org.apache.lucene.analysis.Analyzer)54 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)46 Tokenizer (org.apache.lucene.analysis.Tokenizer)43 TokenStream (org.apache.lucene.analysis.TokenStream)37 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)34 SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter)26 StringReader (java.io.StringReader)23 StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer)9 StopFilter (org.apache.lucene.analysis.StopFilter)7 TokenFilter (org.apache.lucene.analysis.TokenFilter)6 WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter)5 WordDelimiterGraphFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter)5 ClasspathResourceLoader (org.apache.lucene.analysis.util.ClasspathResourceLoader)5 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)4 HyphenationTree (org.apache.lucene.analysis.compound.hyphenation.HyphenationTree)4 ResourceLoader (org.apache.lucene.analysis.util.ResourceLoader)4 InputSource (org.xml.sax.InputSource)4 Reader (java.io.Reader)3 ArrayList (java.util.ArrayList)3