Example 21 with CharArraySet

Use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache: the testEmptyTerm method of class TestWordDelimiterFilter. The test cycles through all WordDelimiterFilter flag combinations, randomly supplying a protected-words set, and checks that analysis of the empty term stays consistent.

public void testEmptyTerm() throws IOException {
    Random random = random();
    for (int i = 0; i < 512; i++) {
        final int flags = i;
        final CharArraySet protectedWords;
        if (random.nextBoolean()) {
            protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("a", "b", "cd")), false);
        } else {
            protectedWords = null;
        }
        Analyzer a = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new KeywordTokenizer();
                return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, protectedWords));
            }
        };
        // depending upon options, this thing may or may not preserve the empty term
        checkAnalysisConsistency(random, a, random.nextBoolean(), "");
        a.close();
    }
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) WordDelimiterFilter(org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer)
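
For comparison, here is a minimal standalone sketch (not taken from lucene-solr) of what the protected-words set does: tokens present in the CharArraySet pass through WordDelimiterFilter unsplit. The input text and flag combination are illustrative assumptions.

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ProtectedWordsSketch {
    public static void main(String[] args) throws IOException {
        // Illustrative protected word and flags (assumptions, not from the test above).
        CharArraySet protectedWords = new CharArraySet(Arrays.asList("wi-fi"), true);
        int flags = WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE;
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("wi-fi PowerShot"));
        TokenStream stream = new WordDelimiterFilter(tokenizer, flags, protectedWords);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // Expected with these flags: "wi-fi" (protected), "Power", "Shot"
            System.out.println(term.toString());
        }
        stream.end();
        stream.close();
    }
}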

Example 22 with CharArraySet

Use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache: the inform method of class JapanesePartOfSpeechStopFilterFactory. The factory loads its stop tags as a CharArraySet and copies the char[] entries into a Set<String>.

@Override
public void inform(ResourceLoader loader) throws IOException {
    stopTags = null;
    CharArraySet cas = getWordSet(loader, stopTagFiles, false);
    if (cas != null) {
        stopTags = new HashSet<>();
        for (Object element : cas) {
            char[] chars = (char[]) element;
            stopTags.add(new String(chars));
        }
    }
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet)
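
The cast to char[] is the interesting part: iterating a CharArraySet yields its entries as char[], not String. A small standalone sketch of the same conversion, with hypothetical tag values standing in for the contents of stopTagFiles:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.lucene.analysis.CharArraySet;

public class CharArraySetIterationSketch {
    public static void main(String[] args) {
        // Illustrative entries; the real factory loads them from stopTagFiles.
        CharArraySet cas = new CharArraySet(Arrays.asList("dummy-tag-1", "dummy-tag-2"), false);
        Set<String> stopTags = new HashSet<>();
        for (Object element : cas) {
            // CharArraySet iteration returns char[] elements, hence the explicit copy.
            stopTags.add(new String((char[]) element));
        }
        System.out.println(stopTags);
    }
}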

Example 23 with CharArraySet

Use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache: the testName method of class TestJapaneseNumberFilter. A CharArraySet feeds a SetKeywordMarkerFilter so that a name which would otherwise normalize to a number is left intact.

@Test
public void testName() throws IOException {
    // Test name that normalises to number
    assertAnalyzesTo(analyzer, "田中京一", // 京一 is normalized to a number
    new String[] { "田中", "10000000000000001" }, new int[] { 0, 2 }, new int[] { 2, 4 }, new int[] { 1, 1 });
    // An analyzer that marks 京一 as a keyword
    Analyzer keywordMarkingAnalyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            CharArraySet set = new CharArraySet(1, false);
            set.add("京一");
            Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
            return new TokenStreamComponents(tokenizer, new JapaneseNumberFilter(new SetKeywordMarkerFilter(tokenizer, set)));
        }
    };
    assertAnalyzesTo(keywordMarkingAnalyzer, "田中京一", // 京一 is not normalized
    new String[] { "田中", "京一" }, new int[] { 0, 2 }, new int[] { 2, 4 }, new int[] { 1, 1 });
    keywordMarkingAnalyzer.close();
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) SetKeywordMarkerFilter(org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) Test(org.junit.Test)
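
A simpler sketch of the keyword-marking idea, without the kuromoji dependency: SetKeywordMarkerFilter flags every token found in the CharArraySet via KeywordAttribute, and keyword-aware filters downstream (JapaneseNumberFilter above, a stemmer here) leave those tokens untouched. PorterStemFilter stands in for JapaneseNumberFilter purely for illustration; the input text is an assumption.

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class KeywordMarkerSketch {
    public static void main(String[] args) throws IOException {
        // Tokens in this set get their KeywordAttribute flagged.
        CharArraySet keywords = new CharArraySet(Arrays.asList("running"), true);
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("running jumping"));
        TokenStream stream = new PorterStemFilter(new SetKeywordMarkerFilter(tokenizer, keywords));
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // Expected: "running" (protected by the keyword mark), "jump" (stemmed)
            System.out.println(term.toString());
        }
        stream.end();
        stream.close();
    }
}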

Example 24 with CharArraySet

Use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache: the testMethods method of class TestCharArrayMap. The test exercises CharArrayMap together with the CharArraySet view returned by keySet().

public void testMethods() {
    CharArrayMap<Integer> cm = new CharArrayMap<>(2, false);
    HashMap<String, Integer> hm = new HashMap<>();
    hm.put("foo", 1);
    hm.put("bar", 2);
    cm.putAll(hm);
    assertEquals(hm.size(), cm.size());
    hm.put("baz", 3);
    cm.putAll(hm);
    assertEquals(hm.size(), cm.size());
    // keySet() returns a live CharArraySet view backed by the map
    CharArraySet cs = cm.keySet();
    int n = 0;
    for (Object o : cs) {
        assertTrue(cm.containsKey(o));
        char[] co = (char[]) o;
        assertTrue(cm.containsKey(co, 0, co.length));
        n++;
    }
    assertEquals(hm.size(), n);
    assertEquals(hm.size(), cs.size());
    assertEquals(cm.size(), cs.size());
    cs.clear();
    assertEquals(0, cs.size());
    assertEquals(0, cm.size());
    // keySet() should not allow adding new keys
    expectThrows(UnsupportedOperationException.class, () -> {
        cs.add("test");
    });
    cm.putAll(hm);
    assertEquals(hm.size(), cs.size());
    assertEquals(cm.size(), cs.size());
    Iterator<Map.Entry<Object, Integer>> iter1 = cm.entrySet().iterator();
    n = 0;
    while (iter1.hasNext()) {
        Map.Entry<Object, Integer> entry = iter1.next();
        Object key = entry.getKey();
        Integer val = entry.getValue();
        assertEquals(cm.get(key), val);
        entry.setValue(val * 100);
        assertEquals(val * 100, (int) cm.get(key));
        n++;
    }
    assertEquals(hm.size(), n);
    cm.clear();
    cm.putAll(hm);
    assertEquals(cm.size(), n);
    // EntryIterator is CharArrayMap's specialized (non-generic) iterator; nextKey() exposes keys as char[]
    CharArrayMap<Integer>.EntryIterator iter2 = cm.entrySet().iterator();
    n = 0;
    while (iter2.hasNext()) {
        char[] keyc = iter2.nextKey();
        Integer val = iter2.currentValue();
        assertEquals(hm.get(new String(keyc)), val);
        iter2.setValue(val * 100);
        assertEquals(val * 100, (int) cm.get(keyc));
        n++;
    }
    assertEquals(hm.size(), n);
    cm.entrySet().clear();
    assertEquals(0, cm.size());
    assertEquals(0, cm.entrySet().size());
    assertTrue(cm.isEmpty());
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) CharArrayMap(org.apache.lucene.analysis.CharArrayMap)
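
Two properties the test relies on, shown in isolation: keySet() is a live view of the map, and lookups can be made against a slice of an existing char[] buffer without allocating a String. Class name and values below are illustrative.

import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.analysis.CharArraySet;

public class CharArrayMapSketch {
    public static void main(String[] args) {
        // true => case-insensitive keys
        CharArrayMap<Integer> map = new CharArrayMap<>(4, true);
        map.put("foo", 1);
        map.put("bar", 2);

        // Probe a key as a slice of an existing buffer: no String allocation.
        char[] buffer = "xxFOOxx".toCharArray();
        System.out.println(map.get(buffer, 2, 3));   // 1 (case-insensitive match)

        // keySet() is backed by the map: clearing the view clears the map.
        CharArraySet keys = map.keySet();
        keys.clear();
        System.out.println(map.isEmpty());           // true
    }
}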

Example 25 with CharArraySet

Use of org.apache.lucene.analysis.CharArraySet in project lucene-solr by apache: the testStopFilt method of class TestStopFilter. StopFilter.makeStopSet(String...) builds a case-sensitive CharArraySet, which is why "Time" is removed below while "The" survives.

public void testStopFilt() throws IOException {
    StringReader reader = new StringReader("Now is The Time");
    String[] stopWords = new String[] { "is", "the", "Time" };
    CharArraySet stopSet = StopFilter.makeStopSet(stopWords);
    final MockTokenizer in = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    in.setReader(reader);
    TokenStream stream = new StopFilter(in, stopSet);
    assertTokenStreamContents(stream, new String[] { "Now", "The" });
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) StringReader(java.io.StringReader)
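
A hedged sketch of the case-insensitive variant, built directly with the CharArraySet constructor's ignoreCase flag (same input text as the test; the tokenizer and expected output are assumptions that follow from that flag):

import java.io.IOException;
import java.io.StringReader;
import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.StopFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CaseInsensitiveStopSketch {
    public static void main(String[] args) throws IOException {
        // ignoreCase=true: the set matches regardless of case, so "The" is removed as well.
        CharArraySet stopSet = new CharArraySet(Arrays.asList("is", "the", "time"), true);
        WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("Now is The Time"));
        TokenStream stream = new StopFilter(tokenizer, stopSet);
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            // Expected: only "Now"
            System.out.println(term.toString());
        }
        stream.end();
        stream.close();
    }
}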

Aggregations

CharArraySet (org.apache.lucene.analysis.CharArraySet): 137 uses
Analyzer (org.apache.lucene.analysis.Analyzer): 54 uses
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 46 uses
Tokenizer (org.apache.lucene.analysis.Tokenizer): 43 uses
TokenStream (org.apache.lucene.analysis.TokenStream): 37 uses
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 34 uses
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 26 uses
StringReader (java.io.StringReader): 23 uses
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 9 uses
StopFilter (org.apache.lucene.analysis.StopFilter): 7 uses
TokenFilter (org.apache.lucene.analysis.TokenFilter): 6 uses
WordDelimiterFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter): 5 uses
WordDelimiterGraphFilter (org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter): 5 uses
ClasspathResourceLoader (org.apache.lucene.analysis.util.ClasspathResourceLoader): 5 uses
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 4 uses
HyphenationTree (org.apache.lucene.analysis.compound.hyphenation.HyphenationTree): 4 uses
ResourceLoader (org.apache.lucene.analysis.util.ResourceLoader): 4 uses
InputSource (org.xml.sax.InputSource): 4 uses
Reader (java.io.Reader): 3 uses
ArrayList (java.util.ArrayList): 3 uses