Search in sources:

Example 1 with HyphenationTree

use of org.apache.lucene.analysis.compound.hyphenation.HyphenationTree in project lucene-solr by apache.

the class TestCompoundWordTokenFilter method testHyphenationCompoundWordsDA.

/**
   * Runs a sentence ending in the compound "læsehest" through a
   * {@link HyphenationCompoundWordTokenFilter} configured with the da_UTF8.xml grammar
   * and a two-word dictionary, and checks that the compound is followed by its
   * dictionary parts "læse" and "hest".
   */
public void testHyphenationCompoundWordsDA() throws Exception {
    final CharArraySet subwordDictionary = makeDictionary("læse", "hest");

    final String grammarLocation = getClass().getResource("da_UTF8.xml").toExternalForm();
    final InputSource grammarSource = new InputSource(grammarLocation);
    final HyphenationTree patterns = HyphenationCompoundWordTokenFilter.getHyphenationTree(grammarSource);

    // Default word/subword size limits; the trailing flag is false here
    // (presumably the "longest match only" switch — confirm against the constructor).
    final HyphenationCompoundWordTokenFilter decompounder = new HyphenationCompoundWordTokenFilter(
            whitespaceMockTokenizer("min veninde som er lidt af en læsehest"),
            patterns,
            subwordDictionary,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
            false);

    final String[] expectedTerms = { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" };
    // One entry per expected term; the two subwords carry 0 while every other token carries 1.
    final int[] expectedIncrements = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 };
    assertTokenStreamContents(decompounder, expectedTerms, expectedIncrements);
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) InputSource(org.xml.sax.InputSource) HyphenationTree(org.apache.lucene.analysis.compound.hyphenation.HyphenationTree)

Example 2 with HyphenationTree

use of org.apache.lucene.analysis.compound.hyphenation.HyphenationTree in project lucene-solr by apache.

the class TestCompoundWordTokenFilter method testRandomStrings.

/**
   * Blasts random strings through two analyzers: one built around a
   * {@link DictionaryCompoundWordTokenFilter} and one built around a
   * {@link HyphenationCompoundWordTokenFilter} using the da_UTF8.xml grammar.
   *
   * Each analyzer is closed in a {@code finally} block so that it is released even
   * when the random-data check fails.
   */
public void testRandomStrings() throws Exception {
    final CharArraySet dict = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new DictionaryCompoundWordTokenFilter(tokenizer, dict));
        }
    };
    try {
        checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
    } finally {
        // Close on the failure path as well, not only after a successful run.
        a.close();
    }

    InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
    final HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter.getHyphenationTree(is);
    Analyzer b = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            TokenFilter filter = new HyphenationCompoundWordTokenFilter(tokenizer, hyphenator);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
    try {
        checkRandomData(random(), b, 1000 * RANDOM_MULTIPLIER);
    } finally {
        // Close on the failure path as well, not only after a successful run.
        b.close();
    }
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) CharArraySet(org.apache.lucene.analysis.CharArraySet) InputSource(org.xml.sax.InputSource) HyphenationTree(org.apache.lucene.analysis.compound.hyphenation.HyphenationTree) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) TokenFilter(org.apache.lucene.analysis.TokenFilter)

Example 3 with HyphenationTree

use of org.apache.lucene.analysis.compound.hyphenation.HyphenationTree in project lucene-solr by apache.

the class HyphenationCompoundWordTokenFilter method getHyphenationTree.

/**
   * Builds a new {@link HyphenationTree} and loads the patterns from the given source into it.
   *
   * @param hyphenationSource the InputSource pointing to the XML grammar
   * @return the tree after its patterns have been loaded
   * @throws java.io.IOException If there is a low-level I/O error.
   */
public static HyphenationTree getHyphenationTree(InputSource hyphenationSource) throws IOException {
    final HyphenationTree patterns = new HyphenationTree();
    patterns.loadPatterns(hyphenationSource);
    return patterns;
}
Also used : HyphenationTree(org.apache.lucene.analysis.compound.hyphenation.HyphenationTree)

Example 4 with HyphenationTree

use of org.apache.lucene.analysis.compound.hyphenation.HyphenationTree in project lucene-solr by apache.

the class TestCompoundWordTokenFilter method testHyphenationOnly.

/**
   * Hyphenation without a dictionary can produce a lot of nonsense tokens.
   * The min/max subword size arguments control how many of them are emitted;
   * this test runs the same word through three different size windows.
   */
public void testHyphenationOnly() throws Exception {
    final String compound = "basketballkurv";
    final int minWordSize = CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE;

    final String grammarLocation = getClass().getResource("da_UTF8.xml").toExternalForm();
    final HyphenationTree patterns = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(grammarLocation));

    // min=2, max=4
    final HyphenationCompoundWordTokenFilter narrowWindow =
            new HyphenationCompoundWordTokenFilter(whitespaceMockTokenizer(compound), patterns, minWordSize, 2, 4);
    assertTokenStreamContents(narrowWindow,
            new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" });

    // min=4, max=6
    final HyphenationCompoundWordTokenFilter mediumWindow =
            new HyphenationCompoundWordTokenFilter(whitespaceMockTokenizer(compound), patterns, minWordSize, 4, 6);
    assertTokenStreamContents(mediumWindow,
            new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" });

    // min=4, max=10
    final HyphenationCompoundWordTokenFilter wideWindow =
            new HyphenationCompoundWordTokenFilter(whitespaceMockTokenizer(compound), patterns, minWordSize, 4, 10);
    assertTokenStreamContents(wideWindow,
            new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" });
}
Also used : InputSource(org.xml.sax.InputSource) HyphenationTree(org.apache.lucene.analysis.compound.hyphenation.HyphenationTree)

Example 5 with HyphenationTree

use of org.apache.lucene.analysis.compound.hyphenation.HyphenationTree in project lucene-solr by apache.

the class TestCompoundWordTokenFilter method testHyphenationCompoundWordsDELongestMatch.

/**
   * Decompounds "basketballkurv" with a dictionary and the trailing flag set to true,
   * and checks that "basket" is not emitted alongside "basketball".
   *
   * NOTE(review): the method name says "DE" but the grammar loaded is da_UTF8.xml — confirm
   * whether the name is simply historical.
   */
public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
    final CharArraySet subwordDictionary = makeDictionary("basketball", "basket", "ball", "kurv");

    final String grammarLocation = getClass().getResource("da_UTF8.xml").toExternalForm();
    final HyphenationTree patterns = HyphenationCompoundWordTokenFilter.getHyphenationTree(new InputSource(grammarLocation));

    // the word basket will not be added due to the longest match option
    final HyphenationCompoundWordTokenFilter decompounder = new HyphenationCompoundWordTokenFilter(
            whitespaceMockTokenizer("basketballkurv"),
            patterns,
            subwordDictionary,
            CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
            CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
            40,
            true);

    final String[] expectedTerms = { "basketballkurv", "basketball", "ball", "kurv" };
    // One entry per expected term; only the original compound carries 1.
    final int[] expectedIncrements = { 1, 0, 0, 0 };
    assertTokenStreamContents(decompounder, expectedTerms, expectedIncrements);
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) InputSource(org.xml.sax.InputSource) HyphenationTree(org.apache.lucene.analysis.compound.hyphenation.HyphenationTree)

Aggregations

HyphenationTree (org.apache.lucene.analysis.compound.hyphenation.HyphenationTree)6 InputSource (org.xml.sax.InputSource)5 CharArraySet (org.apache.lucene.analysis.CharArraySet)4 Analyzer (org.apache.lucene.analysis.Analyzer)2 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)2 TokenFilter (org.apache.lucene.analysis.TokenFilter)2 Tokenizer (org.apache.lucene.analysis.Tokenizer)2 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)2