Use of org.apache.lucene.analysis.compound.hyphenation.HyphenationTree in project lucene-solr (Apache): class TestCompoundWordTokenFilter, method testHyphenationCompoundWordsDA.
public void testHyphenationCompoundWordsDA() throws Exception {
  // Dictionary holding the two subwords we expect the filter to emit.
  CharArraySet dictionary = makeDictionary("læse", "hest");
  // Load the Danish hyphenation grammar shipped next to this test class.
  InputSource grammar = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
  HyphenationTree tree = HyphenationCompoundWordTokenFilter.getHyphenationTree(grammar);
  HyphenationCompoundWordTokenFilter filter =
      new HyphenationCompoundWordTokenFilter(
          whitespaceMockTokenizer("min veninde som er lidt af en læsehest"),
          tree,
          dictionary,
          CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
          CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
          CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE,
          false);
  // The compound "læsehest" is preserved; its dictionary subwords follow
  // at the same position (position increment 0).
  assertTokenStreamContents(filter,
      new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 });
}
Use of org.apache.lucene.analysis.compound.hyphenation.HyphenationTree in project lucene-solr (Apache): class TestCompoundWordTokenFilter, method testRandomStrings.
/** Blast some random strings through both compound-word analyzers. */
public void testRandomStrings() throws Exception {
  // Small vowel/consonant dictionary for the dictionary-based variant.
  final CharArraySet dictionary = makeDictionary("a", "e", "i", "o", "u", "y", "bc", "def");
  Analyzer dictAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(source, new DictionaryCompoundWordTokenFilter(source, dictionary));
    }
  };
  checkRandomData(random(), dictAnalyzer, 1000 * RANDOM_MULTIPLIER);
  dictAnalyzer.close();

  // Same exercise for the hyphenation-based variant, driven by the Danish grammar.
  InputSource grammar = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
  final HyphenationTree tree = HyphenationCompoundWordTokenFilter.getHyphenationTree(grammar);
  Analyzer hyphAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter compounds = new HyphenationCompoundWordTokenFilter(source, tree);
      return new TokenStreamComponents(source, compounds);
    }
  };
  checkRandomData(random(), hyphAnalyzer, 1000 * RANDOM_MULTIPLIER);
  hyphAnalyzer.close();
}
Use of org.apache.lucene.analysis.compound.hyphenation.HyphenationTree in project lucene-solr (Apache): class HyphenationCompoundWordTokenFilter, method getHyphenationTree.
/**
 * Builds a {@link HyphenationTree} from an XML hyphenation grammar.
 *
 * @param hyphenationSource the InputSource pointing to the XML grammar
 * @return a hyphenation tree populated with the patterns from the source
 * @throws java.io.IOException If there is a low-level I/O error.
 */
public static HyphenationTree getHyphenationTree(InputSource hyphenationSource) throws IOException {
  HyphenationTree result = new HyphenationTree();
  result.loadPatterns(hyphenationSource);
  return result;
}
Use of org.apache.lucene.analysis.compound.hyphenation.HyphenationTree in project lucene-solr (Apache): class TestCompoundWordTokenFilter, method testHyphenationOnly.
/**
 * With hyphenation-only (no dictionary), many nonsense tokens can be produced.
 * The min/max subword size parameters constrain which candidates are emitted.
 */
public void testHyphenationOnly() throws Exception {
  InputSource grammar = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
  HyphenationTree tree = HyphenationCompoundWordTokenFilter.getHyphenationTree(grammar);

  // Subword bounds min=2, max=4.
  HyphenationCompoundWordTokenFilter filter =
      new HyphenationCompoundWordTokenFilter(
          whitespaceMockTokenizer("basketballkurv"), tree,
          CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 2, 4);
  assertTokenStreamContents(filter,
      new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" });

  // Subword bounds min=4, max=6.
  filter =
      new HyphenationCompoundWordTokenFilter(
          whitespaceMockTokenizer("basketballkurv"), tree,
          CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 6);
  assertTokenStreamContents(filter,
      new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" });

  // Subword bounds min=4, max=10.
  filter =
      new HyphenationCompoundWordTokenFilter(
          whitespaceMockTokenizer("basketballkurv"), tree,
          CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE, 4, 10);
  assertTokenStreamContents(filter,
      new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket", "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" });
}
Use of org.apache.lucene.analysis.compound.hyphenation.HyphenationTree in project lucene-solr (Apache): class TestCompoundWordTokenFilter, method testHyphenationCompoundWordsDELongestMatch.
public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
  CharArraySet dictionary = makeDictionary("basketball", "basket", "ball", "kurv");
  InputSource grammar = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
  HyphenationTree tree = HyphenationCompoundWordTokenFilter.getHyphenationTree(grammar);
  // With onlyLongestMatch=true, "basket" is suppressed because the longer
  // dictionary entry "basketball" matches at the same start offset.
  HyphenationCompoundWordTokenFilter filter =
      new HyphenationCompoundWordTokenFilter(
          whitespaceMockTokenizer("basketballkurv"),
          tree,
          dictionary,
          CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
          CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
          40,
          true);
  assertTokenStreamContents(filter,
      new String[] { "basketballkurv", "basketball", "ball", "kurv" },
      new int[] { 1, 0, 0, 0 });
}
Aggregations