Use of org.apache.lucene.analysis.standard.StandardTokenizer in project lucene-solr by apache.
The class TestCJKBigramFilter, method testHanOnly.
public void testHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer t = new StandardTokenizer();
      // bigram Han runs only; Hiragana stays as single tokens
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の", "学生", "が", "試験", "に", "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 5, 6, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 5, 6, 8, 9, 10, 11, 12 },
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<DOUBLE>", "<HIRAGANA>",
          "<DOUBLE>", "<HIRAGANA>", "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>" },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 });
  a.close();
}
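For inspection outside assertAnalyzesTo, the same stream can be walked by hand. A minimal sketch (TokenDumper is a hypothetical helper, not part of the test class) that prints each term with its token type:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;

class TokenDumper {
  static void dump(Analyzer analyzer, String text) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("field", text)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      TypeAttribute type = ts.addAttribute(TypeAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // prints, e.g., "学生 <DOUBLE>" for a Han bigram
        System.out.println(term + " " + type.type());
      }
      ts.end();
    }
  }
}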
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project lucene-solr by apache.
The class TestCJKBigramFilter, method testAllScripts.
public void testAllScripts() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer t = new StandardTokenizer();
      return new TokenStreamComponents(t, new CJKBigramFilter(t, 0xff, false));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多く", "くの", "の学", "学生", "生が", "が試", "試験", "験に", "に落", "落ち", "ちた" });
  a.close();
}
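The 0xff flags value sets every script bit at once. Assuming the standard flag constants on CJKBigramFilter, the construction inside createComponents above could be spelled more explicitly as:

// equivalent to 0xff for the bits CJKBigramFilter defines; t is the tokenizer above
int allScripts = CJKBigramFilter.HAN | CJKBigramFilter.HIRAGANA
    | CJKBigramFilter.KATAKANA | CJKBigramFilter.HANGUL;
// false: emit only the bigrams, never the underlying unigrams
TokenStream stream = new CJKBigramFilter(t, allScripts, false);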
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project lucene-solr by apache.
The class TestCJKBigramFilter, method testUnigramsAndBigramsHanOnly.
public void testUnigramsAndBigramsHanOnly() throws Exception {
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer t = new StandardTokenizer();
      // HAN bigrams plus the original unigrams (outputUnigrams = true)
      return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN, true));
    }
  };
  assertAnalyzesTo(a, "多くの学生が試験に落ちた。",
      new String[] { "多", "く", "の", "学", "学生", "生", "が", "試", "試験", "験", "に", "落", "ち", "た" },
      new int[] { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 10, 11 },
      new int[] { 1, 2, 3, 4, 5, 5, 6, 7, 8, 8, 9, 10, 11, 12 },
      new String[] { "<SINGLE>", "<HIRAGANA>", "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>",
          "<HIRAGANA>", "<SINGLE>", "<DOUBLE>", "<SINGLE>", "<HIRAGANA>", "<SINGLE>",
          "<HIRAGANA>", "<HIRAGANA>" },
      new int[] { 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1 },
      new int[] { 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1 });
  a.close();
}
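The last two int arrays are the position increments and position lengths: each bigram is stacked on the unigram it starts at (increment 0) and spans two positions (length 2). A minimal sketch for printing those attributes (PositionDumper is a hypothetical helper, not part of the test class):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;

class PositionDumper {
  static void dump(Analyzer analyzer, String text) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("field", text)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute inc = ts.addAttribute(PositionIncrementAttribute.class);
      PositionLengthAttribute len = ts.addAttribute(PositionLengthAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // a stacked bigram such as "学生" reports inc=0, len=2
        System.out.println(term + " inc=" + inc.getPositionIncrement()
            + " len=" + len.getPositionLength());
      }
      ts.end();
    }
  }
}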
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project lucene-solr by apache.
The class TestCJKBigramFilter, method setUp.
@Override
public void setUp() throws Exception {
  super.setUp();
  // default constructor: bigram all CJK scripts, no unigram output
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer t = new StandardTokenizer();
      return new TokenStreamComponents(t, new CJKBigramFilter(t));
    }
  };
  // all scripts, with unigrams emitted alongside the bigrams
  unibiAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer t = new StandardTokenizer();
      return new TokenStreamComponents(t, new CJKBigramFilter(t, 0xff, true));
    }
  };
}
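Assuming the documented defaults, the one-argument constructor used for the first analyzer bigrams all four CJK scripts without emitting unigrams, i.e. it should be equivalent to:

// presumed expansion of new CJKBigramFilter(t): all script flags, unigrams off
new CJKBigramFilter(t,
    CJKBigramFilter.HAN | CJKBigramFilter.HIRAGANA
        | CJKBigramFilter.KATAKANA | CJKBigramFilter.HANGUL,
    false);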
Use of org.apache.lucene.analysis.standard.StandardTokenizer in project lucene-solr by apache.
The class TestTypeTokenFilter, method testStopPositons.
/**
 * Test the position increments applied by TypeTokenFilter when tokens of a
 * stopped type are removed.
 */
public void testStopPositons() throws IOException {
  // build a sample text mixing digit tokens ("10", "11", ...) with
  // spelled-out numbers ("twelve", "fifteen", "eighteen")
  StringBuilder sb = new StringBuilder();
  for (int i = 10; i < 20; i++) {
    if (i % 3 != 0) {
      sb.append(i).append(" ");
    } else {
      String w = English.intToEnglish(i).trim();
      sb.append(w).append(" ");
    }
  }
  log(sb.toString());
  String[] stopTypes = new String[] { "<NUM>" };
  Set<String> stopSet = asSet(stopTypes);
  // with increments
  StringReader reader = new StringReader(sb.toString());
  final StandardTokenizer input = new StandardTokenizer();
  input.setReader(reader);
  TypeTokenFilter typeTokenFilter = new TypeTokenFilter(input, stopSet);
  testPositons(typeTokenFilter);
}
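To see the effect outside the test harness: StandardTokenizer types numeric tokens as <NUM>, TypeTokenFilter drops every token whose type is in the stop set, and the removed positions surface as larger increments on the surviving tokens. A self-contained sketch (TypeFilterDemo is hypothetical, not part of the suite):

import java.io.IOException;
import java.io.StringReader;
import java.util.Collections;
import java.util.Set;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.TypeTokenFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

class TypeFilterDemo {
  public static void main(String[] args) throws IOException {
    Set<String> stopSet = Collections.singleton("<NUM>");
    StandardTokenizer tokenizer = new StandardTokenizer();
    tokenizer.setReader(new StringReader("ten 11 twelve 13 14 fifteen"));
    try (TokenStream ts = new TypeTokenFilter(tokenizer, stopSet)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute inc = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // "twelve" arrives with inc=2 because the removed "11" left a hole
        System.out.println(term + " inc=" + inc.getPositionIncrement());
      }
      ts.end();
    }
  }
}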