use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.
the class TestWordDelimiterFilter method testOffsetChange4.
/**
 * Feeds the single canned token "(foo,bar)" (offsets 7-16) through the filter and
 * checks the emitted terms together with their start and end offsets.
 */
@Test
public void testOffsetChange4() throws Exception {
  final int flags = GENERATE_WORD_PARTS
      | GENERATE_NUMBER_PARTS
      | CATENATE_ALL
      | SPLIT_ON_CASE_CHANGE
      | SPLIT_ON_NUMERICS
      | STEM_ENGLISH_POSSESSIVE;

  // Input token is wrapped in parentheses and contains an inner comma.
  final Token input = new Token("(foo,bar)", 7, 16);
  final WordDelimiterFilter wdf =
      new WordDelimiterFilter(new CannedTokenStream(input), DEFAULT_WORD_DELIM_TABLE, flags, null);

  final String[] expectedTerms = { "foo", "foobar", "bar" };
  final int[] expectedStartOffsets = { 8, 8, 12 };
  final int[] expectedEndOffsets = { 11, 15, 15 };

  assertTokenStreamContents(wdf, expectedTerms, expectedStartOffsets, expectedEndOffsets);
}
use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.
the class TestWordDelimiterFilter method testLotsOfConcatenating.
/**
 * Concatenates numbers + words + all: runs "abc-def-123-456" through a
 * whitespace tokenizer followed by the word delimiter filter and checks terms,
 * offsets and position increments.
 */
public void testLotsOfConcatenating() throws Exception {
  final int flags = GENERATE_WORD_PARTS
      | GENERATE_NUMBER_PARTS
      | CATENATE_WORDS
      | CATENATE_NUMBERS
      | CATENATE_ALL
      | SPLIT_ON_CASE_CHANGE
      | SPLIT_ON_NUMERICS
      | STEM_ENGLISH_POSSESSIVE;

  /* analyzer that uses whitespace + wdf */
  Analyzer analyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String field) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      WordDelimiterFilter sink = new WordDelimiterFilter(source, flags, null);
      return new TokenStreamComponents(source, sink);
    }
  };

  final String[] expectedTerms = { "abc", "abcdef", "abcdef123456", "def", "123", "123456", "456" };
  final int[] expectedStartOffsets = { 0, 0, 0, 4, 8, 8, 12 };
  final int[] expectedEndOffsets = { 3, 7, 15, 7, 11, 15, 15 };
  final int[] expectedPosIncrements = { 1, 0, 0, 1, 1, 0, 1 };

  assertAnalyzesTo(
      analyzer,
      "abc-def-123-456",
      expectedTerms,
      expectedStartOffsets,
      expectedEndOffsets,
      null,
      expectedPosIncrements,
      null,
      false);
  analyzer.close();
}
use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.
the class TestWordDelimiterFilter method testOnlyNumbers.
/*
public void testToDot() throws Exception {
int flags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | CATENATE_ALL | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE | PRESERVE_ORIGINAL | CATENATE_WORDS | CATENATE_NUMBERS | STEM_ENGLISH_POSSESSIVE;
String text = "PowerSystem2000-5-Shot's";
WordDelimiterFilter wdf = new WordDelimiterFilter(new CannedTokenStream(new Token(text, 0, text.length())), DEFAULT_WORD_DELIM_TABLE, flags, null);
//StringWriter sw = new StringWriter();
// TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, new PrintWriter(sw));
PrintWriter pw = new PrintWriter("/x/tmp/before.dot");
TokenStreamToDot toDot = new TokenStreamToDot(text, wdf, pw);
toDot.toDot();
pw.close();
System.out.println("TEST DONE");
//System.out.println("DOT:\n" + sw.toString());
}
*/
/**
 * Analyzes the purely numeric input "7-586" with only word-part generation and
 * splitting flags enabled, and expects the analyzer to emit no tokens at all.
 */
public void testOnlyNumbers() throws Exception {
  final int flags = GENERATE_WORD_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new WordDelimiterFilter(tokenizer, flags, null));
    }
  };
  assertAnalyzesTo(a, "7-586", new String[] {}, new int[] {}, new int[] {}, null, new int[] {}, null, false);
  // Release the analyzer once the assertion is done, as the other analyzer-based tests do.
  a.close();
}
use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project lucene-solr by apache.
the class TestWordDelimiterFilter method testOffsetChange2.
/**
 * Feeds the single canned token "(übelkeit" (offsets 7-17) through the filter and
 * checks that one term "übelkeit" is emitted with offsets 8-17.
 */
@Test
public void testOffsetChange2() throws Exception {
  final int flags = GENERATE_WORD_PARTS
      | GENERATE_NUMBER_PARTS
      | CATENATE_ALL
      | SPLIT_ON_CASE_CHANGE
      | SPLIT_ON_NUMERICS
      | STEM_ENGLISH_POSSESSIVE;

  // Input token carries a leading parenthesis only.
  final Token input = new Token("(übelkeit", 7, 17);
  final WordDelimiterFilter wdf =
      new WordDelimiterFilter(new CannedTokenStream(input), DEFAULT_WORD_DELIM_TABLE, flags, null);

  final String[] expectedTerms = { "übelkeit" };
  final int[] expectedStartOffsets = { 8 };
  final int[] expectedEndOffsets = { 17 };

  assertTokenStreamContents(wdf, expectedTerms, expectedStartOffsets, expectedEndOffsets);
}
use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilter in project cogcomp-nlp by CogComp.
the class ASCIIEnglishAnalyzer method createComponents.
/**
 * Builds the analysis chain for a field: standard tokenization, ASCII folding,
 * English possessive removal, word-delimiter splitting, lower-casing, English
 * stop-word removal and Porter stemming, applied in that order.
 *
 * @param fieldName name of the field being analyzed (not used by this chain)
 * @return the tokenizer together with the fully wrapped filter chain
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  final Tokenizer source = new StandardTokenizer();
  TokenStream result = new StandardFilter(source);
  result = new ASCIIFoldingFilter(result);
  result = new EnglishPossessiveFilter(result);
  // The second constructor argument is a bitmask of configuration flags. The
  // character-type constant ALPHA (0x03) was previously passed here; it has the
  // same numeric value as the two flags below, so spelling them out keeps the
  // behaviour identical while making the intent explicit.
  final int wordDelimiterFlags =
      WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS;
  result = new WordDelimiterFilter(result, wordDelimiterFlags, null);
  result = new LowerCaseFilter(result);
  result = new StopFilter(result, EnglishAnalyzer.getDefaultStopSet());
  result = new PorterStemFilter(result);
  return new TokenStreamComponents(source, result);
}
Aggregations