Use of org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory in the lucene-solr project (apache):
the testCustomTypes method of the TestWordDelimiterFilterFactory class.
@Test
public void testCustomTypes() throws Exception {
  String testText = "I borrowed $5,400.00 at 25% interest-rate";
  ResourceLoader loader = new SolrResourceLoader(TEST_PATH().resolve("collection1"));
  Map<String, String> args = new HashMap<>();
  args.put("luceneMatchVersion", Version.LATEST.toString());
  args.put("generateWordParts", "1");
  args.put("generateNumberParts", "1");
  args.put("catenateWords", "1");
  args.put("catenateNumbers", "1");
  args.put("catenateAll", "0");
  args.put("splitOnCaseChange", "1");
  /* default behavior */
  WordDelimiterFilterFactory factoryDefault = new WordDelimiterFilterFactory(args);
  factoryDefault.inform(loader);
  TokenStream ts = factoryDefault.create(whitespaceMockTokenizer(testText));
  BaseTokenStreamTestCase.assertTokenStreamContents(ts,
      new String[] { "I", "borrowed", "5", "540000", "400", "00", "at", "25", "interest", "interestrate", "rate" });
  ts = factoryDefault.create(whitespaceMockTokenizer("foo\u200Dbar"));
  BaseTokenStreamTestCase.assertTokenStreamContents(ts,
      new String[] { "foo", "foobar", "bar" });
  /* custom behavior */
  args = new HashMap<>();
  // use a custom type mapping
  args.put("luceneMatchVersion", Version.LATEST.toString());
  args.put("generateWordParts", "1");
  args.put("generateNumberParts", "1");
  args.put("catenateWords", "1");
  args.put("catenateNumbers", "1");
  args.put("catenateAll", "0");
  args.put("splitOnCaseChange", "1");
  args.put("types", "wdftypes.txt");
  WordDelimiterFilterFactory factoryCustom = new WordDelimiterFilterFactory(args);
  factoryCustom.inform(loader);
  ts = factoryCustom.create(whitespaceMockTokenizer(testText));
  BaseTokenStreamTestCase.assertTokenStreamContents(ts,
      new String[] { "I", "borrowed", "$5,400.00", "at", "25%", "interest", "interestrate", "rate" });
  /* test custom behavior with a char > 0x7F, because we had to make a larger byte[] */
  ts = factoryCustom.create(whitespaceMockTokenizer("foo\u200Dbar"));
  BaseTokenStreamTestCase.assertTokenStreamContents(ts, new String[] { "foo\u200Dbar" });
}
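The custom behavior hinges on wdftypes.txt, which the resource loader resolves from the test config directory but which is not shown here. The types file format is one "character => TYPE" mapping per line (valid types: LOWER, UPPER, ALPHA, DIGIT, ALPHANUM, SUBWORD_DELIM). A minimal sketch of such a file, with mappings chosen to match the assertions above (the exact file shipped with the test may differ):

# Custom character-type mappings for WordDelimiterFilterFactory.
# Treat currency and numeric punctuation as DIGIT so that
# "$5,400.00" and "25%" pass through unsplit.
$ => DIGIT
% => DIGIT
. => DIGIT
\u002C => DIGIT
# Map the zero-width joiner (a char > 0x7F) to ALPHANUM so
# "foo\u200Dbar" stays a single token.
\u200D => ALPHANUM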
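To inspect the filter's output without BaseTokenStreamTestCase, the standard Lucene TokenStream consumption loop applies. A minimal sketch, reusing factoryCustom and the test's whitespaceMockTokenizer helper (CharTermAttribute, from org.apache.lucene.analysis.tokenattributes, exposes the term text):

try (TokenStream stream = factoryCustom.create(whitespaceMockTokenizer(testText))) {
  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  stream.reset();                   // required before the first incrementToken()
  while (stream.incrementToken()) { // advance to the next token
    System.out.println(term.toString());
  }
  stream.end();                     // finalize end-of-stream state
}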