Use of org.tartarus.snowball.ext.EnglishStemmer in project lucene-solr by apache.
The test method of the class TestSnowballPorterFilterFactory builds the expected stems by calling EnglishStemmer directly, then asserts that the SnowballPorter token filter factory produces the same output:
public void test() throws Exception {
  String text = "The fledgling banks were counting on a big boom in banking";
  EnglishStemmer stemmer = new EnglishStemmer();
  String[] test = text.split("\\s");
  String[] gold = new String[test.length];
  // Build the expected ("gold") stems by running each whitespace-separated word through the stemmer directly
  for (int i = 0; i < test.length; i++) {
    stemmer.setCurrent(test[i]);
    stemmer.stem();
    gold[i] = stemmer.getCurrent();
  }
  // The SnowballPorter filter over a whitespace tokenizer must produce the same stems
  Reader reader = new StringReader(text);
  TokenStream stream = whitespaceMockTokenizer(reader);
  stream = tokenFilterFactory("SnowballPorter", "language", "English").create(stream);
  assertTokenStreamContents(stream, gold);
}
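For reference, EnglishStemmer can also be driven on its own, outside the test fixture. A minimal standalone sketch using only the setCurrent/stem/getCurrent calls seen above (the class name and sample words are illustrative, not from the project):

import org.tartarus.snowball.ext.EnglishStemmer;

public class StemmerDemo {
  public static void main(String[] args) {
    EnglishStemmer stemmer = new EnglishStemmer();
    for (String word : new String[] { "banks", "counting", "banking" }) {
      stemmer.setCurrent(word); // load the word into the stemmer's buffer
      stemmer.stem();           // stem the buffer in place
      System.out.println(word + " -> " + stemmer.getCurrent()); // e.g. "banking -> bank"
    }
  }
}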
Use of org.tartarus.snowball.ext.EnglishStemmer in project snow-owl by b2ihealthcare.
The doExecute method of the class ConceptSuggestionRequest uses the stemmer to collect the most frequent stemmed tokens from a base set of concepts and turns them into a term filter for the suggestion search:
@Override
protected Suggestions doExecute(BranchContext context) throws IOException {
  TermFilter termFilter;
  if (containsKey(TERM)) {
    // An explicit search term was given; match it, ignoring stopwords
    if (containsKey(MIN_OCCURENCE_COUNT)) {
      termFilter = TermFilter.minTermMatch(getString(TERM), (Integer) get(MIN_OCCURENCE_COUNT)).withIgnoreStopwords();
    } else {
      termFilter = TermFilter.defaultTermMatch(getString(TERM)).withIgnoreStopwords();
    }
  } else {
    // No explicit term; gather stemmed tokens from the suggestion base set instead
    final Multiset<String> tokenOccurrences = HashMultiset.create();
    final EnglishStemmer stemmer = new EnglishStemmer();
    // Get the suggestion base set of concepts
    final ConceptSearchRequestBuilder baseRequestBuilder = new ConceptSearchRequestBuilder()
        .filterByCodeSystemUri(context.service(ResourceURI.class))
        .setLimit(SCROLL_LIMIT)
        .setLocales(locales());
    if (containsKey(QUERY)) {
      baseRequestBuilder.filterByInclusions(getCollection(QUERY, String.class));
    }
    if (containsKey(MUST_NOT_QUERY)) {
      baseRequestBuilder.filterByExclusions(getCollection(MUST_NOT_QUERY, String.class));
    }
    // Lowercase, tokenize and stem every term of every base concept, counting occurrences
    baseRequestBuilder.stream(context)
        .flatMap(Concepts::stream)
        .flatMap(concept -> getAllTerms(concept).stream())
        .map(term -> term.toLowerCase(Locale.US))
        .flatMap(lowerCaseTerm -> TOKEN_SPLITTER.splitToList(lowerCaseTerm).stream())
        .map(token -> stemToken(stemmer, token))
        .forEach(tokenOccurrences::add);
    // Keep the most frequent tokens, skipping short ones
    topTokens = Multisets.copyHighestCountFirst(tokenOccurrences).elementSet().stream()
        .filter(token -> token.length() > 2)
        .limit(topTokenCount)
        .collect(Collectors.toList());
    int minShouldMatch = containsKey(MIN_OCCURENCE_COUNT)
        ? (Integer) get(MIN_OCCURENCE_COUNT)
        : DEFAULT_MIN_OCCURENCE_COUNT;
    termFilter = TermFilter.minTermMatch(topTokens.stream().collect(Collectors.joining(" ")), minShouldMatch);
  }
  /*
   * Run a search with the top tokens and minimum number of matches, excluding everything
   * that was included previously.
   */
  final Set<String> exclusions = newHashSet();
  exclusions.addAll(getCollection(QUERY, String.class));
  exclusions.addAll(getCollection(MUST_NOT_QUERY, String.class));
  final ConceptSearchRequestBuilder resultRequestBuilder = new ConceptSearchRequestBuilder()
      .filterByCodeSystemUri(context.service(ResourceURI.class))
      .filterByActive(true)
      .filterByTerm(termFilter)
      .setPreferredDisplay(getString(DISPLAY))
      .setLimit(limit())
      .setLocales(locales())
      .setSearchAfter(searchAfter())
      .sortBy(sortBy());
  if (!exclusions.isEmpty()) {
    resultRequestBuilder.filterByExclusions(exclusions);
  }
  final Concepts conceptSuggestions = resultRequestBuilder.build().execute(context);
  return new Suggestions(topTokens, conceptSuggestions.getItems(), conceptSuggestions.getSearchAfter(), limit(), conceptSuggestions.getTotal());
}
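The stemToken helper called in the stream pipeline above is not included in this snippet. A plausible sketch, based on the EnglishStemmer calls shown in the Lucene test earlier (the signature and body are assumptions, not the project's actual code):

// Hypothetical reconstruction of the stemToken helper; the real snow-owl code may differ.
private static String stemToken(EnglishStemmer stemmer, String token) {
  stemmer.setCurrent(token); // load the token into the stemmer's buffer
  stemmer.stem();            // stem the buffer in place
  return stemmer.getCurrent();
}

Because the stemmer mutates internal state on each call, reusing a single instance as the pipeline does is only safe on a sequential stream.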