Search in sources :

Example 1 with EnglishStemmer

Use of org.tartarus.snowball.ext.EnglishStemmer in the project lucene-solr by apache.

From the class TestSnowballPorterFilterFactory, method test.

/**
 * Checks that the "SnowballPorter" token filter factory, configured with
 * language "English", emits the same stems that {@link EnglishStemmer}
 * produces when applied directly to each whitespace-separated word.
 *
 * @throws Exception propagated from the factory or the token stream assertions
 */
public void test() throws Exception {
    final String sentence = "The fledgling banks were counting on a big boom in banking";
    final String[] words = sentence.split("\\s");
    final String[] expectedStems = new String[words.length];

    // Build the expected output by running the stemmer on every word by hand.
    final EnglishStemmer snowball = new EnglishStemmer();
    int position = 0;
    for (String word : words) {
        snowball.setCurrent(word);
        snowball.stem();
        expectedStems[position++] = snowball.getCurrent();
    }

    // Run the same sentence through the tokenizer and the factory-created filter.
    final TokenStream tokenized = whitespaceMockTokenizer(new StringReader(sentence));
    final TokenStream stemmed = tokenFilterFactory("SnowballPorter", "language", "English").create(tokenized);
    assertTokenStreamContents(stemmed, expectedStems);
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), StringReader (java.io.StringReader), Reader (java.io.Reader), EnglishStemmer (org.tartarus.snowball.ext.EnglishStemmer)

Example 2 with EnglishStemmer

Use of org.tartarus.snowball.ext.EnglishStemmer in the project snow-owl by b2ihealthcare.

From the class ConceptSuggestionRequest, method doExecute.

/**
 * Builds a term filter and runs a concept search with it, wrapping the hits in a
 * {@link Suggestions} object.
 * <p>
 * When the TERM option is present the filter is built directly from that term
 * (using a minimum match count if MIN_OCCURENCE_COUNT is also present), ignoring
 * stopwords. Otherwise the terms of a base set of concepts are lower-cased, split
 * into tokens and stemmed; the most frequent stems longer than two characters
 * (at most {@code topTokenCount} of them) are stored in {@code topTokens} and
 * joined with spaces to form the filter.
 *
 * @param context the context used to resolve the {@link ResourceURI} and to run the searches
 * @return the suggestions built from {@code topTokens} and the result of the final search
 * @throws IOException declared by the signature
 */
@Override
protected Suggestions doExecute(BranchContext context) throws IOException {
    final TermFilter termFilter;
    if (!containsKey(TERM)) {
        final EnglishStemmer englishStemmer = new EnglishStemmer();
        // Counts how many times each stemmed token occurs
        final Multiset<String> stemCounts = HashMultiset.create();

        // Base set of concepts from which the tokens are collected
        final ConceptSearchRequestBuilder baseSearch = new ConceptSearchRequestBuilder()
            .filterByCodeSystemUri(context.service(ResourceURI.class))
            .setLimit(SCROLL_LIMIT)
            .setLocales(locales());
        if (containsKey(QUERY)) {
            baseSearch.filterByInclusions(getCollection(QUERY, String.class));
        }
        if (containsKey(MUST_NOT_QUERY)) {
            baseSearch.filterByExclusions(getCollection(MUST_NOT_QUERY, String.class));
        }

        baseSearch.stream(context)
            .flatMap(Concepts::stream)
            .flatMap(baseConcept -> getAllTerms(baseConcept).stream())
            .flatMap(rawTerm -> TOKEN_SPLITTER.splitToList(rawTerm.toLowerCase(Locale.US)).stream())
            .map(rawToken -> stemToken(englishStemmer, rawToken))
            .forEach(stemCounts::add);

        topTokens = Multisets.copyHighestCountFirst(stemCounts)
            .elementSet()
            .stream()
            // skip short tokens
            .filter(stem -> stem.length() > 2)
            .limit(topTokenCount)
            .collect(Collectors.toList());

        int minShouldMatch;
        if (containsKey(MIN_OCCURENCE_COUNT)) {
            minShouldMatch = (Integer) get(MIN_OCCURENCE_COUNT);
        } else {
            minShouldMatch = DEFAULT_MIN_OCCURENCE_COUNT;
        }
        final String joinedTokens = topTokens.stream().collect(Collectors.joining(" "));
        termFilter = TermFilter.minTermMatch(joinedTokens, minShouldMatch);
    } else {
        // NOTE(review): topTokens is not assigned on this path in the visible code;
        // confirm it holds a sensible value when TERM is supplied.
        final String suppliedTerm = getString(TERM);
        if (containsKey(MIN_OCCURENCE_COUNT)) {
            termFilter = TermFilter.minTermMatch(suppliedTerm, (Integer) get(MIN_OCCURENCE_COUNT)).withIgnoreStopwords();
        } else {
            termFilter = TermFilter.defaultTermMatch(suppliedTerm).withIgnoreStopwords();
        }
    }

    // Run a search with the term filter, excluding everything that was
    // included previously (both the QUERY and MUST_NOT_QUERY collections).
    final Set<String> toExclude = newHashSet();
    toExclude.addAll(getCollection(QUERY, String.class));
    toExclude.addAll(getCollection(MUST_NOT_QUERY, String.class));

    final ConceptSearchRequestBuilder suggestionSearch = new ConceptSearchRequestBuilder()
        .filterByCodeSystemUri(context.service(ResourceURI.class))
        .filterByActive(true)
        .filterByTerm(termFilter)
        .setPreferredDisplay(getString(DISPLAY))
        .setLimit(limit())
        .setLocales(locales())
        .setSearchAfter(searchAfter())
        .sortBy(sortBy());
    if (!toExclude.isEmpty()) {
        suggestionSearch.filterByExclusions(toExclude);
    }

    final Concepts matches = suggestionSearch.build().execute(context);
    return new Suggestions(topTokens, matches.getItems(), matches.getSearchAfter(), limit(), matches.getTotal());
}
Also used: Multiset (com.google.common.collect.Multiset), OptionKey (com.b2international.snowowl.core.request.ConceptSearchRequestEvaluator.OptionKey), Set (java.util.Set), IOException (java.io.IOException), Multisets (com.google.common.collect.Multisets), Min (javax.validation.constraints.Min), Collectors (java.util.stream.Collectors), TextConstants (com.b2international.index.compat.TextConstants), List (java.util.List), Concepts (com.b2international.snowowl.core.domain.Concepts), Concept (com.b2international.snowowl.core.domain.Concept), Suggestions (com.b2international.snowowl.core.domain.Suggestions), ImmutableList (com.google.common.collect.ImmutableList), HashMultiset (com.google.common.collect.HashMultiset), Locale (java.util.Locale), EnglishStemmer (org.tartarus.snowball.ext.EnglishStemmer), Sets.newHashSet (com.google.common.collect.Sets.newHashSet), BranchContext (com.b2international.snowowl.core.domain.BranchContext), Splitter (com.google.common.base.Splitter), ResourceURI (com.b2international.snowowl.core.ResourceURI)

Aggregations

EnglishStemmer (org.tartarus.snowball.ext.EnglishStemmer): 2, TextConstants (com.b2international.index.compat.TextConstants): 1, ResourceURI (com.b2international.snowowl.core.ResourceURI): 1, BranchContext (com.b2international.snowowl.core.domain.BranchContext): 1, Concept (com.b2international.snowowl.core.domain.Concept): 1, Concepts (com.b2international.snowowl.core.domain.Concepts): 1, Suggestions (com.b2international.snowowl.core.domain.Suggestions): 1, OptionKey (com.b2international.snowowl.core.request.ConceptSearchRequestEvaluator.OptionKey): 1, Splitter (com.google.common.base.Splitter): 1, HashMultiset (com.google.common.collect.HashMultiset): 1, ImmutableList (com.google.common.collect.ImmutableList): 1, Multiset (com.google.common.collect.Multiset): 1, Multisets (com.google.common.collect.Multisets): 1, Sets.newHashSet (com.google.common.collect.Sets.newHashSet): 1, IOException (java.io.IOException): 1, Reader (java.io.Reader): 1, StringReader (java.io.StringReader): 1, List (java.util.List): 1, Locale (java.util.Locale): 1, Set (java.util.Set): 1