Example use of de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable in the project dkpro-tc by dkpro.
Class NGramUtils, method getAnnotationNgrams.
/**
 * Counts the n-grams built from the tokens covered by {@code focusAnnotation}.
 * <p>
 * If the focus annotation covers at least one {@code Sentence}, n-grams are built separately
 * from the tokens of each covered sentence. Otherwise they are built from all tokens covered
 * by the focus annotation itself.
 *
 * @param jcas the CAS in which covered sentences are looked up
 * @param focusAnnotation the annotation whose covered sentences/tokens are used
 * @param lowerCaseNGrams if {@code true}, each n-gram is passed through {@code lower(...)}
 *        before filtering and counting
 * @param filterPartialMatches forwarded unchanged to {@code passesNgramFilter}
 * @param minN lower size bound handed to {@code NGramStringListIterable}
 * @param maxN upper size bound handed to {@code NGramStringListIterable}
 * @param stopwords forwarded unchanged to {@code passesNgramFilter}
 * @return a frequency distribution over the accepted n-grams, each joined into one string
 *         with {@code NGRAM_GLUE}
 */
public static FrequencyDistribution<String> getAnnotationNgrams(JCas jcas, Annotation focusAnnotation,
        boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, Set<String> stopwords) {
    FrequencyDistribution<String> annoNgrams = new FrequencyDistribution<String>();

    // Look the covered sentences up once; the result drives both the check and the iteration.
    List<Sentence> sentences = selectCovered(jcas, Sentence.class, focusAnnotation);

    if (sentences.isEmpty()) {
        // No covered sentence: fall back to all tokens covered by the focus annotation.
        countCoveredTokenNgrams(annoNgrams, focusAnnotation, lowerCaseNGrams, filterPartialMatches,
                minN, maxN, stopwords);
    } else {
        // Work sentence by sentence, so every n-gram is built from the tokens of a single sentence.
        for (Sentence s : sentences) {
            countCoveredTokenNgrams(annoNgrams, s, lowerCaseNGrams, filterPartialMatches,
                    minN, maxN, stopwords);
        }
    }
    return annoNgrams;
}

/**
 * Adds the n-grams of the tokens covered by {@code covering} to {@code ngramCounts}:
 * optionally lower-cases each n-gram, keeps only those accepted by
 * {@code passesNgramFilter}, and increments the count of the glued n-gram string.
 */
private static void countCoveredTokenNgrams(FrequencyDistribution<String> ngramCounts, Annotation covering,
        boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, Set<String> stopwords) {
    for (List<String> ngram : new NGramStringListIterable(toText(selectCovered(Token.class, covering)), minN, maxN)) {
        if (lowerCaseNGrams) {
            ngram = lower(ngram);
        }
        if (passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
            ngramCounts.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
}
Aggregations