Use of org.dkpro.tc.features.ngram.util.SkipNgramStringListIterable in project dkpro-tc by dkpro.
From the class SkipWordNGramMC, method getDocumentSkipNgrams.
/**
 * Counts the word skip-n-grams found in the sentences covered by {@code anno}.
 *
 * <p>For each covered {@code Sentence}, the texts of its covered {@code Token}s are fed to a
 * {@code SkipNgramStringListIterable}. Every n-gram it yields is optionally lower-cased,
 * checked with {@code passesNgramFilter}, and, if accepted, joined with {@code NGRAM_GLUE}
 * and counted.
 *
 * @param jcas                 the CAS holding the annotations
 * @param anno                 the annotation whose covered sentences are processed
 * @param lowerCaseNGrams      if {@code true}, each n-gram is passed through {@code lower}
 *                             before filtering and counting
 * @param filterPartialMatches forwarded unchanged to {@code passesNgramFilter}
 * @param minN                 forwarded to {@code SkipNgramStringListIterable}
 * @param maxN                 forwarded to {@code SkipNgramStringListIterable}
 * @param skipN                forwarded to {@code SkipNgramStringListIterable}
 * @param stopwords            forwarded unchanged to {@code passesNgramFilter}
 * @return a frequency distribution keyed by the glued n-gram string
 */
public static FrequencyDistribution<String> getDocumentSkipNgrams(JCas jcas, Annotation anno, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, int skipN, Set<String> stopwords) {
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();

    for (Sentence sentence : selectCovered(jcas, Sentence.class, anno)) {
        SkipNgramStringListIterable sentenceNgrams = new SkipNgramStringListIterable(
                toText(selectCovered(Token.class, sentence)), minN, maxN, skipN);

        for (List<String> candidate : sentenceNgrams) {
            List<String> normalized = candidate;
            if (lowerCaseNGrams) {
                normalized = lower(candidate);
            }

            // Rejected n-grams are not counted.
            if (!passesNgramFilter(normalized, stopwords, filterPartialMatches)) {
                continue;
            }

            counts.inc(StringUtils.join(normalized, NGRAM_GLUE));
        }
    }

    return counts;
}
Use of org.dkpro.tc.features.ngram.util.SkipNgramStringListIterable in project dkpro-tc by dkpro.
From the class SkipCharacterNGramMC, method getCharacterSkipNgrams.
/**
 * Counts the character skip-n-grams of every {@code Token} covered by {@code target}.
 *
 * <p>Each token's text is turned into a sequence of one-character strings, framed by a
 * leading {@code "^"} and a trailing {@code "$"} marker, and fed to a
 * {@code SkipNgramStringListIterable}. Every n-gram it yields is optionally lower-cased,
 * joined with {@code NGRAM_GLUE}, and counted.
 *
 * @param jcas            the CAS holding the annotations
 * @param target          the annotation whose covered tokens are processed
 * @param lowerCaseNGrams if {@code true}, each n-gram is passed through {@code lower}
 *                        before counting
 * @param minN            forwarded to {@code SkipNgramStringListIterable}
 * @param maxN            forwarded to {@code SkipNgramStringListIterable}
 * @param skipN           forwarded to {@code SkipNgramStringListIterable}
 * @return a frequency distribution keyed by the glued character n-gram string
 */
public static FrequencyDistribution<String> getCharacterSkipNgrams(JCas jcas, Annotation target, boolean lowerCaseNGrams, int minN, int maxN, int skipN) {
    FrequencyDistribution<String> charNgrams = new FrequencyDistribution<String>();
    for (Token t : selectCovered(jcas, Token.class, target)) {
        String tokenText = t.getCoveredText();
        int length = tokenText.length();

        // Build the framed sequence explicitly instead of via tokenText.split("").
        // The old code relied on split("") returning a leading empty string that the
        // "^" marker then replaced. Since Java 8 no such leading element is produced,
        // so the marker overwrote the token's first character.
        String[] chars = new String[length + 2];
        chars[0] = "^";
        for (int i = 0; i < length; i++) {
            // One element per UTF-16 char of the token text.
            chars[i + 1] = String.valueOf(tokenText.charAt(i));
        }
        chars[length + 1] = "$";

        for (List<String> ngram : new SkipNgramStringListIterable(chars, minN, maxN, skipN)) {
            if (lowerCaseNGrams) {
                ngram = lower(ngram);
            }
            String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
            charNgrams.inc(ngramString);
        }
    }
    return charNgrams;
}
Aggregations