use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class NGramUtils method getDocumentNgrams.
/**
 * Counts the n-grams found inside a target span, using the text of an arbitrary annotation type
 * (intended for Lemma, Stem, etc.) as the n-gram units. The span is processed sentence by
 * sentence, so no n-gram crosses a sentence boundary.
 *
 * @param jcas
 *            the JCas the annotations are read from
 * @param aTarget
 *            target annotation; only sentences covered by it are processed
 * @param lowerCaseNGrams
 *            if {@code true}, every n-gram is passed through {@code lower} before filtering
 * @param filterPartialMatches
 *            handed to the n-gram filter together with the stopwords
 * @param minN
 *            minimal n
 * @param maxN
 *            maximal n
 * @param stopwords
 *            set of stopwords handed to the n-gram filter
 * @param annotationClass
 *            annotation type that supplies the n-gram units
 * @return a frequency distribution over the n-grams, each joined with {@code NGRAM_GLUE}
 *
 * @throws TextClassificationException
 *             when an exception occurs
 */
public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas, Annotation aTarget,
        boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN,
        Set<String> stopwords, Class<? extends Annotation> annotationClass)
    throws TextClassificationException
{
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();

    for (Sentence sentence : selectCovered(jcas, Sentence.class, aTarget)) {
        List<String> units = valuesToText(jcas, sentence, annotationClass.getName());

        for (List<String> candidate : new NGramStringListIterable(units, minN, maxN)) {
            List<String> ngram = lowerCaseNGrams ? lower(candidate) : candidate;

            // Skip n-grams rejected by the stopword filter.
            if (!passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                continue;
            }
            counts.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }

    return counts;
}
use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class DeepLearningMajorityClass2OutcomeReport method determineMajorityClass.
/**
 * Reads the given file line by line, splits every line at single space characters, counts each
 * resulting token in a frequency distribution, and stores the sample returned by
 * {@code getSampleWithMaxFreq()} in {@code majorityClass}.
 *
 * @param f
 *            the file to read; its bytes are decoded as UTF-8
 * @throws Exception
 *             if the file cannot be opened, read, or closed
 */
private void determineMajorityClass(File f) throws Exception {
    FrequencyDistribution<String> fd = new FrequencyDistribution<>();

    // try-with-resources closes the reader (and the streams it wraps) on every exit path;
    // the Charset constant avoids looking the encoding up by name at runtime.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(
            new FileInputStream(f), java.nio.charset.StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            for (String v : line.split(" ")) {
                fd.addSample(v, 1);
            }
        }
    }

    majorityClass = fd.getSampleWithMaxFreq();
}
use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class PosNGramMC method sentenceBasedDistribution.
/**
 * Counts POS n-grams within the focus annotation, one sentence at a time, so that no n-gram
 * crosses a sentence boundary.
 *
 * @param jcas
 *            the JCas the annotations are read from
 * @param focus
 *            annotation whose covered sentences are processed
 * @param useCanonical
 *            if {@code true}, the simple class name of each POS annotation is used as the tag;
 *            otherwise its {@code getPosValue()}
 * @param minN
 *            minimal n
 * @param maxN
 *            maximal n
 * @return a frequency distribution over the POS n-grams, each joined with {@code NGRAM_GLUE}
 */
private static FrequencyDistribution<String> sentenceBasedDistribution(JCas jcas, Annotation focus, boolean useCanonical, int minN, int maxN) {
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();

    for (Sentence sentence : selectCovered(jcas, Sentence.class, focus)) {
        // Collect the tag sequence of this sentence.
        List<String> tags = new ArrayList<String>();
        for (POS pos : selectCovered(jcas, POS.class, sentence)) {
            tags.add(useCanonical ? pos.getClass().getSimpleName() : pos.getPosValue());
        }

        String[] tagArray = tags.toArray(new String[tags.size()]);
        for (List<String> ngram : new NGramStringListIterable(tagArray, minN, maxN)) {
            String glued = StringUtils.join(ngram, NGRAM_GLUE);
            counts.inc(glued);
        }
    }

    return counts;
}
use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class PosNGramMC method documentBasedDistribution.
/**
 * Counts POS n-grams over the whole focus annotation as one tag sequence, without splitting it
 * into sentences.
 *
 * @param jcas
 *            the JCas the annotations are read from
 * @param focus
 *            annotation whose covered POS annotations are processed
 * @param useCanonical
 *            if {@code true}, the simple class name of each POS annotation is used as the tag;
 *            otherwise its {@code getPosValue()}
 * @param minN
 *            minimal n
 * @param maxN
 *            maximal n
 * @return a frequency distribution over the POS n-grams, each joined with {@code NGRAM_GLUE}
 */
private static FrequencyDistribution<String> documentBasedDistribution(JCas jcas, Annotation focus, boolean useCanonical, int minN, int maxN) {
    // Collect the tag sequence of the entire focus span.
    List<String> tags = new ArrayList<String>();
    for (POS pos : selectCovered(jcas, POS.class, focus)) {
        tags.add(useCanonical ? pos.getClass().getSimpleName() : pos.getPosValue());
    }
    String[] tagArray = tags.toArray(new String[tags.size()]);

    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    for (List<String> ngram : new NGramStringListIterable(tagArray, minN, maxN)) {
        String glued = StringUtils.join(ngram, NGRAM_GLUE);
        counts.inc(glued);
    }
    return counts;
}
use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class SkipWordNGramMC method getDocumentSkipNgrams.
/**
 * Counts skip-n-grams built from the token texts of every sentence covered by the given
 * annotation. Sentences are processed independently of each other.
 *
 * @param jcas
 *            the JCas the sentences are read from
 * @param anno
 *            annotation whose covered sentences are processed
 * @param lowerCaseNGrams
 *            if {@code true}, every n-gram is passed through {@code lower} before filtering
 * @param filterPartialMatches
 *            handed to the n-gram filter together with the stopwords
 * @param minN
 *            minimal n
 * @param maxN
 *            maximal n
 * @param skipN
 *            skip size passed to the skip-n-gram iterable
 * @param stopwords
 *            set of stopwords handed to the n-gram filter
 * @return a frequency distribution over the skip-n-grams, each joined with {@code NGRAM_GLUE}
 */
public static FrequencyDistribution<String> getDocumentSkipNgrams(JCas jcas, Annotation anno,
        boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, int skipN,
        Set<String> stopwords)
{
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();

    for (Sentence sentence : selectCovered(jcas, Sentence.class, anno)) {
        for (List<String> candidate : new SkipNgramStringListIterable(
                toText(selectCovered(Token.class, sentence)), minN, maxN, skipN)) {
            List<String> ngram = lowerCaseNGrams ? lower(candidate) : candidate;

            // Skip n-grams rejected by the stopword filter.
            if (!passesNgramFilter(ngram, stopwords, filterPartialMatches)) {
                continue;
            }
            counts.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }

    return counts;
}
Aggregations