use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class IdfPairMetaCollector method process.
/**
 * Collects the n-grams of both parts of a pair document and adds them as fields to the
 * document that is currently being built, then calls {@code writeToIndex()}.
 * <p>
 * Unlike a plain term count, every key reported for a view contributes exactly one sample
 * to the merged distribution, so the merged count of an n-gram reflects in how many of the
 * two views it was found rather than how often it occurs in the text.
 *
 * @param jcas the CAS that is expected to provide the views {@code PART_ONE} and
 *             {@code PART_TWO}
 * @throws AnalysisEngineProcessException if a view cannot be obtained, n-gram extraction
 *             fails, or writing to the index raises an {@link IOException}
 */
@Override
public void process(JCas jcas) throws AnalysisEngineProcessException {
    JCas firstPart;
    JCas secondPart;
    try {
        firstPart = jcas.getView(PART_ONE);
        secondPart = jcas.getView(PART_TWO);
    } catch (Exception e) {
        throw new AnalysisEngineProcessException(e);
    }

    FrequencyDistribution<String> firstPartNGrams;
    FrequencyDistribution<String> secondPartNGrams;
    try {
        firstPartNGrams = getNgramsFD(firstPart);
        secondPartNGrams = getNgramsFD(secondPart);
    } catch (TextClassificationException e) {
        throw new AnalysisEngineProcessException(e);
    }

    // This is different than other metacollectors: only the presence of an n-gram in a
    // view is recorded (one sample per key and view), not its frequency inside the view.
    FrequencyDistribution<String> mergedNGrams = new FrequencyDistribution<String>();
    for (String firstKey : firstPartNGrams.getKeys()) {
        mergedNGrams.addSample(firstKey, 1);
    }
    for (String secondKey : secondPartNGrams.getKeys()) {
        mergedNGrams.addSample(secondKey, 1);
    }

    // Add one field per recorded sample of each n-gram.
    for (String ngram : mergedNGrams.getKeys()) {
        long occurrences = mergedNGrams.getCount(ngram);
        for (long added = 0; added < occurrences; added++) {
            currentDocument.add(new Field(getFieldName(), ngram, fieldType));
        }
    }

    try {
        writeToIndex();
    } catch (IOException e) {
        throw new AnalysisEngineProcessException(e);
    }
}
use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class SkipCharacterNGramMC method getCharacterSkipNgrams.
/**
 * Counts the character skip-n-grams of every token covered by {@code target}.
 * <p>
 * Each token is turned into a sequence of one-character strings framed by the begin marker
 * {@code "^"} and the end marker {@code "$"}; the skip-n-grams of that sequence are joined
 * with {@code NGRAM_GLUE} and counted.
 *
 * @param jcas            the CAS the tokens are selected from
 * @param target          the annotation whose covered tokens are processed
 * @param lowerCaseNGrams whether each n-gram is passed through {@code lower(...)} before counting
 * @param minN            minimum n-gram size handed to the skip-n-gram iterator
 * @param maxN            maximum n-gram size handed to the skip-n-gram iterator
 * @param skipN           skip size handed to the skip-n-gram iterator
 * @return the frequency distribution of the joined n-gram strings
 */
public static FrequencyDistribution<String> getCharacterSkipNgrams(JCas jcas, Annotation target, boolean lowerCaseNGrams, int minN, int maxN, int skipN) {
    FrequencyDistribution<String> charNgrams = new FrequencyDistribution<String>();
    for (Token t : selectCovered(jcas, Token.class, target)) {
        String tokenText = t.getCoveredText();
        int length = tokenText.length();

        // Build "^", c1, ..., cn, "$" directly from the characters. The previous
        // implementation relied on tokenText.split("") returning a leading empty string
        // that was then overwritten with "^"; since Java 8 split("") no longer produces
        // that element, so the first character of every token was overwritten instead.
        String[] chars = new String[length + 2];
        chars[0] = "^";
        for (int i = 0; i < length; i++) {
            chars[i + 1] = String.valueOf(tokenText.charAt(i));
        }
        chars[length + 1] = "$";

        for (List<String> ngram : new SkipNgramStringListIterable(chars, minN, maxN, skipN)) {
            if (lowerCaseNGrams) {
                ngram = lower(ngram);
            }
            String ngramString = StringUtils.join(ngram, NGRAM_GLUE);
            charNgrams.inc(ngramString);
        }
    }
    return charNgrams;
}
use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class NGramUtils method getAnnotationNgrams.
/**
 * Counts the token n-grams found inside {@code focusAnnotation}.
 * <p>
 * If the focus annotation covers at least one sentence, n-grams are built sentence by
 * sentence; if not, they are built from all tokens covered by the focus annotation itself.
 *
 * @param jcas                 the CAS the sentences are selected from
 * @param focusAnnotation      the annotation whose content is processed
 * @param lowerCaseNGrams      whether each n-gram is passed through {@code lower(...)}
 *                             before filtering and counting
 * @param filterPartialMatches forwarded to {@code passesNgramFilter(...)}
 * @param minN                 minimum n-gram size handed to the n-gram iterator
 * @param maxN                 maximum n-gram size handed to the n-gram iterator
 * @param stopwords            forwarded to {@code passesNgramFilter(...)}
 * @return the frequency distribution of the n-grams that passed the filter, joined with
 *         {@code NGRAM_GLUE}
 */
public static FrequencyDistribution<String> getAnnotationNgrams(JCas jcas, Annotation focusAnnotation, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, Set<String> stopwords) {
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    List<Sentence> sentences = selectCovered(jcas, Sentence.class, focusAnnotation);

    // No sentence annotations available: treat the focus annotation as one unit.
    if (sentences.isEmpty()) {
        countTokenNgramsWithin(focusAnnotation, counts, lowerCaseNGrams, filterPartialMatches, minN, maxN, stopwords);
        return counts;
    }

    for (Sentence sentence : sentences) {
        countTokenNgramsWithin(sentence, counts, lowerCaseNGrams, filterPartialMatches, minN, maxN, stopwords);
    }
    return counts;
}

/**
 * Adds the filtered token n-grams of a single covering annotation to {@code counts}.
 */
private static void countTokenNgramsWithin(Annotation span, FrequencyDistribution<String> counts, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, Set<String> stopwords) {
    for (List<String> ngram : new NGramStringListIterable(toText(selectCovered(Token.class, span)), minN, maxN)) {
        List<String> candidate = lowerCaseNGrams ? lower(ngram) : ngram;
        if (!passesNgramFilter(candidate, stopwords, filterPartialMatches)) {
            continue;
        }
        counts.inc(StringUtils.join(candidate, NGRAM_GLUE));
    }
}
use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class LibsvmDataFormatBaselineMajorityClassIdReport method determineMajorityClass.
/**
 * Reads {@code file} line by line and stores the most frequent leading column in
 * {@code majorityClass}.
 * <p>
 * Empty lines are skipped. For every other line, the text before the first tab is counted
 * as one sample; the sample with the highest count becomes the majority class.
 *
 * @param file the UTF-8 encoded, tab-separated file to inspect
 * @throws Exception if the file cannot be opened, read, or closed
 */
private void determineMajorityClass(File file) throws Exception {
    FrequencyDistribution<String> fd = new FrequencyDistribution<>();
    // try-with-resources closes the reader and the streams it wraps on every exit path.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), java.nio.charset.StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.isEmpty()) {
                continue;
            }
            String[] split = line.split("\t");
            fd.addSample(split[0], 1);
        }
    }
    majorityClass = fd.getSampleWithMaxFreq();
}
use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class CrfSuiteBaselineMajorityClassIdReport method determineMajorityClass.
/**
 * Determines the majority class of the given file and stores it in {@code majorityClass}.
 * <p>
 * The file is read as UTF-8. Blank (zero-length) lines are ignored; for each remaining
 * line the first tab-separated column is counted once, and the column value with the
 * highest count is kept.
 *
 * @param file the tab-separated file to inspect
 * @throws Exception if the file cannot be opened, read, or closed
 */
private void determineMajorityClass(File file) throws Exception {
    FrequencyDistribution<String> fd = new FrequencyDistribution<>();
    // The reader is declared as a resource so it is closed even when reading fails.
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), java.nio.charset.StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            if (line.isEmpty()) {
                continue;
            }
            String[] split = line.split("\t");
            fd.addSample(split[0], 1);
        }
    }
    majorityClass = fd.getSampleWithMaxFreq();
}
Aggregations