Use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
From the class LuceneNGramCPFE, method getTopNgramsCombo.
/**
 * Reads every term of {@code fieldName} from the Lucene index at {@code luceneDir}, keeps the
 * combined ngrams that pass the filter below, and returns them with their total term
 * frequencies.
 * <p>
 * Each term is split on {@link ComboUtils#JOINT} into two halves. A term is kept only if
 * the first half is contained in both {@code topKSetView1} and {@code topKSet}, the second
 * half is contained in both {@code topKSetView2} and {@code topKSet}, and the total number
 * of "_"-separated pieces in both halves lies in
 * {@code [ngramMinNCombo, ngramMaxNCombo]}.
 * <p>
 * Candidates are collected in a queue bounded to {@code topNgramThreshold} entries; which
 * entries survive when the bound is exceeded depends on the ordering of
 * {@code TermFreqTuple} (not visible here).
 *
 * @param topNgramThreshold
 *            maximum number of combined ngrams to retain
 * @param fieldName
 *            name of the index field whose terms are inspected
 * @return frequency distribution holding the retained terms and their total term frequency
 * @throws ResourceInitializationException
 *             wraps any exception raised while opening, reading or closing the index,
 *             including the failure on a term that does not split into two halves
 */
private FrequencyDistribution<String> getTopNgramsCombo(int topNgramThreshold, String fieldName) throws ResourceInitializationException {
    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();
    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();
    // try-with-resources closes the reader first and then the directory on every path;
    // previously neither was ever closed, leaking index file handles on each call.
    try (FSDirectory directory = FSDirectory.open(luceneDir);
            IndexReader reader = DirectoryReader.open(directory)) {
        Fields fields = MultiFields.getFields(reader);
        Terms terms = (fields == null) ? null : fields.terms(fieldName);
        if (terms != null) {
            TermsEnum termsEnum = terms.iterator(null);
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                long freq = termsEnum.totalTermFreq();
                // add conditions here, like ngram1 is in most freq ngrams1...
                // Split once; a term without a second half still fails here and is
                // reported through the catch block below, as before.
                String[] halves = term.split(ComboUtils.JOINT);
                String combo1 = halves[0];
                String combo2 = halves[1];
                int combinedSize = combo1.split("_").length + combo2.split("_").length;
                if (topKSetView1.contains(combo1) && topKSet.contains(combo1) && topKSetView2.contains(combo2) && topKSet.contains(combo2) && combinedSize <= ngramMaxNCombo && combinedSize >= ngramMinNCombo) {
                    topN.add(new TermFreqTuple(term, freq));
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    // Drain the bounded queue into the result distribution.
    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }
    return topNGrams;
}
Use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
From the class ComboUtils, method getMultipleViewNgrams.
/**
 * Gathers the ngrams of several views into a single frequency distribution.
 * <p>
 * For every view exactly one {@link TextClassificationTarget} is selected. When
 * {@code classificationUnit} is {@code null} the ngrams are taken from
 * {@code NGramUtils.getDocumentNgrams} for that target, otherwise from
 * {@code NGramUtils.getAnnotationNgrams} for the given unit. Each key of the per-view result
 * is then added, with its count, to the returned distribution.
 *
 * @param jcases
 *            the views to process
 * @param classificationUnit
 *            annotation to restrict ngram extraction to, or {@code null} to use the view's
 *            classification target
 * @param ngramLowerCase
 *            passed through to the ngram extraction
 * @param filterPartialStopwords
 *            passed through to the ngram extraction
 * @param ngramMinN
 *            passed through to the ngram extraction
 * @param ngramMaxN
 *            passed through to the ngram extraction
 * @param stopwords
 *            passed through to the ngram extraction
 * @return distribution holding the ngram samples of all views
 * @throws TextClassificationException
 *             declared by this method; presumably raised by the ngram extraction
 */
public static FrequencyDistribution<String> getMultipleViewNgrams(List<JCas> jcases, Annotation classificationUnit, boolean ngramLowerCase, boolean filterPartialStopwords, int ngramMinN, int ngramMaxN, Set<String> stopwords) throws TextClassificationException {
    FrequencyDistribution<String> merged = new FrequencyDistribution<String>();
    for (JCas jcas : jcases) {
        // Selected for every view, regardless of which extraction path is taken.
        TextClassificationTarget target = JCasUtil.selectSingle(jcas, TextClassificationTarget.class);
        FrequencyDistribution<String> perView = (classificationUnit == null)
                ? NGramUtils.getDocumentNgrams(jcas, target, ngramLowerCase, filterPartialStopwords, ngramMinN, ngramMaxN, stopwords, Token.class)
                : NGramUtils.getAnnotationNgrams(jcas, classificationUnit, ngramLowerCase, filterPartialStopwords, ngramMinN, ngramMaxN, stopwords);
        // Copy key by key: this is a workaround because there is no method to combine two
        // frequency distributions.
        for (String ngram : perView.getKeys()) {
            merged.addSample(ngram, perView.getCount(ngram));
        }
    }
    return merged;
}
Use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
From the class WekaBaselineMajorityClassIdReport, method determineMajorityClass.
/**
 * Scans the given file line by line, counts the label found on each data line and stores the
 * most frequent one in {@code majorityClass}.
 * <p>
 * Empty lines and lines starting with "@" are skipped. On every other line the last
 * comma-separated value is taken as the label, unless {@code hasInstanceWeighting} reports
 * true for it, in which case the second-to-last value is used instead.
 *
 * @param file
 *            the file to read, decoded as UTF-8
 * @throws Exception
 *             if the file cannot be opened or read
 */
private void determineMajorityClass(File file) throws Exception {
    FrequencyDistribution<String> labelCounts = new FrequencyDistribution<>();
    BufferedReader in = null;
    try {
        in = new BufferedReader(new InputStreamReader(new FileInputStream(file), "utf-8"));
        for (String row = in.readLine(); row != null; row = in.readLine()) {
            boolean dataRow = !row.isEmpty() && !row.startsWith("@");
            if (dataRow) {
                String[] cells = row.split(",");
                int lastIdx = cells.length - 1;
                String label = cells[lastIdx];
                if (hasInstanceWeighting(label)) {
                    // The final value is treated as a weight; the label sits just before it.
                    label = cells[lastIdx - 1];
                }
                labelCounts.addSample(label, 1);
            }
        }
    } finally {
        // Close failures are deliberately ignored.
        IOUtils.closeQuietly(in);
    }
    majorityClass = labelCounts.getSampleWithMaxFreq();
}
Aggregations