use of org.dkpro.tc.features.ngram.util.TermFreqTuple in project dkpro-tc by dkpro.
the class LuceneNGramPFE method getTopNgrams.
/**
 * Reads all terms of one field from the Lucene index at {@code luceneDir} and returns a
 * bounded selection of them together with their total term frequencies.
 *
 * Every term is offered to a {@link MinMaxPriorityQueue} capped at
 * {@code topNgramThreshold} entries; which terms survive once the cap is reached is
 * determined by the ordering of {@link TermFreqTuple}.
 *
 * @param topNgramThreshold
 *            maximum number of terms kept in the queue
 * @param fieldName
 *            name of the index field whose terms are read
 * @return a distribution holding each retained term with its total term frequency; empty if
 *         the index exposes no fields or the field has no terms
 * @throws ResourceInitializationException
 *             if opening, reading or closing the index fails
 */
private FrequencyDistribution<String> getTopNgrams(int topNgramThreshold, String fieldName) throws ResourceInitializationException {
    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();
    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        try {
            Fields fields = MultiFields.getFields(reader);
            if (fields != null) {
                Terms terms = fields.terms(fieldName);
                if (terms != null) {
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text = null;
                    while ((text = termsEnum.next()) != null) {
                        String term = text.utf8ToString();
                        long freq = termsEnum.totalTermFreq();
                        topN.add(new TermFreqTuple(term, freq));
                    }
                }
            }
        } finally {
            // Release the index reader on every path (success, empty index, or failure);
            // previously it was never closed. A failure to close is wrapped by the catch below.
            reader.close();
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    // Drain the queue into the result; the size is captured first because poll() shrinks it.
    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }
    return topNGrams;
}
use of org.dkpro.tc.features.ngram.util.TermFreqTuple in project dkpro-tc by dkpro.
the class LuceneFeatureExtractorBase method getTopNgrams.
/**
 * Returns the n-grams selected from the Lucene index, rebuilding the selection when no cached
 * result exists or when {@code forceRereadFromIndex} is set.
 *
 * A rebuild resets {@code maxNgramSum}, refills {@code topN} via {@code readIndex()}, and
 * keeps only those entries whose frequency divided by {@code maxNgramSum} is at least
 * {@code ngramFreqThreshold}. The outcome is stored in {@code topNGrams} and reported through
 * {@code logSelectionProcess}.
 *
 * @return the cached or freshly built frequency distribution
 * @throws ResourceInitializationException
 *             if reading the index fails
 */
@Override
protected FrequencyDistribution<String> getTopNgrams() throws ResourceInitializationException {
    boolean cachedResultUsable = topNGrams != null && !forceRereadFromIndex;
    if (cachedResultUsable) {
        return topNGrams;
    }

    // readIndex() accumulates into maxNgramSum, so it has to start from zero.
    maxNgramSum = 0;
    topN = readIndex();
    topNGrams = new FrequencyDistribution<String>();

    // Empty the queue one entry at a time, keeping only sufficiently frequent n-grams.
    while (!topN.isEmpty()) {
        TermFreqTuple candidate = topN.poll();
        long count = candidate.getFreq();
        double share = ((double) count) / maxNgramSum;
        if (share >= ngramFreqThreshold) {
            topNGrams.addSample(candidate.getTerm(), count);
        }
    }

    logSelectionProcess(topNGrams.getB());
    return topNGrams;
}
use of org.dkpro.tc.features.ngram.util.TermFreqTuple in project dkpro-tc by dkpro.
the class LuceneFeatureExtractorBase method readIndex.
/**
 * Scans the terms of the field returned by {@code getFieldName()} in the Lucene index at
 * {@code luceneDir} and collects those accepted by {@code passesScreening}.
 *
 * Accepted terms are offered to a {@link MinMaxPriorityQueue} capped at {@code getTopN()}
 * entries, and the total term frequency of every accepted term (whether or not it stays in
 * the queue) is added to {@code maxNgramSum}.
 *
 * @return the bounded queue of accepted terms; empty if the index exposes no fields or the
 *         field has no terms
 * @throws ResourceInitializationException
 *             if opening or reading the index fails
 */
private MinMaxPriorityQueue<TermFreqTuple> readIndex() throws ResourceInitializationException {
    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(getTopN()).create();
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        try {
            Fields fields = MultiFields.getFields(reader);
            if (fields == null) {
                return topN;
            }
            Terms terms = fields.terms(getFieldName());
            if (terms == null) {
                return topN;
            }
            TermsEnum termsEnum = terms.iterator(null);
            BytesRef text = null;
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                long freq = termsEnum.totalTermFreq();
                if (passesScreening(term)) {
                    topN.add(new TermFreqTuple(term, freq));
                    maxNgramSum += freq;
                }
            }
        } finally {
            // Single release point for all paths. Previously the reader leaked when an
            // exception was thrown while iterating the terms. Close failures are ignored,
            // as the early-return paths already did.
            IOUtils.closeQuietly(reader);
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    return topN;
}
use of org.dkpro.tc.features.ngram.util.TermFreqTuple in project dkpro-tc by dkpro.
the class LuceneNGramCPFE method getTopNgramsCombo.
/**
 * Reads the combined n-gram terms of one field from the Lucene index at {@code luceneDir}
 * and returns a bounded selection of those that pass the combination filter.
 *
 * Each term is split on {@code ComboUtils.JOINT} into two parts. A term is kept only if the
 * first part is contained in both {@code topKSetView1} and {@code topKSet}, the second part
 * is contained in both {@code topKSetView2} and {@code topKSet}, and the combined number of
 * "_"-separated tokens of both parts lies within
 * [{@code ngramMinNCombo}, {@code ngramMaxNCombo}]. Kept terms are offered to a
 * {@link MinMaxPriorityQueue} capped at {@code topNgramThreshold} entries.
 *
 * @param topNgramThreshold
 *            maximum number of terms kept in the queue
 * @param fieldName
 *            name of the index field whose terms are read
 * @return a distribution holding each retained term with its total term frequency; empty if
 *         the index exposes no fields or the field has no terms
 * @throws ResourceInitializationException
 *             if opening, reading or closing the index fails, or if a term does not split
 *             into two parts
 */
private FrequencyDistribution<String> getTopNgramsCombo(int topNgramThreshold, String fieldName) throws ResourceInitializationException {
    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();
    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();
    try {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        try {
            Fields fields = MultiFields.getFields(reader);
            if (fields != null) {
                Terms terms = fields.terms(fieldName);
                if (terms != null) {
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text = null;
                    while ((text = termsEnum.next()) != null) {
                        String term = text.utf8ToString();
                        long freq = termsEnum.totalTermFreq();
                        // Split once and reuse the parts; a term without a second part
                        // still fails here and is reported through the catch below.
                        String[] parts = term.split(ComboUtils.JOINT);
                        String combo1 = parts[0];
                        String combo2 = parts[1];
                        int combinedSize = combo1.split("_").length + combo2.split("_").length;
                        if (topKSetView1.contains(combo1) && topKSet.contains(combo1) && topKSetView2.contains(combo2) && topKSet.contains(combo2) && combinedSize <= ngramMaxNCombo && combinedSize >= ngramMinNCombo) {
                            topN.add(new TermFreqTuple(term, freq));
                        }
                    }
                }
            }
        } finally {
            // Release the index reader on every path; previously it was never closed.
            // A failure to close is wrapped by the catch below.
            reader.close();
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    // Drain the queue into the result; the size is captured first because poll() shrinks it.
    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }
    return topNGrams;
}
Aggregations