use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class KeywordNGramUtils method getMultipleViewKeywordNgrams.
/**
 * Collects the keyword ngrams of several views and sums them into one frequency distribution.
 * <p>
 * For every view, the {@link TextClassificationTarget} selected from that view is handed to
 * {@code getDocumentKeywordNgrams} together with the remaining arguments; the resulting counts
 * are then added, key by key, to the combined distribution.
 *
 * @param jcases
 *            the views to process
 * @param minN
 *            minimum ngram length
 * @param maxN
 *            maximum ngram length
 * @param markSentenceBoundary
 *            passed through to {@code getDocumentKeywordNgrams}
 * @param markSentenceLocation
 *            passed through to {@code getDocumentKeywordNgrams}
 * @param includeCommas
 *            passed through to {@code getDocumentKeywordNgrams}
 * @param keywords
 *            the keyword set
 * @return the summed ngram counts over all views
 */
public static FrequencyDistribution<String> getMultipleViewKeywordNgrams(List<JCas> jcases, int minN, int maxN, boolean markSentenceBoundary, boolean markSentenceLocation, boolean includeCommas, Set<String> keywords) {
    FrequencyDistribution<String> combined = new FrequencyDistribution<String>();
    for (JCas currentView : jcases) {
        FrequencyDistribution<String> perView = getDocumentKeywordNgrams(
                currentView,
                JCasUtil.selectSingle(currentView, TextClassificationTarget.class),
                minN, maxN, markSentenceBoundary, markSentenceLocation, includeCommas, keywords);
        // The two distributions are merged manually, one key at a time,
        // carrying each key's count over into the combined distribution.
        for (String ngram : perView.getKeys()) {
            combined.addSample(ngram, perView.getCount(ngram));
        }
    }
    return combined;
}
use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class KeywordNGramUtils method getDocumentKeywordNgrams.
/**
 * Builds the minN- to maxN-length ngrams over the sequence of keywords found in the sentences
 * covered by an annotation.
 * <p>
 * Token text is lowercased before it is looked up, so only keywords stored in lowercase form
 * can match. For each token, every run of consecutive tokens of the same sentence that ends at
 * that token is joined with single spaces and looked up in the keyword set; each hit is
 * appended to the keyword sequence with its spaces replaced by {@code MIDNGRAMGLUE}. The
 * keyword set may therefore contain multi-token entries like "brussel sprouts". Note that all
 * matching entries are appended: if the set contains both "brussel" and "brussel sprouts",
 * "brussel" is appended at the first token and "brussel sprouts" at the second.
 * <p>
 * If {@code includeCommas} is set, a {@code COMMA} marker is appended for each "," token. If
 * {@code markSentenceBoundary} is set, a boundary marker is appended after each sentence; with
 * {@code markSentenceLocation} the marker is suffixed with "BEG", "MID" or "END" depending on
 * the sentence's relative position (below 0.25, in between, above 0.75).
 *
 * @param jcas
 *            a jcas
 * @param anno
 *            the annotation whose covered sentences are processed
 * @param minN
 *            minimum ngram length
 * @param maxN
 *            maximum ngram length
 * @param markSentenceBoundary
 *            mark the boundary of a sentence
 * @param markSentenceLocation
 *            mark the location of a sentence
 * @param includeCommas
 *            include commas
 * @param keywords
 *            set of keywords
 * @return all ngrams of keywords in jcas
 */
public static FrequencyDistribution<String> getDocumentKeywordNgrams(JCas jcas, Annotation anno, int minN, int maxN, boolean markSentenceBoundary, boolean markSentenceLocation, boolean includeCommas, Set<String> keywords) {
    FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
    List<String> keywordList = new ArrayList<String>();
    // Select the covered sentences once and reuse the list for both the count and the loop.
    List<Sentence> sentences = selectCovered(jcas, Sentence.class, anno);
    int totalSentences = sentences.size();
    int sentenceNumber = 0;
    for (Sentence s : sentences) {
        List<Token> sentence = selectCovered(Token.class, s);
        for (int tokenpointer = 0; tokenpointer < sentence.size(); tokenpointer++) {
            String token = sentence.get(tokenpointer).getCoveredText().toLowerCase();
            String compositeNgram = "";
            boolean foundComposite = false;
            // Grow the candidate leftwards, one token at a time, starting with the current
            // token alone and ending with the whole sentence prefix up to the current token.
            for (int i = tokenpointer; i >= 0; i--) {
                compositeNgram = sentence.get(i).getCoveredText().toLowerCase() + " " + compositeNgram;
                if (compositeNgram.endsWith(" ")) {
                    // Drops the separator left dangling when the candidate was still empty.
                    compositeNgram = compositeNgram.replace(" ", "");
                }
                if (keywords.contains(compositeNgram)) {
                    keywordList.add(compositeNgram.replace(" ", MIDNGRAMGLUE));
                    foundComposite = true;
                }
            }
            if (!foundComposite && keywords.contains(token)) {
                keywordList.add(token);
            } else if (includeCommas && token.equals(",")) {
                keywordList.add(COMMA);
            }
        }
        String sentenceBoundary = SENTENCE_BOUNDARY;
        if (markSentenceLocation) {
            // Relative position of this sentence among the covered sentences (0-based index).
            double relativePosition = (double) sentenceNumber / totalSentences;
            if (relativePosition < 0.25) {
                sentenceBoundary = sentenceBoundary + "BEG";
            } else if (relativePosition > 0.75) {
                sentenceBoundary = sentenceBoundary + "END";
            } else {
                sentenceBoundary = sentenceBoundary + "MID";
            }
        }
        if (markSentenceBoundary) {
            keywordList.add(sentenceBoundary);
        }
        sentenceNumber++;
    }
    // Ngrams are formed over the whole keyword sequence, i.e. across sentence boundaries.
    for (List<String> ngram : new NGramStringListIterable(keywordList.toArray(new String[keywordList.size()]), minN, maxN)) {
        documentNgrams.inc(StringUtils.join(ngram, GLUE));
    }
    return documentNgrams;
}
use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class LuceneNGramPFE method getTopNgrams.
/**
 * Reads all terms of one field from the Lucene index in {@code luceneDir} and returns the
 * terms retained by a size-bounded priority queue, together with their total term frequency.
 * <p>
 * The queue holds at most {@code topNgramThreshold} tuples; which tuples are retained is
 * decided by the ordering of {@code TermFreqTuple} (presumably by frequency - confirm against
 * that class). If the index has no fields, or the field has no terms, the result is empty.
 *
 * @param topNgramThreshold
 *            maximum number of terms to keep
 * @param fieldName
 *            name of the index field whose terms are read
 * @return the retained terms with their total term frequencies
 * @throws ResourceInitializationException
 *             if the index cannot be opened, read or closed
 */
private FrequencyDistribution<String> getTopNgrams(int topNgramThreshold, String fieldName) throws ResourceInitializationException {
    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();
    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();
    // try-with-resources closes the reader and then the directory, also on failure, so the
    // index files are not left open after this call.
    try (FSDirectory directory = FSDirectory.open(luceneDir);
            IndexReader reader = DirectoryReader.open(directory)) {
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(fieldName);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    topN.add(new TermFreqTuple(term, freq));
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    // Drain the queue; poll() returns null once it is empty.
    TermFreqTuple tuple;
    while ((tuple = topN.poll()) != null) {
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }
    return topNGrams;
}
use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class PhoneticNGramMC method getDocumentPhoneticNgrams.
/**
 * Counts the minN- to maxN-length ngrams of phonetically encoded tokens, sentence by sentence,
 * within the given target annotation.
 * <p>
 * The encoder is chosen from the document language: {@code Soundex} for "en" and
 * {@code ColognePhonetic} for "de". Ngrams are built per sentence and their parts are joined
 * with {@code NGRAM_GLUE}.
 *
 * @param jcas
 *            the jcas to read sentences and tokens from
 * @param target
 *            the annotation whose covered sentences are processed
 * @param minN
 *            minimum ngram length
 * @param maxN
 *            maximum ngram length
 * @return the frequency distribution of phonetic ngrams
 * @throws TextClassificationException
 *             if the document language is neither "en" nor "de", or if encoding a token fails
 */
public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, Annotation target, int minN, int maxN) throws TextClassificationException {
    String languageCode = jcas.getDocumentLanguage();
    final StringEncoder encoder;
    switch (languageCode) {
    case "en":
        encoder = new Soundex();
        break;
    case "de":
        encoder = new ColognePhonetic();
        break;
    default:
        throw new TextClassificationException("Language code '" + languageCode + "' not supported by phonetic ngrams FE.");
    }

    FrequencyDistribution<String> result = new FrequencyDistribution<String>();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, target)) {
        // Encode every token of the sentence before forming ngrams over the encoded sequence.
        List<String> encodedTokens = new ArrayList<String>();
        for (Token token : selectCovered(jcas, Token.class, sentence)) {
            String encoded;
            try {
                encoded = encoder.encode(token.getCoveredText());
            } catch (EncoderException e) {
                throw new TextClassificationException(e);
            }
            encodedTokens.add(encoded);
        }
        String[] encodedArray = encodedTokens.toArray(new String[encodedTokens.size()]);
        for (List<String> ngram : new NGramStringListIterable(encodedArray, minN, maxN)) {
            String joined = StringUtils.join(ngram, NGRAM_GLUE);
            result.inc(joined);
        }
    }
    return result;
}
use of de.tudarmstadt.ukp.dkpro.core.api.frequency.util.FrequencyDistribution in project dkpro-tc by dkpro.
the class MaxTokenLenMC method getNgramsFD.
/**
 * Builds a frequency distribution with one sample per token of the jcas.
 * <p>
 * Each sample key is the token's text length, an underscore, and the next int drawn from
 * {@code r}; the sample's count is the token's text length.
 *
 * @param jcas
 *            the jcas whose tokens are read
 * @return the frequency distribution of token-length samples
 * @throws TextClassificationException
 *             declared by the overridden method
 */
@Override
protected FrequencyDistribution<String> getNgramsFD(JCas jcas) throws TextClassificationException {
    FrequencyDistribution<String> lengthSamples = new FrequencyDistribution<>();
    for (Token token : JCasUtil.select(jcas, Token.class)) {
        int tokenLength = token.getCoveredText().length();
        String sampleKey = tokenLength + "_" + r.nextInt();
        lengthSamples.addSample(sampleKey, tokenLength);
    }
    return lengthSamples;
}
Aggregations