use of de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable in project dkpro-tc by dkpro.
the class KeywordNGramUtils method getDocumentKeywordNgrams.
/**
 * Collects the keywords that occur in the sentences covered by {@code anno} and returns the
 * frequency distribution of all minN- to maxN-length ngrams built over that keyword sequence.
 * <p>
 * Token text is lowercased before it is looked up, so the entries of {@code keywords} must be
 * given in lower case to match. The keyword list can contain multi-token entries like
 * "brussel sprouts"; their tokens are expected to be separated by a single space, and in the
 * resulting keyword the spaces are replaced by {@code MIDNGRAMGLUE}.
 * <p>
 * For every token, each token sequence of the same sentence that ends at this token and is
 * contained in {@code keywords} is added, shortest match first. NOTE(review): if the keyword
 * list contains both "brussel" and "brussel sprouts", both are added (one at each token);
 * earlier documentation claimed only the longer one would be kept - confirm which is intended.
 *
 * @param jcas
 *            a jcas
 * @param anno
 *            the annotation whose covered sentences are inspected
 * @param minN
 *            minimum ngram length (counted in keywords, not in tokens)
 * @param maxN
 *            maximum ngram length (counted in keywords, not in tokens)
 * @param markSentenceBoundary
 *            if true, a boundary marker is appended to the keyword sequence after every
 *            sentence
 * @param markSentenceLocation
 *            if true, the boundary marker is suffixed with "BEG", "MID" or "END" depending on
 *            the relative position of the sentence; only visible if
 *            {@code markSentenceBoundary} is set as well
 * @param includeCommas
 *            if true, every "," token adds a comma marker to the keyword sequence
 * @param keywords
 *            set of (lowercased) keywords
 * @return all ngrams of keywords in jcas
 */
public static FrequencyDistribution<String> getDocumentKeywordNgrams(JCas jcas, Annotation anno, int minN, int maxN, boolean markSentenceBoundary, boolean markSentenceLocation, boolean includeCommas, Set<String> keywords) {
    FrequencyDistribution<String> documentNgrams = new FrequencyDistribution<String>();
    List<String> keywordList = new ArrayList<String>();

    // Select the covered sentences once instead of once for the count and once for the loop.
    List<Sentence> sentences = selectCovered(jcas, Sentence.class, anno);
    int totalSentences = sentences.size();
    int sentenceNumber = 0;

    for (Sentence s : sentences) {
        List<Token> sentence = selectCovered(Token.class, s);
        for (int tokenpointer = 0; tokenpointer < sentence.size(); tokenpointer++) {
            String token = sentence.get(tokenpointer).getCoveredText().toLowerCase();

            // Grow the candidate backwards from the current token to the sentence start. The
            // first candidate is the token itself, so single-token keywords are covered here
            // and no separate lookup of the bare token is needed afterwards.
            String compositeNgram = "";
            for (int i = tokenpointer; i >= 0; i--) {
                String word = sentence.get(i).getCoveredText().toLowerCase();
                // Only insert the separator between two parts; this avoids the trailing space
                // without stripping spaces that belong to the token text itself.
                compositeNgram = compositeNgram.isEmpty() ? word : word + " " + compositeNgram;
                if (keywords.contains(compositeNgram)) {
                    keywordList.add(compositeNgram.replace(" ", MIDNGRAMGLUE));
                }
            }

            if (includeCommas && token.equals(",")) {
                keywordList.add(COMMA);
            }
        }

        if (markSentenceBoundary) {
            String sentenceBoundary = SENTENCE_BOUNDARY;
            if (markSentenceLocation) {
                // First quarter of the sentences -> BEG, last quarter -> END, rest -> MID.
                double relativePosition = (double) sentenceNumber / totalSentences;
                if (relativePosition < 0.25) {
                    sentenceBoundary = sentenceBoundary + "BEG";
                } else if (relativePosition > 0.75) {
                    sentenceBoundary = sentenceBoundary + "END";
                } else {
                    sentenceBoundary = sentenceBoundary + "MID";
                }
            }
            keywordList.add(sentenceBoundary);
        }
        sentenceNumber++;
    }

    for (List<String> ngram : new NGramStringListIterable(keywordList.toArray(new String[keywordList.size()]), minN, maxN)) {
        documentNgrams.inc(StringUtils.join(ngram, GLUE));
    }
    return documentNgrams;
}
use of de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable in project dkpro-tc by dkpro.
the class PhoneticNGramMC method getDocumentPhoneticNgrams.
/**
 * Counts the phonetic ngrams found in the sentences covered by {@code target}. Every token is
 * replaced by its phonetic encoding and ngrams are formed per sentence, i.e. they never span a
 * sentence boundary. The encoder is chosen from the document language of the jcas: Soundex
 * for "en" and Cologne phonetic for "de".
 *
 * @param jcas
 *            the jcas providing document language, sentences and tokens
 * @param target
 *            the annotation whose covered sentences are processed
 * @param minN
 *            minimum ngram length
 * @param maxN
 *            maximum ngram length
 * @return frequency distribution of the phonetic ngrams, joined with {@code NGRAM_GLUE}
 * @throws TextClassificationException
 *             if the document language is neither "en" nor "de", or if encoding a token fails
 */
public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, Annotation target, int minN, int maxN) throws TextClassificationException {
    String languageCode = jcas.getDocumentLanguage();
    boolean english = languageCode.equals("en");
    if (!english && !languageCode.equals("de")) {
        throw new TextClassificationException("Language code '" + languageCode + "' not supported by phonetic ngrams FE.");
    }

    StringEncoder encoder;
    if (english) {
        encoder = new Soundex();
    } else {
        encoder = new ColognePhonetic();
    }

    FrequencyDistribution<String> result = new FrequencyDistribution<String>();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, target)) {
        List<Token> tokens = selectCovered(jcas, Token.class, sentence);
        String[] encoded = new String[tokens.size()];
        try {
            for (int i = 0; i < encoded.length; i++) {
                encoded[i] = encoder.encode(tokens.get(i).getCoveredText());
            }
        } catch (EncoderException e) {
            // Surface encoder problems as the feature extractor's own exception type.
            throw new TextClassificationException(e);
        }

        for (List<String> ngram : new NGramStringListIterable(encoded, minN, maxN)) {
            String joined = StringUtils.join(ngram, NGRAM_GLUE);
            result.inc(joined);
        }
    }
    return result;
}
use of de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable in project dkpro-tc by dkpro.
the class NGramUtils method getDocumentNgrams.
/**
 * Builds a frequency distribution of ngrams over the values of an arbitrary annotation type
 * (intended for Lemma, Stem, etc.). Ngrams are formed sentence by sentence within the span of
 * {@code aTarget}; an ngram is only counted if it passes the stopword filter.
 *
 * @param jcas
 *            a jcas
 * @param aTarget
 *            target annotation span
 * @param lowerCaseNGrams
 *            whether each ngram is lowercased before filtering and counting
 * @param filterPartialMatches
 *            passed on to the ngram filter together with the stopwords
 * @param minN
 *            minimal n
 * @param maxN
 *            maximal n
 * @param stopwords
 *            set of stopwords
 * @param annotationClass
 *            annotation type whose values form the ngrams
 * @return a frequency distribution of the accepted ngrams, joined with {@code NGRAM_GLUE}
 *
 * @throws TextClassificationException
 *             when an exception occurs
 */
public static FrequencyDistribution<String> getDocumentNgrams(JCas jcas, Annotation aTarget, boolean lowerCaseNGrams, boolean filterPartialMatches, int minN, int maxN, Set<String> stopwords, Class<? extends Annotation> annotationClass) throws TextClassificationException {
    FrequencyDistribution<String> counts = new FrequencyDistribution<String>();
    String typeName = annotationClass.getName();

    for (Sentence sentence : selectCovered(jcas, Sentence.class, aTarget)) {
        List<String> values = valuesToText(jcas, sentence, typeName);
        for (List<String> rawNgram : new NGramStringListIterable(values, minN, maxN)) {
            List<String> candidate = lowerCaseNGrams ? lower(rawNgram) : rawNgram;
            if (!passesNgramFilter(candidate, stopwords, filterPartialMatches)) {
                continue;
            }
            counts.inc(StringUtils.join(candidate, NGRAM_GLUE));
        }
    }
    return counts;
}
use of de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable in project dkpro-tc by dkpro.
the class PosNGramMC method sentenceBasedDistribution.
/**
 * Counts POS ngrams sentence by sentence, so that no ngram spans a sentence boundary.
 *
 * @param jcas
 *            the jcas holding the sentence and POS annotations
 * @param focus
 *            the annotation whose covered sentences are processed
 * @param useCanonical
 *            if true, the simple class name of the POS annotation is used as tag, otherwise
 *            its POS value
 * @param minN
 *            minimum ngram length
 * @param maxN
 *            maximum ngram length
 * @return frequency distribution of the POS ngrams, joined with {@code NGRAM_GLUE}
 */
private static FrequencyDistribution<String> sentenceBasedDistribution(JCas jcas, Annotation focus, boolean useCanonical, int minN, int maxN) {
    FrequencyDistribution<String> distribution = new FrequencyDistribution<String>();
    for (Sentence sentence : selectCovered(jcas, Sentence.class, focus)) {
        List<String> tags = new ArrayList<String>();
        for (POS pos : selectCovered(jcas, POS.class, sentence)) {
            tags.add(useCanonical ? pos.getClass().getSimpleName() : pos.getPosValue());
        }

        String[] tagArray = tags.toArray(new String[0]);
        for (List<String> ngram : new NGramStringListIterable(tagArray, minN, maxN)) {
            String joined = StringUtils.join(ngram, NGRAM_GLUE);
            distribution.inc(joined);
        }
    }
    return distribution;
}
use of de.tudarmstadt.ukp.dkpro.core.ngrams.util.NGramStringListIterable in project dkpro-tc by dkpro.
the class PosNGramMC method documentBasedDistribution.
/**
 * Counts POS ngrams over the whole span of {@code focus} at once; sentence boundaries are not
 * considered, so ngrams may span them.
 *
 * @param jcas
 *            the jcas holding the POS annotations
 * @param focus
 *            the annotation whose covered POS annotations are processed
 * @param useCanonical
 *            if true, the simple class name of the POS annotation is used as tag, otherwise
 *            its POS value
 * @param minN
 *            minimum ngram length
 * @param maxN
 *            maximum ngram length
 * @return frequency distribution of the POS ngrams, joined with {@code NGRAM_GLUE}
 */
private static FrequencyDistribution<String> documentBasedDistribution(JCas jcas, Annotation focus, boolean useCanonical, int minN, int maxN) {
    List<POS> covered = selectCovered(jcas, POS.class, focus);
    String[] tags = new String[covered.size()];
    for (int i = 0; i < tags.length; i++) {
        POS pos = covered.get(i);
        if (useCanonical) {
            tags[i] = pos.getClass().getSimpleName();
        } else {
            tags[i] = pos.getPosValue();
        }
    }

    FrequencyDistribution<String> distribution = new FrequencyDistribution<String>();
    for (List<String> ngram : new NGramStringListIterable(tags, minN, maxN)) {
        distribution.inc(StringUtils.join(ngram, NGRAM_GLUE));
    }
    return distribution;
}
Aggregations