Use of org.apache.commons.codec.language.ColognePhonetic in project dkpro-tc by dkpro.
The class PhoneticNGramMC, method getDocumentPhoneticNgrams.
/**
 * Counts n-grams of phonetically encoded tokens found inside {@code target}.
 *
 * <p>The encoder is picked from the document language of {@code jcas}: {@code "en"} uses
 * {@link Soundex}, {@code "de"} uses {@link ColognePhonetic}. Every {@link Sentence} covered by
 * {@code target} is processed separately: the covered text of each of its {@link Token}s is
 * encoded, n-grams are generated over the resulting sequence, and each n-gram (its parts joined
 * with {@code NGRAM_GLUE}) is counted. N-grams therefore never span a sentence boundary.
 *
 * @param jcas   the CAS providing the document language and the sentence/token annotations
 * @param target the annotation whose covered sentences are processed
 * @param minN   lower n-gram size bound, handed unchanged to {@code NGramStringListIterable}
 * @param maxN   upper n-gram size bound, handed unchanged to {@code NGramStringListIterable}
 * @return the frequency distribution of the glued phonetic n-grams
 * @throws TextClassificationException if the document language is neither {@code "en"} nor
 *         {@code "de"} (including a missing language code), or if a token cannot be encoded
 */
public static FrequencyDistribution<String> getDocumentPhoneticNgrams(JCas jcas, Annotation target, int minN, int maxN) throws TextClassificationException {
    String languageCode = jcas.getDocumentLanguage();
    StringEncoder encoder;
    // Constant-first comparisons: a null language code ends up in the "not supported" branch
    // below instead of failing with a NullPointerException.
    if ("en".equals(languageCode)) {
        encoder = new Soundex();
    } else if ("de".equals(languageCode)) {
        encoder = new ColognePhonetic();
    } else {
        throw new TextClassificationException("Language code '" + languageCode + "' not supported by phonetic ngrams FE.");
    }

    FrequencyDistribution<String> phoneticNgrams = new FrequencyDistribution<String>();
    for (Sentence s : selectCovered(jcas, Sentence.class, target)) {
        List<String> phoneticStrings = new ArrayList<String>();
        for (Token t : selectCovered(jcas, Token.class, s)) {
            try {
                phoneticStrings.add(encoder.encode(t.getCoveredText()));
            } catch (EncoderException e) {
                throw new TextClassificationException(e);
            } catch (IllegalArgumentException e) {
                // Soundex is documented to reject letters it has no mapping for (e.g. accented
                // characters) with this unchecked exception; report it through the declared
                // exception type like any other encoding failure.
                throw new TextClassificationException(e);
            }
        }
        String[] array = phoneticStrings.toArray(new String[0]);
        for (List<String> ngram : new NGramStringListIterable(array, minN, maxN)) {
            phoneticNgrams.inc(StringUtils.join(ngram, NGRAM_GLUE));
        }
    }
    return phoneticNgrams;
}
Aggregations