Use of zemberek.core.turkish.TurkishAlphabet in the zemberek-nlp project by ahmetaa.
From the class DictionaryOperations, method findAbbreviations.
/**
 * Scans the {@code tr/non-tdk.dict} resource lexicon for proper-noun entries whose
 * lemma looks like an abbreviation and writes the candidates to a file.
 *
 * <p>An entry is considered only if it does not carry {@link RootAttribute#Dummy} and
 * its secondary POS is {@link SecondaryPos#ProperNoun}. Its lemma is kept when it
 * contains no vowel at all, or when it is longer than three characters and its first
 * three characters contain no vowel. Each kept lemma is emitted with the suffix
 * {@code " [P:Abbrv]"}; duplicates are collapsed, the lines are ordered with
 * {@code Turkish.STRING_COMPARATOR_ASC}, and the result is written to the relative
 * path {@code zemberek.possible.abbrv2}.
 *
 * @throws IOException if an I/O error occurs, e.g. while writing the output file
 */
public static void findAbbreviations() throws IOException {
  RootLexicon lexicon = TurkishDictionaryLoader.loadFromResources("tr/non-tdk.dict");
  Set<String> candidates = new HashSet<>();

  for (DictionaryItem entry : lexicon) {
    // Only real (non-dummy) proper nouns are of interest.
    if (entry.attributes.contains(RootAttribute.Dummy)
        || entry.secondaryPos != SecondaryPos.ProperNoun) {
      continue;
    }

    String word = entry.lemma;
    boolean vowelless = !TurkishAlphabet.INSTANCE.containsVowel(word);
    if (!vowelless) {
      // Fall back to the prefix heuristic: longer words whose first three
      // letters are all consonants are also treated as abbreviation candidates.
      vowelless = word.length() > 3
          && !TurkishAlphabet.INSTANCE.containsVowel(word.substring(0, 3));
    }

    if (vowelless) {
      candidates.add(word + " [P:Abbrv]");
    }
  }

  List<String> sorted = new ArrayList<>(candidates);
  sorted.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("zemberek.possible.abbrv2"), sorted);
}
Aggregations