Use of zemberek.normalization.CharacterGraphDecoder in the project zemberek-nlp by ahmetaa.
Class DictionaryOperations, method findBadDictionaryItems.
/**
 * Scans the {@code tr/proper-from-corpus.dict} resource for entries that the
 * {@link CharacterGraphDecoder} considers interchangeable and writes a report of them.
 *
 * <p>Each dictionary line is trimmed, cut at its first run of spaces (only the leading token is
 * kept) and lower-cased with {@link Turkish#LOCALE}. All resulting words are added to a decoder
 * constructed with {@code 0f}; every word of at least five characters is then looked up with
 * {@link CharacterGraphDecoder#DIACRITICS_IGNORING_MATCHER}. When the lookup yields more than one
 * suggestion, a line of the form {@code word - suggestion1 suggestion2 ...} is recorded.
 *
 * <p>The distinct report lines are sorted with {@link Turkish#STRING_COMPARATOR_ASC} and written
 * to the file {@code similar-words-0-distance}, resolved against the current working directory.
 *
 * <p>NOTE(review): the {@code 0f} constructor argument is presumably the maximum edit distance
 * (the output file name suggests so), and the {@code "#"} argument is presumably a comment
 * prefix for skipped lines; confirm both against the respective APIs.
 *
 * @throws IOException if the dictionary resource cannot be read or the report cannot be written
 */
private static void findBadDictionaryItems() throws IOException {
  CharacterGraphDecoder graphDecoder = new CharacterGraphDecoder(0f);
  CharMatcher diacriticsIgnoring = CharacterGraphDecoder.DIACRITICS_IGNORING_MATCHER;

  // Keep only the first token of every dictionary line, lower-cased with the Turkish locale.
  List<String> entries = TextIO.loadLinesFromResource("tr/proper-from-corpus.dict", "#")
      .stream()
      .map(line -> line.trim().replaceAll("[ ]+.+?$", "").toLowerCase(Turkish.LOCALE))
      .collect(Collectors.toList());
  graphDecoder.addWords(entries);

  // A set, so that repeated dictionary entries produce a single report line.
  Set<String> reportLines = new LinkedHashSet<>();
  for (String entry : entries) {
    // Words shorter than five characters are not checked.
    if (entry.length() >= 5) {
      // Suggestions are reported in the order the decoder returns them.
      List<String> suggestions = graphDecoder.getSuggestions(entry, diacriticsIgnoring);
      // More than one suggestion means at least two dictionary words matched this entry.
      if (suggestions.size() > 1) {
        reportLines.add(entry + " - " + String.join(" ", suggestions));
      }
    }
  }

  List<String> sortedReport = new ArrayList<>(reportLines);
  sortedReport.sort(Turkish.STRING_COMPARATOR_ASC);
  Files.write(Paths.get("similar-words-0-distance"), sortedReport);
}
Aggregations