Use of zemberek.lm.NgramLanguageModel in project zemberek-nlp by ahmetaa.
From the class TurkishSpellCheckerTest, method suggestWord1.
/**
 * Builds a spell checker over a two-entry dictionary ("Türkiye", "Bayram") and a stem-ending
 * graph with the endings "ında" and "de", then hands the pair "Türkiye'de" / "Türkiye'de" to
 * {@code check} together with the unigram language model.
 *
 * <p>Ignored by default: it loads a language model from actual data.
 */
@Test
@Ignore("Slow. Uses actual data.")
public void suggestWord1() throws Exception {
  TurkishMorphology turkishMorphology =
      TurkishMorphology.builder().addDictionaryLines("Türkiye", "Bayram").build();
  StemEndingGraph stemEndingGraph =
      new StemEndingGraph(turkishMorphology, Lists.newArrayList("ında", "de"));
  TurkishSpellChecker checker =
      new TurkishSpellChecker(turkishMorphology, stemEndingGraph.stemGraph);
  NgramLanguageModel unigramModel = getLm("lm-unigram.slm");
  check(checker, unigramModel, "Türkiye'de", "Türkiye'de");
  // TODO: "Bayramı'nda" fails.
}
Use of zemberek.lm.NgramLanguageModel in project zemberek-nlp by ahmetaa.
From the class TurkishSpellChecker, method suggestForWord.
/**
 * Returns spelling suggestions for {@code word}, ordered by a language-model score that takes
 * the neighbouring words into account.
 *
 * <p>If the model order is below 2, a warning is logged and the call is delegated to
 * {@code suggestForWord(word, lm)}. For an order of exactly 2 the score of a candidate is the sum
 * of {@code getProbability(left, candidate)} and {@code getProbability(candidate, right)}; for
 * higher orders it is {@code getProbability(left, candidate, right)}.
 *
 * @param word the word to produce suggestions for
 * @param leftContext the word preceding {@code word}, or {@code null} to use the vocabulary's
 *     sentence start token
 * @param rightContext the word following {@code word}, or {@code null} to use the vocabulary's
 *     sentence end token
 * @param lm the language model used for scoring; must not be {@code null}
 * @return the suggestions sorted with {@code ScoredItem.STRING_COMP_DESCENDING}; empty when there
 *     are no candidates
 */
public List<String> suggestForWord(String word, String leftContext, String rightContext, NgramLanguageModel lm) {
  if (lm.getOrder() < 2) {
    Log.warn("Language model order is 1. For context ranking it should be at least 2. " + "Unigram ranking will be applied.");
    return suggestForWord(word, lm);
  }

  // Candidates are only needed on the context-ranking path, so they are generated after the
  // fallback guard instead of being computed and thrown away.
  List<String> unRanked = getUnrankedSuggestions(word);
  if (unRanked.isEmpty()) {
    // Nothing to rank; skip context resolution entirely.
    return new ArrayList<>();
  }

  LmVocabulary vocabulary = lm.getVocabulary();

  // The context is the same for every candidate, so it is resolved exactly once. Sentence
  // boundary tokens are used as they are; only caller-supplied words are normalized, and only
  // a single time.
  String left = leftContext == null
      ? vocabulary.getSentenceStart()
      : normalizeForLm(leftContext);
  String right = rightContext == null
      ? vocabulary.getSentenceEnd()
      : normalizeForLm(rightContext);
  int leftIndex = vocabulary.indexOf(left);
  int rightIndex = vocabulary.indexOf(right);
  boolean bigramModel = lm.getOrder() == 2;

  List<ScoredItem<String>> results = new ArrayList<>(unRanked.size());
  for (String candidate : unRanked) {
    int wordIndex = vocabulary.indexOf(normalizeForLm(candidate));
    float score;
    if (bigramModel) {
      score = lm.getProbability(leftIndex, wordIndex) + lm.getProbability(wordIndex, rightIndex);
    } else {
      score = lm.getProbability(leftIndex, wordIndex, rightIndex);
    }
    // Keep the original (un-normalized) candidate as the returned suggestion.
    results.add(new ScoredItem<>(candidate, score));
  }
  results.sort(ScoredItem.STRING_COMP_DESCENDING);
  return results.stream().map(s -> s.item).collect(Collectors.toList());
}
Use of zemberek.lm.NgramLanguageModel in project zemberek-nlp by ahmetaa.
From the class TurkishSpellCheckerTest, method runSentence.
/**
 * Runs context-aware suggestion over every token of every sentence in
 * {@code spell-checker-test-small.txt} and writes a report to {@code bigram-test-result.txt}.
 *
 * <p>For each sentence the report contains the sentence itself, then one line per token showing
 * the left neighbour, the deformed token with the original in brackets, the right neighbour and
 * the suggestions returned for the deformed token, followed by a blank line. Neighbours are
 * {@code null} at sentence boundaries.
 *
 * <p>Ignored by default: it loads a language model from actual data.
 */
@Test
@Ignore("Slow. Uses actual data.")
public void runSentence() throws Exception {
  TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
  TurkishSpellChecker spellChecker = new TurkishSpellChecker(morphology);
  NgramLanguageModel lm = getLm("lm-bigram.slm");
  Path testInput = Paths.get(ClassLoader.getSystemResource("spell-checker-test-small.txt").toURI());
  List<String> sentences = Files.readAllLines(testInput, StandardCharsets.UTF_8);
  // The input is read as UTF-8 and contains Turkish text; write the report as UTF-8 too rather
  // than relying on the platform default charset.
  try (PrintWriter pw = new PrintWriter("bigram-test-result.txt", StandardCharsets.UTF_8.name())) {
    for (String sentence : sentences) {
      pw.println(sentence);
      List<String> input = TurkishSpellChecker.tokenizeForSpelling(sentence);
      int lastIndex = input.size() - 1;
      for (int i = 0; i < input.size(); i++) {
        // A null neighbour marks a sentence boundary for suggestForWord.
        String left = i == 0 ? null : input.get(i - 1);
        String right = i == lastIndex ? null : input.get(i + 1);
        String word = input.get(i);
        String deformed = applyDeformation(word);
        List<String> res = spellChecker.suggestForWord(deformed, left, right, lm);
        pw.printf("%s %s[%s] %s -> %s%n", left, deformed, word, right, res);
      }
      pw.println();
    }
  }
}
Use of zemberek.lm.NgramLanguageModel in project zemberek-nlp by ahmetaa.
From the class TurkishSpellCheckerTest, method suggestWordPerformanceStemEnding.
/**
 * Passes a spell checker built from the default morphology, together with the unigram language
 * model, to {@code run}.
 *
 * <p>Ignored by default: it loads a language model from actual data.
 */
@Test
@Ignore("Slow. Uses actual data.")
public void suggestWordPerformanceStemEnding() throws Exception {
  TurkishSpellChecker checker =
      new TurkishSpellChecker(TurkishMorphology.createWithDefaults());
  NgramLanguageModel unigramModel = getLm("lm-unigram.slm");
  run(checker, unigramModel);
}
Use of zemberek.lm.NgramLanguageModel in project zemberek-nlp by ahmetaa.
From the class TurkishSpellCheckerTest, method suggestWordPerformanceWord.
/**
 * Passes a spell checker backed by a character graph of whole words, together with the unigram
 * language model, to {@code run}.
 *
 * <p>The graph is filled with every line of {@code zemberek-parsed-words-min10.txt}, each added
 * as a {@code Node.TYPE_WORD} entry.
 *
 * <p>Ignored by default: it loads a language model from actual data.
 */
@Test
@Ignore("Slow. Uses actual data.")
public void suggestWordPerformanceWord() throws Exception {
  TurkishMorphology turkishMorphology = TurkishMorphology.createWithDefaults();
  CharacterGraph wordGraph = new CharacterGraph();
  Path wordListPath =
      Paths.get(ClassLoader.getSystemResource("zemberek-parsed-words-min10.txt").toURI());
  List<String> wordList = Files.readAllLines(wordListPath, StandardCharsets.UTF_8);
  for (String entry : wordList) {
    wordGraph.addWord(entry, Node.TYPE_WORD);
  }
  TurkishSpellChecker checker = new TurkishSpellChecker(turkishMorphology, wordGraph);
  NgramLanguageModel unigramModel = getLm("lm-unigram.slm");
  run(checker, unigramModel);
}
Aggregations