Search in sources:

Example 1 with NgramLanguageModel

use of zemberek.lm.NgramLanguageModel in project zemberek-nlp by ahmetaa.

the class TurkishSpellCheckerTest method suggestWord1.

/**
 * Runs the spell checker over a tiny hand-built dictionary ("Türkiye", "Bayram") combined with
 * the endings "ında" and "de", and passes "Türkiye'de" as both arguments to {@code check}
 * together with the unigram language model.
 *
 * <p>Ignored by default because it loads a real language model file.
 */
@Test
@Ignore("Slow. Uses actual data.")
public void suggestWord1() throws Exception {
    TurkishMorphology analyzer = TurkishMorphology.builder()
            .addDictionaryLines("Türkiye", "Bayram")
            .build();
    List<String> suffixes = Lists.newArrayList("ında", "de");
    StemEndingGraph stemEndingGraph = new StemEndingGraph(analyzer, suffixes);
    // The checker works on the stem graph exposed by the stem-ending graph.
    TurkishSpellChecker checker = new TurkishSpellChecker(analyzer, stemEndingGraph.stemGraph);
    NgramLanguageModel unigramModel = getLm("lm-unigram.slm");
    check(checker, unigramModel, "Türkiye'de", "Türkiye'de");
    // TODO: "Bayramı'nda" fails.
}
Also used : NgramLanguageModel(zemberek.lm.NgramLanguageModel) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 2 with NgramLanguageModel

use of zemberek.lm.NgramLanguageModel in project zemberek-nlp by ahmetaa.

the class TurkishSpellChecker method suggestForWord.

/**
 * Returns spelling suggestions for {@code word}, ranked by a language model using the
 * neighbouring words as context.
 *
 * <p>If the model order is below 2, a warning is logged and the result of
 * {@code suggestForWord(word, lm)} is returned instead. For a model of order 2 the score of a
 * candidate is the sum of the (left, candidate) and (candidate, right) probabilities; for any
 * higher order it is the (left, candidate, right) probability. Candidates are sorted with
 * {@code ScoredItem.STRING_COMP_DESCENDING}.
 *
 * @param word the word to produce suggestions for
 * @param leftContext the preceding word, or {@code null} to use the vocabulary's sentence-start
 *     marker
 * @param rightContext the following word, or {@code null} to use the vocabulary's sentence-end
 *     marker
 * @param lm the language model used for scoring
 * @return the suggestions in ranked order
 */
public List<String> suggestForWord(String word, String leftContext, String rightContext, NgramLanguageModel lm) {
    // NOTE(review): the unranked list is unused on the fallback path below; the call is kept
    // ahead of the order check to preserve the original evaluation order.
    List<String> unRanked = getUnrankedSuggestions(word);
    if (lm.getOrder() < 2) {
        Log.warn("Language model order is 1. For context ranking it should be at least 2. " + "Unigram ranking will be applied.");
        return suggestForWord(word, lm);
    }
    LmVocabulary vocabulary = lm.getVocabulary();

    // Resolve both contexts exactly once. Doing this inside the loop (and overwriting the
    // parameters) caused the sentence boundary markers to be passed through normalizeForLm
    // from the second candidate on, and real context words to be normalized repeatedly.
    String left = leftContext == null
            ? vocabulary.getSentenceStart()
            : normalizeForLm(leftContext);
    String right = rightContext == null
            ? vocabulary.getSentenceEnd()
            : normalizeForLm(rightContext);
    int leftIndex = vocabulary.indexOf(left);
    int rightIndex = vocabulary.indexOf(right);
    boolean bigramModel = lm.getOrder() == 2;

    List<ScoredItem<String>> results = new ArrayList<>(unRanked.size());
    for (String str : unRanked) {
        int wordIndex = vocabulary.indexOf(normalizeForLm(str));
        float score;
        if (bigramModel) {
            // A bigram model cannot see both neighbours at once, so combine the two bigrams.
            score = lm.getProbability(leftIndex, wordIndex) + lm.getProbability(wordIndex, rightIndex);
        } else {
            score = lm.getProbability(leftIndex, wordIndex, rightIndex);
        }
        // Keep the original (un-normalized) candidate as the returned item.
        results.add(new ScoredItem<>(str, score));
    }
    results.sort(ScoredItem.STRING_COMP_DESCENDING);
    return results.stream().map(s -> s.item).collect(Collectors.toList());
}
Also used : LmVocabulary(zemberek.lm.LmVocabulary) Resources(com.google.common.io.Resources) Token(org.antlr.v4.runtime.Token) Set(java.util.Set) TurkishLexer(zemberek.tokenization.antlr.TurkishLexer) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) WordAnalysis(zemberek.morphology.analysis.WordAnalysis) ArrayList(java.util.ArrayList) WordAnalysisFormatter(zemberek.morphology.analysis.WordAnalysisFormatter) List(java.util.List) Turkish(zemberek.morphology.structure.Turkish) DummyLanguageModel(zemberek.lm.DummyLanguageModel) TurkishAlphabet(zemberek.core.turkish.TurkishAlphabet) SmoothLm(zemberek.lm.compression.SmoothLm) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) TurkishTokenizer(zemberek.tokenization.TurkishTokenizer) ScoredItem(zemberek.core.ScoredItem) Log(zemberek.core.logging.Log) NgramLanguageModel(zemberek.lm.NgramLanguageModel) LinkedHashSet(java.util.LinkedHashSet) InputStream(java.io.InputStream) LmVocabulary(zemberek.lm.LmVocabulary) ScoredItem(zemberek.core.ScoredItem) ArrayList(java.util.ArrayList)

Example 3 with NgramLanguageModel

use of zemberek.lm.NgramLanguageModel in project zemberek-nlp by ahmetaa.

the class TurkishSpellCheckerTest method runSentence.

/**
 * Reads the sentences of {@code spell-checker-test-small.txt}, deforms every token with
 * {@code applyDeformation}, asks the spell checker for context-ranked suggestions using the
 * bigram language model, and writes a report to {@code bigram-test-result.txt}.
 *
 * <p>For each sentence the report contains the sentence itself, one line per token in the form
 * {@code left deformed[original] right -> suggestions}, and a trailing blank line. The first
 * token has a {@code null} left neighbour and the last token a {@code null} right neighbour.
 *
 * <p>Ignored by default because it loads real model and test data.
 */
@Test
@Ignore("Slow. Uses actual data.")
public void runSentence() throws Exception {
    TurkishMorphology morphology = TurkishMorphology.createWithDefaults();
    TurkishSpellChecker spellChecker = new TurkishSpellChecker(morphology);
    NgramLanguageModel lm = getLm("lm-bigram.slm");
    Path testInput = Paths.get(ClassLoader.getSystemResource("spell-checker-test-small.txt").toURI());
    List<String> sentences = Files.readAllLines(testInput, StandardCharsets.UTF_8);
    // Write with an explicit charset: the input is UTF-8 Turkish text and the platform default
    // encoding would garble non-ASCII letters on systems where it is not UTF-8.
    try (PrintWriter pw = new PrintWriter("bigram-test-result.txt", StandardCharsets.UTF_8.name())) {
        for (String sentence : sentences) {
            pw.println(sentence);
            List<String> input = TurkishSpellChecker.tokenizeForSpelling(sentence);
            int lastIndex = input.size() - 1;
            for (int i = 0; i <= lastIndex; i++) {
                // null marks a sentence boundary for the context-aware suggester.
                String left = i == 0 ? null : input.get(i - 1);
                String right = i == lastIndex ? null : input.get(i + 1);
                String word = input.get(i);
                String deformed = applyDeformation(word);
                List<String> res = spellChecker.suggestForWord(deformed, left, right, lm);
                pw.println(String.format("%s %s[%s] %s -> %s", left, deformed, word, right, res));
            }
            pw.println();
        }
    }
}
Also used : Path(java.nio.file.Path) NgramLanguageModel(zemberek.lm.NgramLanguageModel) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) PrintWriter(java.io.PrintWriter) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 4 with NgramLanguageModel

use of zemberek.lm.NgramLanguageModel in project zemberek-nlp by ahmetaa.

the class TurkishSpellCheckerTest method suggestWordPerformanceStemEnding.

/**
 * Invokes {@code run} with a spell checker built from the default morphology alone and the
 * unigram language model.
 *
 * <p>Ignored by default because it loads real model data.
 */
@Test
@Ignore("Slow. Uses actual data.")
public void suggestWordPerformanceStemEnding() throws Exception {
    TurkishMorphology defaultMorphology = TurkishMorphology.createWithDefaults();
    TurkishSpellChecker checker = new TurkishSpellChecker(defaultMorphology);
    NgramLanguageModel unigramModel = getLm("lm-unigram.slm");
    run(checker, unigramModel);
}
Also used : NgramLanguageModel(zemberek.lm.NgramLanguageModel) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Example 5 with NgramLanguageModel

use of zemberek.lm.NgramLanguageModel in project zemberek-nlp by ahmetaa.

the class TurkishSpellCheckerTest method suggestWordPerformanceWord.

/**
 * Invokes {@code run} with a spell checker backed by a character graph that is filled with
 * every line of {@code zemberek-parsed-words-min10.txt}, together with the unigram language
 * model.
 *
 * <p>Ignored by default because it loads real word-list and model data.
 */
@Test
@Ignore("Slow. Uses actual data.")
public void suggestWordPerformanceWord() throws Exception {
    TurkishMorphology defaultMorphology = TurkishMorphology.createWithDefaults();
    CharacterGraph wordGraph = new CharacterGraph();
    Path wordListPath = Paths.get(ClassLoader.getSystemResource("zemberek-parsed-words-min10.txt").toURI());
    // Each line of the resource is added to the graph as a whole-word node.
    for (String entry : Files.readAllLines(wordListPath, StandardCharsets.UTF_8)) {
        wordGraph.addWord(entry, Node.TYPE_WORD);
    }
    TurkishSpellChecker checker = new TurkishSpellChecker(defaultMorphology, wordGraph);
    NgramLanguageModel unigramModel = getLm("lm-unigram.slm");
    run(checker, unigramModel);
}
Also used : Path(java.nio.file.Path) NgramLanguageModel(zemberek.lm.NgramLanguageModel) TurkishMorphology(zemberek.morphology.analysis.tr.TurkishMorphology) Ignore(org.junit.Ignore) Test(org.junit.Test)

Aggregations

NgramLanguageModel (zemberek.lm.NgramLanguageModel)5 TurkishMorphology (zemberek.morphology.analysis.tr.TurkishMorphology)5 Ignore (org.junit.Ignore)4 Test (org.junit.Test)4 Path (java.nio.file.Path)2 Resources (com.google.common.io.Resources)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 PrintWriter (java.io.PrintWriter)1 ArrayList (java.util.ArrayList)1 LinkedHashSet (java.util.LinkedHashSet)1 List (java.util.List)1 Set (java.util.Set)1 Collectors (java.util.stream.Collectors)1 Token (org.antlr.v4.runtime.Token)1 ScoredItem (zemberek.core.ScoredItem)1 Log (zemberek.core.logging.Log)1 TurkishAlphabet (zemberek.core.turkish.TurkishAlphabet)1 DummyLanguageModel (zemberek.lm.DummyLanguageModel)1 LmVocabulary (zemberek.lm.LmVocabulary)1