Use of zemberek.lm.LmVocabulary in the project zemberek-nlp by ahmetaa.
From the class SmoothLmTest, method testStupifBackoff.
/**
 * Checks the trigram probability of (Ahmet, armut, kırmızı) when the model is built with
 * stupid backoff enabled.
 *
 * <p>The language model file obtained from {@code getTinyLmFile()} is deleted in a
 * {@code finally} block so it is removed even when the build or an assertion fails.
 *
 * @throws IOException if the model file cannot be obtained, loaded or deleted
 */
@Test
public void testStupifBackoff() throws IOException {
  File lmFile = getTinyLmFile();
  try {
    SmoothLm lm = SmoothLm.builder(lmFile).useStupidBackoff().build();
    LmVocabulary vocabulary = lm.getVocabulary();
    int ahmet = vocabulary.indexOf("Ahmet");
    int armut = vocabulary.indexOf("armut");
    int kirmizi = vocabulary.indexOf("kırmızı");
    // With regular back-off the value would be b(Ahmet, armut) + b(armut) + p(kırmızı) when
    // neither the trigram nor p(kırmızı|armut) exists. Here both back-off weights are replaced
    // by the model's stupid-backoff log alpha, so the expectation is alpha + alpha + p(kırmızı).
    // NOTE(review): -1.539912 is presumably the unigram value stored in the tiny model file.
    double probKirmizi = -1.539912;
    double expected = lm.getStupidBackoffLogAlpha() + lm.getStupidBackoffLogAlpha() + probKirmizi;
    System.out.println("expected = " + expected);
    System.out.println(lm.explain(ahmet, armut, kirmizi));
    Assert.assertEquals(expected, lm.getProbability(ahmet, armut, kirmizi), 0.0001);
  } finally {
    // Always clean up the model file, including on assertion failure.
    java.nio.file.Files.delete(lmFile.toPath());
  }
}
Use of zemberek.lm.LmVocabulary in the project zemberek-nlp by ahmetaa.
From the class SmoothLmTest, method testBackoffcount.
/**
 * Verifies the back-off counts reported by the tiny model for several word sequences:
 * 0 for {@code <s>}, {@code <s> kedi} and {@code Ahmet dondurma yedi}; 1 for
 * {@code Ahmet yemez}; 2 for {@code Ahmet yemez kırmızı}.
 *
 * @throws IOException if the tiny model cannot be loaded
 */
@Test
public void testBackoffcount() throws IOException {
  final SmoothLm model = getTinyLm();
  final LmVocabulary vocab = model.getVocabulary();

  // Sequences for which no back-off is expected.
  Assert.assertEquals(0, model.getBackoffCount(new int[] {vocab.indexOf("<s>")}));
  Assert.assertEquals(0, model.getBackoffCount(vocab.toIndexes("<s>", "kedi")));
  Assert.assertEquals(0, model.getBackoffCount(vocab.toIndexes("Ahmet", "dondurma", "yedi")));

  // Sequences for which one and two back-off steps are expected, respectively.
  Assert.assertEquals(1, model.getBackoffCount(vocab.toIndexes("Ahmet", "yemez")));
  Assert.assertEquals(2, model.getBackoffCount(vocab.toIndexes("Ahmet", "yemez", "kırmızı")));
}
Use of zemberek.lm.LmVocabulary in the project zemberek-nlp by ahmetaa.
From the class SmoothLmTest, method testTrigramBackoff.
/**
 * Verifies that the probability of the trigram (Ahmet, armut, kırmızı) equals the sum of the
 * two back-off weights and the unigram value listed below, through both
 * {@code getProbability} and {@code getTriGramProbability}.
 *
 * @throws IOException if the tiny model cannot be loaded
 */
@Test
public void testTrigramBackoff() throws IOException {
  final SmoothLm model = getTinyLm();
  final LmVocabulary vocab = model.getVocabulary();

  final int ahmetId = vocab.indexOf("Ahmet");
  final int armutId = vocab.indexOf("armut");
  final int kirmiziId = vocab.indexOf("kırmızı");

  // When the trigram is missing: p(kırmızı | Ahmet, armut) = b(Ahmet, armut) + p(kırmızı | armut).
  // When p(kırmızı | armut) is missing as well, the value becomes
  // b(Ahmet, armut) + b(armut) + p(kırmızı), which is what is expected here.
  final double bAhmetArmut = -0.124939;
  final double bArmut = -0.492916;
  final double pKirmizi = -1.539912;
  final double expected = bAhmetArmut + bArmut + pKirmizi;

  System.out.println("expected = " + expected);
  System.out.println(model.explain(ahmetId, armutId, kirmiziId));

  final double tolerance = 0.0001;
  double viaGeneric = model.getProbability(ahmetId, armutId, kirmiziId);
  Assert.assertEquals(expected, viaGeneric, tolerance);
  double viaTrigram = model.getTriGramProbability(ahmetId, armutId, kirmiziId);
  Assert.assertEquals(expected, viaTrigram, tolerance);
}
Use of zemberek.lm.LmVocabulary in the project zemberek-nlp by ahmetaa.
From the class SmoothLmTest, method testVocabulary.
/**
 * Verifies that the tiny model's vocabulary contains "Ahmet" and "elma", assigns them
 * different indexes, and maps each index back to its word.
 *
 * @throws IOException if the tiny model cannot be loaded
 */
@Test
public void testVocabulary() throws IOException {
  final LmVocabulary vocabulary = getTinyLm().getVocabulary();

  final String first = "Ahmet";
  Assert.assertTrue(vocabulary.contains(first));
  final int firstIndex = vocabulary.indexOf(first);

  final String second = "elma";
  Assert.assertTrue(vocabulary.contains(second));
  final int secondIndex = vocabulary.indexOf(second);

  // Distinct words must not share an index.
  Assert.assertFalse(firstIndex == secondIndex);

  // Index -> word lookup must round-trip.
  Assert.assertEquals(first, vocabulary.getWord(firstIndex));
  Assert.assertEquals(second, vocabulary.getWord(secondIndex));
}
Use of zemberek.lm.LmVocabulary in the project zemberek-nlp by ahmetaa.
From the class SmoothLmTest, method testLogBaseChange.
/**
 * Builds the tiny model with log base {@code Math.E} and checks that the reported log base
 * and the uni-, bi- and trigram values match the reference numbers passed through the
 * {@code l(...)} helper.
 *
 * <p>The language model file obtained from {@code getTinyLmFile()} is deleted in a
 * {@code finally} block so it is removed even when the build or an assertion fails.
 *
 * @throws IOException if the model file cannot be obtained, loaded or deleted
 */
@Test
public void testLogBaseChange() throws IOException {
  // Default-base model, loaded only to print its info for comparison.
  SmoothLm lm10 = getTinyLm();
  System.out.println(lm10.info());
  File lmFile = getTinyLmFile();
  try {
    SmoothLm lm = SmoothLm.builder(lmFile).logBase(Math.E).build();
    System.out.println(lm.info());
    // Expected value first, actual second, so a failure message reads correctly.
    Assert.assertEquals(Math.E, lm.getLogBase(), 0.00001);
    LmVocabulary vocabulary = lm.getVocabulary();
    // NOTE(review): l(...) is defined elsewhere; presumably it converts the reference
    // values to the natural-log base - confirm against the helper.
    // <s>
    int[] is = { vocabulary.indexOf("<s>") };
    Assert.assertEquals(l(-1.716003), lm.getProbabilityValue(is), 0.0001);
    Assert.assertEquals(l(-1.716003), lm.getProbability(is), 0.0001);
    // <s> kedi
    int[] is2 = { vocabulary.indexOf("<s>"), vocabulary.indexOf("kedi") };
    Assert.assertEquals(l(-0.796249), lm.getProbabilityValue(is2), 0.0001);
    Assert.assertEquals(l(-0.796249), lm.getProbability(is2), 0.0001);
    // Ahmet dondurma yedi
    int[] is3 = { vocabulary.indexOf("Ahmet"), vocabulary.indexOf("dondurma"), vocabulary.indexOf("yedi") };
    Assert.assertEquals(l(-0.602060), lm.getProbabilityValue(is3), 0.0001);
    Assert.assertEquals(l(-0.602060), lm.getProbability(is3), 0.0001);
  } finally {
    // Always clean up the model file, including on assertion failure.
    java.nio.file.Files.delete(lmFile.toPath());
  }
}
Aggregations