Search in sources :

Example 6 with LmVocabulary

use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.

the class SmoothLmTest method testStupifBackoff.

/**
 * Checks the score of the trigram (Ahmet, armut, kırmızı) on a model built
 * with {@code useStupidBackoff()}.
 *
 * <p>The expectation assumes that neither the trigram nor the bigram
 * (armut, kırmızı) is present in the tiny model, so the score is the unigram
 * value of "kırmızı" penalised twice by the stupid back-off log alpha.
 *
 * <p>The file returned by {@code getTinyLmFile()} is deleted by this test; the
 * deletion happens in a {@code finally} block so it is not skipped when the
 * build or an assertion fails.
 *
 * @throws IOException if the model cannot be loaded or the file cannot be deleted
 */
@Test
public void testStupifBackoff() throws IOException {
    File lmFile = getTinyLmFile();
    try {
        SmoothLm lm = SmoothLm.builder(lmFile).useStupidBackoff().build();
        LmVocabulary vocabulary = lm.getVocabulary();
        int ahmet = vocabulary.indexOf("Ahmet");
        int armut = vocabulary.indexOf("armut");
        int kirmizi = vocabulary.indexOf("kırmızı");
        // With stupid back-off the per-context back-off weights are replaced by a single
        // fixed log alpha. Backing off trigram -> bigram -> unigram therefore yields
        // alpha + alpha + p(kırmızı).
        double probKirmizi = -1.539912;
        double expected = lm.getStupidBackoffLogAlpha() + lm.getStupidBackoffLogAlpha() + probKirmizi;
        System.out.println("expected = " + expected);
        System.out.println(lm.explain(ahmet, armut, kirmizi));
        Assert.assertEquals(expected, lm.getProbability(ahmet, armut, kirmizi), 0.0001);
    } finally {
        // Clean up even when the build or the assertion above throws.
        java.nio.file.Files.delete(lmFile.toPath());
    }
}
Also used : LmVocabulary(zemberek.lm.LmVocabulary) File(java.io.File) Test(org.junit.Test)

Example 7 with LmVocabulary

use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.

the class SmoothLmTest method testBackoffcount.

/**
 * Checks {@code getBackoffCount} on the tiny model for five word sequences:
 * three that are expected to need no back-off, one expected to need a single
 * back-off and one expected to need two.
 *
 * @throws IOException if the tiny model cannot be loaded
 */
@Test
public void testBackoffcount() throws IOException {
    SmoothLm model = getTinyLm();
    LmVocabulary vocab = model.getVocabulary();

    // Sequences expected to be scored without backing off.
    Assert.assertEquals(0, model.getBackoffCount(new int[] { vocab.indexOf("<s>") }));
    Assert.assertEquals(0, model.getBackoffCount(vocab.toIndexes("<s>", "kedi")));
    Assert.assertEquals(0, model.getBackoffCount(vocab.toIndexes("Ahmet", "dondurma", "yedi")));

    // Sequences expected to require one and two back-off steps respectively.
    Assert.assertEquals(1, model.getBackoffCount(vocab.toIndexes("Ahmet", "yemez")));
    Assert.assertEquals(2, model.getBackoffCount(vocab.toIndexes("Ahmet", "yemez", "kırmızı")));
}
Also used : LmVocabulary(zemberek.lm.LmVocabulary) Test(org.junit.Test)

Example 8 with LmVocabulary

use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.

the class SmoothLmTest method testTrigramBackoff.

/**
 * Checks the back-off score of the trigram (Ahmet, armut, kırmızı) on the tiny model.
 *
 * <p>The expected value is the sum of the back-off weight of (Ahmet, armut),
 * the back-off weight of (armut) and the unigram value of "kırmızı"; both
 * {@code getProbability} and {@code getTriGramProbability} must return it
 * within a tolerance of 0.0001.
 *
 * @throws IOException if the tiny model cannot be loaded
 */
@Test
public void testTrigramBackoff() throws IOException {
    SmoothLm model = getTinyLm();
    LmVocabulary vocab = model.getVocabulary();

    int w0 = vocab.indexOf("Ahmet");
    int w1 = vocab.indexOf("armut");
    int w2 = vocab.indexOf("kırmızı");

    // When the trigram is missing:  p(w2 | w0, w1) = b(w0, w1) + p(w2 | w1).
    // When the bigram is missing too: b(w0, w1) + b(w1) + p(w2).
    final double bigramBackoff = -0.124939;
    final double unigramBackoff = -0.492916;
    final double unigramProb = -1.539912;
    final double expected = bigramBackoff + unigramBackoff + unigramProb;
    final double tolerance = 0.0001;

    System.out.println("expected = " + expected);
    System.out.println(model.explain(w0, w1, w2));

    Assert.assertEquals(expected, model.getProbability(w0, w1, w2), tolerance);
    Assert.assertEquals(expected, model.getTriGramProbability(w0, w1, w2), tolerance);
}
Also used : LmVocabulary(zemberek.lm.LmVocabulary) Test(org.junit.Test)

Example 9 with LmVocabulary

use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.

the class SmoothLmTest method testVocabulary.

/**
 * Checks that the tiny model's vocabulary contains "Ahmet" and "elma", assigns
 * them different indexes, and maps each index back to its word.
 *
 * @throws IOException if the tiny model cannot be loaded
 */
@Test
public void testVocabulary() throws IOException {
    final String firstWord = "Ahmet";
    final String secondWord = "elma";

    LmVocabulary vocabulary = getTinyLm().getVocabulary();

    Assert.assertTrue(vocabulary.contains(firstWord));
    int firstIndex = vocabulary.indexOf(firstWord);
    Assert.assertTrue(vocabulary.contains(secondWord));
    int secondIndex = vocabulary.indexOf(secondWord);

    // Distinct words must not share an index.
    Assert.assertFalse(firstIndex == secondIndex);

    // Index -> word lookup must round-trip.
    Assert.assertEquals(firstWord, vocabulary.getWord(firstIndex));
    Assert.assertEquals(secondWord, vocabulary.getWord(secondIndex));
}
Also used : LmVocabulary(zemberek.lm.LmVocabulary) Test(org.junit.Test)

Example 10 with LmVocabulary

use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.

the class SmoothLmTest method testLogBaseChange.

/**
 * Checks that a model built with {@code logBase(Math.E)} reports that base and
 * returns the expected values for a unigram, a bigram and a trigram.
 *
 * <p>The expected values are passed through the helper {@code l(...)}, which is
 * defined outside this method (presumably it converts the log10 figures to
 * natural log; confirm against its definition).
 *
 * <p>The file returned by {@code getTinyLmFile()} is deleted by this test; the
 * deletion happens in a {@code finally} block so it is not skipped when the
 * build or an assertion fails.
 *
 * @throws IOException if a model cannot be loaded or the file cannot be deleted
 */
@Test
public void testLogBaseChange() throws IOException {
    // Default model, loaded only so its info can be printed for comparison.
    SmoothLm lm10 = getTinyLm();
    System.out.println(lm10.info());
    File lmFile = getTinyLmFile();
    try {
        SmoothLm lm = SmoothLm.builder(lmFile).logBase(Math.E).build();
        System.out.println(lm.info());
        // JUnit's signature is (expected, actual, delta): Math.E is the expected value.
        Assert.assertEquals(Math.E, lm.getLogBase(), 0.00001);
        LmVocabulary vocabulary = lm.getVocabulary();
        // <s>
        int[] is = { vocabulary.indexOf("<s>") };
        Assert.assertEquals(l(-1.716003), lm.getProbabilityValue(is), 0.0001);
        Assert.assertEquals(l(-1.716003), lm.getProbability(is), 0.0001);
        // <s> kedi
        int[] is2 = { vocabulary.indexOf("<s>"), vocabulary.indexOf("kedi") };
        Assert.assertEquals(l(-0.796249), lm.getProbabilityValue(is2), 0.0001);
        Assert.assertEquals(l(-0.796249), lm.getProbability(is2), 0.0001);
        // Ahmet dondurma yedi
        int[] is3 = { vocabulary.indexOf("Ahmet"), vocabulary.indexOf("dondurma"), vocabulary.indexOf("yedi") };
        Assert.assertEquals(l(-0.602060), lm.getProbabilityValue(is3), 0.0001);
        Assert.assertEquals(l(-0.602060), lm.getProbability(is3), 0.0001);
    } finally {
        // Clean up even when the build or an assertion above throws.
        java.nio.file.Files.delete(lmFile.toPath());
    }
}
Also used : LmVocabulary(zemberek.lm.LmVocabulary) File(java.io.File) Test(org.junit.Test)

Aggregations

LmVocabulary (zemberek.lm.LmVocabulary) 15, Test (org.junit.Test) 8, DataInputStream (java.io.DataInputStream) 4, ArrayList (java.util.ArrayList) 3, BufferedInputStream (java.io.BufferedInputStream) 2, File (java.io.File) 2, FileInputStream (java.io.FileInputStream) 2, List (java.util.List) 2, UIntMap (zemberek.core.collections.UIntMap) 2, Resources (com.google.common.io.Resources) 1, BufferedOutputStream (java.io.BufferedOutputStream) 1, DataOutputStream (java.io.DataOutputStream) 1, FileOutputStream (java.io.FileOutputStream) 1, IOException (java.io.IOException) 1, InputStream (java.io.InputStream) 1, RandomAccessFile (java.io.RandomAccessFile) 1, MappedByteBuffer (java.nio.MappedByteBuffer) 1, FileChannel (java.nio.channels.FileChannel) 1, LinkedHashSet (java.util.LinkedHashSet) 1, Set (java.util.Set) 1