Search in sources :

Example 1 with LmVocabulary

use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.

the class DistanceList method readFromBinary.

/**
 * Reads a {@code DistanceList} from a binary distance file and a binary vocabulary file.
 *
 * <p>Layout consumed from {@code binFile}: an int word count, an int vector size, and then for
 * each word an int source word index followed by {@code vectorSize} pairs of
 * (int word index, float score).
 *
 * @param binFile binary file containing the distance records.
 * @param vocabFile file handed to {@code LmVocabulary.loadFromBinary}.
 * @return a DistanceList wrapping the loaded vocabulary and the populated distance map.
 * @throws IOException if an I/O error occurs while reading.
 */
public static DistanceList readFromBinary(Path binFile, Path vocabFile) throws IOException {
    LmVocabulary vocabulary = LmVocabulary.loadFromBinary(vocabFile.toFile());
    try (DataInputStream dis = new DataInputStream(
            new BufferedInputStream(
                    new FileInputStream(binFile.toFile()), 100000))) {
        // Header: number of records, then number of (index, score) pairs per record.
        final int recordCount = dis.readInt();
        final int pairCount = dis.readInt();
        UIntMap<_Distance> result = new UIntMap<>(recordCount * 2);
        int recordNo = 0;
        while (recordNo < recordCount) {
            final int sourceId = dis.readInt();
            final int[] neighborIds = new int[pairCount];
            final float[] neighborScores = new float[pairCount];
            int k = 0;
            while (k < pairCount) {
                neighborIds[k] = dis.readInt();
                neighborScores[k] = dis.readFloat();
                k++;
            }
            // Progress message every 10000 records, starting with record 0.
            if (recordNo % 10000 == 0) {
                Log.info("%d completed", recordNo);
            }
            result.put(sourceId, new _Distance(neighborIds, neighborScores));
            recordNo++;
        }
        return new DistanceList(vocabulary, result);
    }
}
Also used : LmVocabulary(zemberek.lm.LmVocabulary) BufferedInputStream(java.io.BufferedInputStream) UIntMap(zemberek.core.collections.UIntMap) DataInputStream(java.io.DataInputStream) FileInputStream(java.io.FileInputStream)

Example 2 with LmVocabulary

use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.

the class WordVectorLookup method loadFromText.

/**
 * Loads word vectors from a text file where each data line has the form
 * {@code word value1 value2 ...}, separated by single spaces.
 *
 * <p>Every line is trimmed before it is parsed, both when the vocabulary is built and when the
 * vectors are read, so the word registered in the vocabulary is always the same word that is
 * later looked up. Lines that are blank after trimming are skipped.
 *
 * @param txtFile text file to read; all lines are loaded into memory.
 * @param skipFirstLine if true, the first line of the file is ignored.
 * @return a lookup built from the generated vocabulary and the parsed vectors.
 * @throws IOException if the file cannot be read, or if a non-blank line contains no space
 *     separating the word from its values.
 */
public static WordVectorLookup loadFromText(Path txtFile, boolean skipFirstLine) throws IOException {
    List<String> lines = Files.readAllLines(txtFile);
    final int firstDataLine = skipFirstLine ? 1 : 0;
    // generate vocabulary
    LmVocabulary.Builder builder = new LmVocabulary.Builder();
    for (int i = firstDataLine; i < lines.size(); i++) {
        // Trim here as well; otherwise leading whitespace would register "" as the word.
        String line = lines.get(i).trim();
        if (line.isEmpty()) {
            continue;
        }
        int index = line.indexOf(' ');
        if (index < 0) {
            throw new IOException("Malformed line " + (i + 1) + " in " + txtFile
                + ": expected a word followed by vector values but found [" + line + "]");
        }
        builder.add(line.substring(0, index));
    }
    LmVocabulary vocabulary = builder.generate();
    UIntMap<Vector> vectors = new UIntMap<>(lines.size());
    for (int i = firstDataLine; i < lines.size(); i++) {
        String line = lines.get(i).trim();
        if (line.isEmpty()) {
            continue;
        }
        // The first pass already rejected non-blank lines without a space, so index >= 0 here.
        int index = line.indexOf(' ');
        String word = line.substring(0, index);
        String floats = line.substring(index + 1);
        float[] vector = FloatArrays.fromString(floats, " ");
        int wordIndex = vocabulary.indexOf(word);
        vectors.put(wordIndex, new Vector(wordIndex, vector));
    }
    return new WordVectorLookup(vocabulary, vectors);
}
Also used : LmVocabulary(zemberek.lm.LmVocabulary) UIntMap(zemberek.core.collections.UIntMap)

Example 3 with LmVocabulary

use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.

the class SmoothLmTest method testProbWithBackoff.

/**
 * Compares {@code getProbability} for the pair ("Ahmet", "yemez") with the value given by the
 * back-off formula noted in the body, using a tolerance of 0.0001.
 */
@Test
public void testProbWithBackoff() throws IOException {
    final SmoothLm model = getTinyLm();
    final LmVocabulary vocab = model.getVocabulary();
    // p(yemez|ahmet) = p(yemez) + b(ahmet) if p(yemez|ahmet) does not exist.
    final double expectedValue = -1.414973 + -0.316824;
    final int firstWordId = vocab.indexOf("Ahmet");
    final int secondWordId = vocab.indexOf("yemez");
    final double actualValue = model.getProbability(firstWordId, secondWordId);
    Assert.assertEquals(expectedValue, actualValue, 0.0001);
}
Also used : LmVocabulary(zemberek.lm.LmVocabulary) Test(org.junit.Test)

Example 4 with LmVocabulary

use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.

the class SmoothLmTest method testExplain.

/**
 * Prints the output of {@code explain} for several word sequences to standard output.
 * The test makes no assertions; it only exercises the calls.
 */
@Test
public void testExplain() throws IOException {
    final SmoothLm model = getTinyLm();
    final LmVocabulary vocab = model.getVocabulary();
    // Single-token case, built from indexOf directly.
    final int[] startOnly = new int[] { vocab.indexOf("<s>") };
    System.out.println(model.explain(startOnly));
    // Multi-token cases, explained in the listed order.
    final String[][] sequences = {
        { "<s>", "kedi" },
        { "Ahmet", "dondurma", "yedi" },
        { "Ahmet", "yemez" },
        { "Ahmet", "yemez", "kırmızı" }
    };
    for (String[] sequence : sequences) {
        int[] ids = vocab.toIndexes(sequence);
        System.out.println(model.explain(ids));
    }
}
Also used : LmVocabulary(zemberek.lm.LmVocabulary) Test(org.junit.Test)

Example 5 with LmVocabulary

use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.

the class WordDistances method readFromBinary.

/**
 * Reads a list of {@code WordDistances} from a binary distance file, resolving every stored
 * word index to its word through the vocabulary loaded from {@code vocabFile}.
 *
 * <p>Layout consumed from {@code binFile}: an int word count, an int vector size, and then for
 * each word an int word index followed by {@code vectorSize} pairs of
 * (int word index, float value).
 *
 * @param binFile binary file containing the distance records.
 * @param vocabFile file handed to {@code LmVocabulary.loadFromBinary}.
 * @return one WordDistances entry per record, in file order.
 * @throws IOException if an I/O error occurs while reading.
 */
public static List<WordDistances> readFromBinary(Path binFile, Path vocabFile) throws IOException {
    LmVocabulary vocabulary = LmVocabulary.loadFromBinary(vocabFile.toFile());
    try (DataInputStream in = new DataInputStream(new BufferedInputStream(new FileInputStream(binFile.toFile()), 100000))) {
        int wordSize = in.readInt();
        int vectorSize = in.readInt();
        List<WordDistances> distLists = new ArrayList<>(wordSize);
        for (int i = 0; i < wordSize; i++) {
            String s = vocabulary.getWord(in.readInt());
            Distance[] distances = new Distance[vectorSize];
            for (int j = 0; j < vectorSize; j++) {
                String word = vocabulary.getWord(in.readInt());
                // Keep the value primitive; the boxed Float allocated a wrapper per entry.
                float f = in.readFloat();
                distances[j] = new Distance(word, f);
            }
            // Progress message every 10000 records, starting with record 0.
            if (i % 10000 == 0) {
                Log.info("%d completed", i);
            }
            distLists.add(new WordDistances(s, distances));
        }
        return distLists;
    }
}
Also used : LmVocabulary(zemberek.lm.LmVocabulary) BufferedInputStream(java.io.BufferedInputStream) ArrayList(java.util.ArrayList) DataInputStream(java.io.DataInputStream) FileInputStream(java.io.FileInputStream)

Aggregations

LmVocabulary (zemberek.lm.LmVocabulary): 15, Test (org.junit.Test): 8, DataInputStream (java.io.DataInputStream): 4, ArrayList (java.util.ArrayList): 3, BufferedInputStream (java.io.BufferedInputStream): 2, File (java.io.File): 2, FileInputStream (java.io.FileInputStream): 2, List (java.util.List): 2, UIntMap (zemberek.core.collections.UIntMap): 2, Resources (com.google.common.io.Resources): 1, BufferedOutputStream (java.io.BufferedOutputStream): 1, DataOutputStream (java.io.DataOutputStream): 1, FileOutputStream (java.io.FileOutputStream): 1, IOException (java.io.IOException): 1, InputStream (java.io.InputStream): 1, RandomAccessFile (java.io.RandomAccessFile): 1, MappedByteBuffer (java.nio.MappedByteBuffer): 1, FileChannel (java.nio.channels.FileChannel): 1, LinkedHashSet (java.util.LinkedHashSet): 1, Set (java.util.Set): 1