use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.
the class DistanceList method readFromBinary.
/**
 * Reads a {@link DistanceList} from a binary file, pairing it with a vocabulary loaded
 * from a separate binary vocabulary file.
 *
 * <p>Layout as consumed by this method: an int record count, an int giving the number
 * of (index, score) pairs per record, then for every record an int source word index
 * followed by that many (int word index, float score) pairs.
 *
 * @param binFile binary file holding the distance records.
 * @param vocabFile binary vocabulary file, loaded through {@code LmVocabulary.loadFromBinary}.
 * @return the distance list built from the vocabulary and the records read.
 * @throws IOException if a file cannot be read, ends prematurely, or declares negative
 *     counts in its header.
 */
public static DistanceList readFromBinary(Path binFile, Path vocabFile) throws IOException {
  LmVocabulary vocabulary = LmVocabulary.loadFromBinary(vocabFile.toFile());
  try (DataInputStream in = new DataInputStream(
      new BufferedInputStream(new FileInputStream(binFile.toFile()), 100000))) {
    int wordSize = in.readInt();
    int vectorSize = in.readInt();
    // A corrupt or foreign file can carry negative counts. Report that as an I/O problem
    // instead of letting it surface later as NegativeArraySizeException or a bad map size.
    if (wordSize < 0 || vectorSize < 0) {
      throw new IOException("Corrupt distance file " + binFile
          + ": wordSize=" + wordSize + ", vectorSize=" + vectorSize);
    }
    UIntMap<_Distance> distanceMap = new UIntMap<>(wordSize * 2);
    for (int i = 0; i < wordSize; i++) {
      int sourceWordIndex = in.readInt();
      int[] wordIndexes = new int[vectorSize];
      float[] scores = new float[vectorSize];
      // Index and score are interleaved in the stream, so they must be read alternately.
      for (int j = 0; j < vectorSize; j++) {
        wordIndexes[j] = in.readInt();
        scores[j] = in.readFloat();
      }
      // Progress report every 10000 records.
      if (i % 10000 == 0) {
        Log.info("%d completed", i);
      }
      distanceMap.put(sourceWordIndex, new _Distance(wordIndexes, scores));
    }
    return new DistanceList(vocabulary, distanceMap);
  }
}
use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.
the class WordVectorLookup method loadFromText.
/**
 * Loads word vectors from a text file in which every data line holds a word, a single
 * space, and then the vector values separated by spaces.
 *
 * <p>Each line is trimmed before parsing and blank lines are skipped. The vocabulary is
 * built from the words in file order, and each vector is stored under the index the
 * vocabulary reports for its word.
 *
 * @param txtFile text file to read.
 * @param skipFirstLine if true, the first line of the file is ignored.
 * @return a lookup combining the generated vocabulary and the parsed vectors.
 * @throws IOException if the file cannot be read or a non-blank line contains no space
 *     separating the word from its values.
 */
public static WordVectorLookup loadFromText(Path txtFile, boolean skipFirstLine) throws IOException {
  List<String> lines = Files.readAllLines(txtFile);
  int firstDataLine = skipFirstLine ? 1 : 0;

  // Parse every line exactly once so that the vocabulary and the vectors are derived
  // from the same trimmed text. Previously the vocabulary pass used the raw line while
  // the vector pass used the trimmed one, so they could disagree on the word.
  String[] words = new String[lines.size()];
  String[] floatTexts = new String[lines.size()];
  int count = 0;
  LmVocabulary.Builder builder = new LmVocabulary.Builder();
  for (int i = firstDataLine; i < lines.size(); i++) {
    String line = lines.get(i).trim();
    if (line.isEmpty()) {
      // Blank lines (for example a trailing newline at end of file) carry no data.
      continue;
    }
    int index = line.indexOf(' ');
    if (index < 0) {
      throw new IOException("Malformed line " + (i + 1) + " in " + txtFile
          + ": expected a word followed by space separated values.");
    }
    words[count] = line.substring(0, index);
    floatTexts[count] = line.substring(index + 1);
    builder.add(words[count]);
    count++;
  }

  // The vocabulary must be complete before word indexes can be looked up.
  LmVocabulary vocabulary = builder.generate();
  UIntMap<Vector> vectors = new UIntMap<>(lines.size());
  for (int i = 0; i < count; i++) {
    float[] vector = FloatArrays.fromString(floatTexts[i], " ");
    int wordIndex = vocabulary.indexOf(words[i]);
    vectors.put(wordIndex, new Vector(wordIndex, vector));
  }
  return new WordVectorLookup(vocabulary, vectors);
}
use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.
the class SmoothLmTest method testProbWithBackoff.
/**
 * Verifies the score the tiny test model returns for the word pair "Ahmet", "yemez"
 * against a value assembled from two constants.
 */
@Test
public void testProbWithBackoff() throws IOException {
  final SmoothLm model = getTinyLm();
  final LmVocabulary vocab = model.getVocabulary();
  final int ahmetId = vocab.indexOf("Ahmet");
  final int yemezId = vocab.indexOf("yemez");
  // p(yemez|ahmet) = p(yemez) + b(ahmet) if p(yemez|ahmet) does not exist.
  // NOTE(review): the constants are named after the order of the terms in the formula
  // above - confirm against the tiny model data.
  final double pYemez = -1.414973;
  final double bAhmet = -0.316824;
  final double expected = pYemez + bAhmet;
  Assert.assertEquals(expected, model.getProbability(ahmetId, yemezId), 0.0001);
}
use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.
the class SmoothLmTest method testExplain.
/**
 * Prints the output of {@code explain} for several word sequences of the tiny test
 * model. The test makes no assertions; it only exercises the calls.
 */
@Test
public void testExplain() throws IOException {
  final SmoothLm model = getTinyLm();
  final LmVocabulary vocab = model.getVocabulary();

  // <s>
  final int[] sentenceStartOnly = new int[]{vocab.indexOf("<s>")};
  System.out.println(model.explain(sentenceStartOnly));

  // <s> kedi
  System.out.println(model.explain(vocab.toIndexes("<s>", "kedi")));

  // Ahmet dondurma yedi
  System.out.println(model.explain(vocab.toIndexes("Ahmet", "dondurma", "yedi")));

  // Ahmet yemez
  System.out.println(model.explain(vocab.toIndexes("Ahmet", "yemez")));

  // Ahmet yemez kırmızı
  System.out.println(model.explain(vocab.toIndexes("Ahmet", "yemez", "kırmızı")));
}
use of zemberek.lm.LmVocabulary in project zemberek-nlp by ahmetaa.
the class WordDistances method readFromBinary.
/**
 * Reads a list of {@link WordDistances} from a binary file, resolving every stored word
 * index to its word through a vocabulary loaded from a separate binary vocabulary file.
 *
 * <p>Layout as consumed by this method: an int record count, an int giving the number
 * of entries per record, then for every record an int word index followed by that many
 * (int word index, float value) pairs.
 *
 * @param binFile binary file holding the distance records.
 * @param vocabFile binary vocabulary file, loaded through {@code LmVocabulary.loadFromBinary}.
 * @return one {@code WordDistances} per record, in file order.
 * @throws IOException if a file cannot be read, ends prematurely, or declares negative
 *     counts in its header.
 */
public static List<WordDistances> readFromBinary(Path binFile, Path vocabFile) throws IOException {
  LmVocabulary vocabulary = LmVocabulary.loadFromBinary(vocabFile.toFile());
  try (DataInputStream in = new DataInputStream(
      new BufferedInputStream(new FileInputStream(binFile.toFile()), 100000))) {
    int wordSize = in.readInt();
    int vectorSize = in.readInt();
    // A corrupt or foreign file can carry negative counts. Report that as an I/O problem
    // instead of letting it surface later as IllegalArgumentException (list capacity) or
    // NegativeArraySizeException (array allocation).
    if (wordSize < 0 || vectorSize < 0) {
      throw new IOException("Corrupt distance file " + binFile
          + ": wordSize=" + wordSize + ", vectorSize=" + vectorSize);
    }
    List<WordDistances> distLists = new ArrayList<>(wordSize);
    for (int i = 0; i < wordSize; i++) {
      String s = vocabulary.getWord(in.readInt());
      Distance[] distances = new Distance[vectorSize];
      // Word index and value are interleaved in the stream, so they are read alternately.
      for (int j = 0; j < vectorSize; j++) {
        String word = vocabulary.getWord(in.readInt());
        // Keep the value primitive; boxing to Float for every entry is unnecessary.
        float f = in.readFloat();
        distances[j] = new Distance(word, f);
      }
      // Progress report every 10000 records.
      if (i % 10000 == 0) {
        Log.info("%d completed", i);
      }
      distLists.add(new WordDistances(s, distances));
    }
    return distLists;
  }
}
Aggregations