Use of zemberek.core.collections.UIntMap in the zemberek-nlp project by ahmetaa.
Class DistanceList, method readFromBinary.
/**
 * Reads a {@code DistanceList} from a binary file, pairing it with a vocabulary loaded from
 * {@code vocabFile}.
 *
 * <p>Layout consumed from {@code binFile} (via {@link DataInputStream}, i.e. big-endian):
 * an int entry count, an int per-entry vector size, then for every entry an int source word
 * index followed by {@code vectorSize} pairs of (int word index, float score).
 *
 * <p>Progress is logged once every 10000 entries.
 *
 * @param binFile   path of the binary distance file
 * @param vocabFile path of the binary vocabulary file
 * @return the distance list built from the two files
 * @throws IOException if either file cannot be read or the binary file ends prematurely
 */
public static DistanceList readFromBinary(Path binFile, Path vocabFile) throws IOException {
  LmVocabulary vocab = LmVocabulary.loadFromBinary(vocabFile.toFile());
  try (DataInputStream dis = new DataInputStream(
      new BufferedInputStream(new FileInputStream(binFile.toFile()), 100000))) {
    // Header: number of entries, then the fixed number of (index, score) pairs per entry.
    final int entryCount = dis.readInt();
    final int pairsPerEntry = dis.readInt();
    UIntMap<_Distance> distances = new UIntMap<>(entryCount * 2);
    int entry = 0;
    while (entry < entryCount) {
      int sourceId = dis.readInt();
      int[] targetIndexes = new int[pairsPerEntry];
      float[] targetScores = new float[pairsPerEntry];
      // Index and score are interleaved in the stream.
      for (int k = 0; k < pairsPerEntry; k++) {
        targetIndexes[k] = dis.readInt();
        targetScores[k] = dis.readFloat();
      }
      if (entry % 10000 == 0) {
        Log.info("%d completed", entry);
      }
      distances.put(sourceId, new _Distance(targetIndexes, targetScores));
      entry++;
    }
    return new DistanceList(vocab, distances);
  }
}
Use of zemberek.core.collections.UIntMap in the zemberek-nlp project by ahmetaa.
Class WordVectorLookup, method loadFromText.
/**
 * Loads word vectors from a text file in which every data line starts with a word, followed by
 * a space and the vector values. The values part is handed to
 * {@code FloatArrays.fromString(floats, " ")}.
 *
 * <p>The file is processed in two passes over the same lines: the first collects the words into
 * an {@code LmVocabulary}, the second parses the vectors and stores them under the index the
 * vocabulary assigned to each word. Both passes trim the line before splitting it, so a line
 * with leading whitespace yields the same word in both. Blank lines are ignored.
 *
 * @param txtFile       path of the text file
 * @param skipFirstLine if {@code true}, the first line of the file is ignored (e.g. a header)
 * @return a lookup holding the vocabulary and one vector per data line
 * @throws IOException if the file cannot be read, or a non-blank data line contains no space
 *                     separating the word from its vector
 */
public static WordVectorLookup loadFromText(Path txtFile, boolean skipFirstLine) throws IOException {
  List<String> lines = Files.readAllLines(txtFile);
  int firstDataLine = skipFirstLine ? 1 : 0;

  // First pass: generate the vocabulary from the leading word of every data line.
  LmVocabulary.Builder builder = new LmVocabulary.Builder();
  for (int i = firstDataLine; i < lines.size(); i++) {
    String line = lines.get(i).trim();
    if (line.isEmpty()) {
      continue;
    }
    int index = line.indexOf(' ');
    if (index < 0) {
      throw new IOException("Malformed line " + (i + 1) + " in " + txtFile
          + ": expected a word followed by a space and its vector values.");
    }
    builder.add(line.substring(0, index));
  }
  LmVocabulary vocabulary = builder.generate();

  // Second pass: parse the vectors. Every non-blank line reaching this point was already
  // verified to contain a separator, and is trimmed exactly as in the first pass.
  UIntMap<Vector> vectors = new UIntMap<>(lines.size());
  for (int i = firstDataLine; i < lines.size(); i++) {
    String line = lines.get(i).trim();
    if (line.isEmpty()) {
      continue;
    }
    int index = line.indexOf(' ');
    String word = line.substring(0, index);
    String floats = line.substring(index + 1);
    float[] vector = FloatArrays.fromString(floats, " ");
    int wordIndex = vocabulary.indexOf(word);
    vectors.put(wordIndex, new Vector(wordIndex, vector));
  }
  return new WordVectorLookup(vocabulary, vectors);
}
Aggregations