Use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
From the class Dictionary, method computeSubWords.
/**
 * Collects the sub-word values for a word.
 *
 * <p>The configured sub-word hash provider supplies the raw hashes for {@code word}; each hash is
 * reduced with {@code % args_.bucket} and handed to {@code pushHash}, which records it in the
 * working vector.
 *
 * @param word the word whose sub-word hashes are requested
 * @param wordId the id forwarded to the hash provider together with the word
 * @return a copy of the values gathered in the working vector
 */
private int[] computeSubWords(String word, int wordId) {
  final int[] subWordHashes = args_.subWordHashProvider.getHashes(word, wordId);
  final IntVector collected = new IntVector();
  for (int idx = 0; idx < subWordHashes.length; idx++) {
    // Fold every hash into the bucket range before it is stored.
    final int bucketed = subWordHashes[idx] % args_.bucket;
    pushHash(collected, bucketed);
  }
  return collected.copyOf();
}
Use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
From the class ProductQuantizer, method train.
/**
 * Trains the centroids of every sub-quantizer from {@code n} rows stored in {@code x}.
 *
 * <p>Row {@code i} is read starting at offset {@code i * dim_}; sub-quantizer {@code m} reads
 * its slice at an additional offset of {@code m * dsub_}. At most {@code max_points_} rows are
 * used per sub-quantizer; when that is fewer than {@code n}, the row permutation is reshuffled
 * before each sub-quantizer so a different sample is drawn.
 *
 * @param n number of rows available in {@code x}
 * @param x the row data, laid out row after row
 * @throws IllegalArgumentException if {@code n} is smaller than {@code ksub_}
 */
void train(int n, float[] x) {
  if (n < ksub_) {
    // Report the real threshold: the check accepts n == ksub_, and ksub_ need not be 256.
    throw new IllegalArgumentException(
        "Matrix too small for quantization, must have at least " + ksub_
            + " rows. But it is " + n);
  }
  // Identity permutation of the row indices; shuffled below only when sub-sampling.
  IntVector perm = new IntVector(new int[n]);
  for (int i = 0; i < n; i++) {
    perm.safeSet(i, i);
  }
  int d = dsub_;
  int np = Math.min(n, max_points_);
  // Sized for the regular sub-vector width and reused for every sub-quantizer.
  // NOTE(review): assumes lastdsub_ <= dsub_ so the last slice also fits - confirm.
  float[] xslice = new float[np * dsub_];
  for (int m = 0; m < nsubq_; m++) {
    if (m == nsubq_ - 1) {
      // The final sub-quantizer uses its own width.
      d = lastdsub_;
    }
    if (np != n) {
      perm.shuffle(rng);
    }
    // Gather the m-th slice of each selected row into a dense np x d buffer.
    for (int j = 0; j < np; j++) {
      System.arraycopy(x, perm.get(j) * dim_ + m * dsub_, xslice, j * d, d);
    }
    kmeans(new FArray(xslice), get_centroids(m, (byte) 0), np, d);
  }
}
Use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
From the class FastText, method textVector.
/**
 * Computes a vector for a piece of text.
 *
 * <p>The text is converted to ids by the dictionary, word n-gram hashes are appended, and the
 * input-matrix row of every resulting id is added to the result. The sum is then multiplied by
 * {@code 1 / (number of ids)}. When the dictionary yields no ids, the freshly created vector is
 * returned untouched.
 *
 * @param s the text to convert
 * @return a vector of dimension {@code args_.dim}
 */
Vector textVector(String s) {
  Vector result = new Vector(args_.dim);
  IntVector tokens = new IntVector();
  IntVector labels = new IntVector();
  dict_.getLine(s, tokens, labels, model_.getRng());
  dict_.addWordNgramHashes(tokens, args_.wordNgrams);

  // Size is taken after n-gram expansion, so the scale includes the appended hashes.
  final int tokenCount = tokens.size();
  if (tokenCount != 0) {
    final int[] ids = tokens.copyOf();
    for (int idx = 0; idx < ids.length; idx++) {
      result.addRow(model_.wi_, ids[idx]);
    }
    result.mul((float) (1.0 / tokenCount));
  }
  return result;
}
Use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
From the class FastText, method textVectors.
/**
 * Accumulates a single vector over all sentences of a paragraph.
 *
 * <p>For every sentence the dictionary produces ids; sentences that yield none are skipped
 * before n-gram expansion. Otherwise word n-gram hashes are appended, the input-matrix row of
 * each id is added to the shared accumulator, and the accumulator is multiplied by
 * {@code 1 / (number of ids after expansion)}.
 *
 * <p>NOTE(review): the multiplication is applied to the shared accumulator, so contributions of
 * earlier sentences are rescaled again by every later sentence. This may not be the intended
 * averaging - confirm before relying on the magnitudes.
 *
 * @param paragraph the sentences to process, in order
 * @return a vector of dimension {@code args_.dim}
 */
Vector textVectors(List<String> paragraph) {
  Vector accumulated = new Vector(args_.dim);
  for (String sentence : paragraph) {
    IntVector tokens = new IntVector();
    IntVector labels = new IntVector();
    dict_.getLine(sentence, tokens, labels, model_.getRng());
    if (tokens.size() != 0) {
      dict_.addWordNgramHashes(tokens, args_.wordNgrams);
      final int[] ids = tokens.copyOf();
      for (int idx = 0; idx < ids.length; idx++) {
        accumulated.addRow(model_.wi_, ids[idx]);
      }
      // Size is re-read here because n-gram expansion may have grown the vector.
      accumulated.mul((float) (1.0 / tokens.size()));
    }
  }
  return accumulated;
}
Use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
From the class FastText, method test.
/**
 * Evaluates the model on a labelled file and logs precision and recall at {@code k}.
 *
 * <p>The file is read line by line as UTF-8. A line counts as an example only when the
 * dictionary yields at least one label and at least one input id for it. For each example the
 * top {@code k} predictions are requested and every prediction whose id is among the line's
 * labels counts as a hit. Logged values are {@code hits / (k * examples)} for P@k and
 * {@code hits / totalLabels} for R@k; there is no guard for a file without usable examples.
 *
 * @param in path of the evaluation file
 * @param k number of predictions requested per example
 * @throws IOException if the file cannot be opened or read
 */
void test(Path in, int k) throws IOException {
  int nexamples = 0, nlabels = 0;
  double precision = 0.0;
  // try-with-resources guarantees the reader is closed, also when reading or prediction fails.
  try (BufferedReader reader = Files.newBufferedReader(in, StandardCharsets.UTF_8)) {
    String lineStr;
    while ((lineStr = reader.readLine()) != null) {
      IntVector line = new IntVector(), labels = new IntVector();
      dict_.getLine(lineStr, line, labels, model_.getRng());
      dict_.addWordNgramHashes(line, args_.wordNgrams);
      if (labels.size() > 0 && line.size() > 0) {
        List<Model.FloatIntPair> modelPredictions = model_.predict(line.copyOf(), k);
        for (Model.FloatIntPair pair : modelPredictions) {
          // A prediction is a hit when its id is one of the labels of this line.
          if (labels.contains(pair.second)) {
            precision += 1.0;
          }
        }
        nexamples++;
        nlabels += labels.size();
      }
    }
  }
  Log.info("P@%d: %.3f R@%d: %.3f Number of examples = %d", k, precision / (k * nexamples), k, precision / nlabels, nexamples);
}
Aggregations