use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
the class FastText method predict.
/**
 * Predicts labels for a single line of text.
 *
 * <p>The line is converted to word ids through {@code dict_.getLine}, word n-gram hashes are
 * appended according to {@code args_.wordNgrams}, and the model is asked for predictions using
 * the hidden representation of those ids.
 *
 * @param line the input text.
 * @param k passed to {@code model_.predict} as the number of predictions requested.
 * @return one {@link ScoredItem} per model prediction, pairing the label string looked up in the
 *     dictionary with the prediction's first component as its score; an empty list when the
 *     line yields no word ids.
 */
List<ScoredItem<String>> predict(String line, int k) {
  IntVector wordIds = new IntVector();
  IntVector labelIds = new IntVector();
  dict_.getLine(line, wordIds, labelIds, model_.getRng());
  dict_.addWordNgramHashes(wordIds, args_.wordNgrams);

  // Nothing to feed the model with.
  if (wordIds.isempty()) {
    return Collections.emptyList();
  }

  Vector outputLayer = new Vector(dict_.nlabels());
  Vector hiddenLayer = model_.computeHidden(wordIds.copyOf());
  List<Model.FloatIntPair> predictions = model_.predict(k, hiddenLayer, outputLayer);

  List<ScoredItem<String>> scoredLabels = new ArrayList<>(predictions.size());
  for (Model.FloatIntPair prediction : predictions) {
    String label = dict_.getLabel(prediction.second);
    scoredLabels.add(new ScoredItem<>(label, prediction.first));
  }
  return scoredLabels;
}
use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
the class Model method buildTree.
/**
 * Builds the binary tree stored in {@code tree} from per-output counts and records, for every
 * leaf, the path to the root and the branch codes along that path.
 *
 * <p>The first {@code osz_} nodes are leaves carrying {@code counts[i]}; the remaining
 * {@code osz_ - 1} nodes are internal nodes created by repeatedly merging the two
 * smallest-count candidates. The merge scans leaves from the highest index downwards, so it
 * presumably expects {@code counts} to be sorted in descending order - confirm against callers.
 *
 * @param counts count for each of the {@code osz_} leaves.
 */
private void buildTree(long[] counts) {
  final int leafCount = osz_;
  final int nodeCount = 2 * leafCount - 1;

  // Allocate every node as a detached node with a huge sentinel count.
  tree = new Node[nodeCount];
  for (int idx = 0; idx < nodeCount; idx++) {
    Node fresh = new Node();
    tree[idx] = fresh;
    fresh.parent = -1;
    fresh.left = -1;
    fresh.right = -1;
    fresh.count = (long) 1e15;
    fresh.binary = false;
  }

  // Leaves take their real counts.
  for (int idx = 0; idx < leafCount; idx++) {
    tree[idx].count = counts[idx];
  }

  // leafCursor walks the leaves backwards, innerCursor walks the internal nodes forwards.
  int leafCursor = leafCount - 1;
  int innerCursor = leafCount;
  for (int parentIdx = leafCount; parentIdx < nodeCount; parentIdx++) {
    int[] children = new int[2];
    for (int slot = 0; slot < children.length; slot++) {
      // Short-circuit keeps tree[innerCursor] from being read once leaves are exhausted.
      boolean takeLeaf =
          leafCursor >= 0 && tree[leafCursor].count < tree[innerCursor].count;
      if (takeLeaf) {
        children[slot] = leafCursor;
        leafCursor--;
      } else {
        children[slot] = innerCursor;
        innerCursor++;
      }
    }
    final int leftIdx = children[0];
    final int rightIdx = children[1];
    Node parentNode = tree[parentIdx];
    parentNode.left = leftIdx;
    parentNode.right = rightIdx;
    parentNode.count = tree[leftIdx].count + tree[rightIdx].count;
    tree[leftIdx].parent = parentIdx;
    tree[rightIdx].parent = parentIdx;
    // The second-picked child is the one flagged as the "binary" branch.
    tree[rightIdx].binary = true;
  }

  // For each leaf, climb to the root collecting internal-node offsets and branch codes.
  for (int leafIdx = 0; leafIdx < leafCount; leafIdx++) {
    IntVector leafPath = new IntVector();
    IntVector leafCode = new IntVector();
    for (int cur = leafIdx; tree[cur].parent != -1; cur = tree[cur].parent) {
      leafPath.add(tree[cur].parent - leafCount);
      leafCode.add(tree[cur].binary ? 1 : 0);
    }
    paths.add(leafPath);
    codes.add(leafCode);
  }
}
use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
the class FastText method test.
/**
 * Runs the model over every line of a UTF-8 text file and logs the predictions to a meter.
 *
 * <p>Each line is converted to word ids and label ids through {@code dict_.getLine}. Lines that
 * produce no labels or no words are skipped; for the rest, the model predictions and the line's
 * label ids are handed to {@code meter.log}.
 *
 * @param in path of the file to read, decoded as UTF-8.
 * @param k passed to {@code model_.predict} as the number of predictions requested.
 * @param threshold passed to {@code model_.predict} as the prediction threshold.
 * @param meter receives the label ids and predictions of every evaluated line.
 * @throws IOException if the file cannot be opened, read, or closed.
 */
public void test(Path in, int k, float threshold, Meter meter) throws IOException {
  // try-with-resources guarantees the reader is closed, including when reading or
  // prediction throws part-way through the file.
  try (BufferedReader reader = Files.newBufferedReader(in, StandardCharsets.UTF_8)) {
    String lineStr;
    while ((lineStr = reader.readLine()) != null) {
      IntVector words = new IntVector();
      IntVector labels = new IntVector();
      dict_.getLine(lineStr, words, labels);
      if (labels.size() > 0 && words.size() > 0) {
        List<Model.FloatIntPair> modelPredictions =
            model_.predict(words.copyOf(), threshold, k);
        meter.log(labels, modelPredictions);
      }
    }
  }
}
use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
the class FastText method getSentenceVector.
/**
 * Computes a vector of dimension {@code args_.dim} for a sentence.
 *
 * <p>For any model other than {@code model_name.supervised}, the sentence is split on
 * whitespace, each non-empty token's word vector is scaled by the inverse of its norm (tokens
 * with a non-positive norm are ignored), and the scaled vectors are averaged. For a supervised
 * model, the input vectors of all ids produced by {@code dict_.getLine} are accumulated and
 * averaged over the number of ids.
 *
 * @param s the sentence text.
 * @return the sentence vector; it is left as created by {@code new Vector(args_.dim)} when
 *     nothing contributes to it.
 */
Vector getSentenceVector(String s) {
  Vector sentenceVec = new Vector(args_.dim);

  if (args_.model != model_name.supervised) {
    // Average of unit-normalised word vectors.
    int contributing = 0;
    for (String word : s.split("\\s+")) {
      if (word.length() != 0) {
        Vector wordVec = getWordVector(word);
        float length = wordVec.norm();
        if (length > 0) {
          wordVec.mul(1.0f / length);
          sentenceVec.addVector(wordVec);
          contributing++;
        }
      }
    }
    if (contributing > 0) {
      sentenceVec.mul(1.0f / contributing);
    }
    return sentenceVec;
  }

  // Supervised: average the input vectors of every id on the line.
  IntVector ids = new IntVector();
  IntVector labels = new IntVector();
  dict_.getLine(s, ids, labels);
  int idx = 0;
  while (idx < ids.size()) {
    addInputVector(sentenceVec, ids.get(idx));
    idx++;
  }
  if (!ids.isempty()) {
    sentenceVec.mul(1.0f / ids.size());
  }
  return sentenceVec;
}
use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
the class FastText method predict.
/**
 * Predicts labels for a single line of text using a threshold.
 *
 * <p>The line is converted to word ids through {@code dict_.getLine}; a copy of those ids is
 * passed to {@code model_.predict} together with {@code threshold} and {@code k}.
 *
 * @param line the input text.
 * @param k passed to {@code model_.predict} as the number of predictions requested.
 * @param threshold passed to {@code model_.predict} as the prediction threshold.
 * @return one {@link ScoredItem} per model prediction, pairing the label string looked up in the
 *     dictionary with the prediction's first component as its score; an empty list when the
 *     line yields no word ids.
 */
public List<ScoredItem<String>> predict(String line, int k, float threshold) {
  IntVector wordIds = new IntVector();
  IntVector labelIds = new IntVector();
  dict_.getLine(line, wordIds, labelIds);

  // Nothing to feed the model with.
  if (wordIds.isempty()) {
    return Collections.emptyList();
  }

  List<Model.FloatIntPair> predictions = model_.predict(wordIds.copyOf(), threshold, k);
  List<ScoredItem<String>> scoredLabels = new ArrayList<>(predictions.size());
  for (Model.FloatIntPair prediction : predictions) {
    String label = dict_.getLabel(prediction.second);
    scoredLabels.add(new ScoredItem<>(label, prediction.first));
  }
  return scoredLabels;
}
Aggregations