use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
the class FastText method textVectors.
/**
 * Builds one vector of dimension {@code args_.dim} from the given strings.
 *
 * <p>Each string is converted to ids with {@code dict_.getLine}; strings that yield no ids are
 * skipped. For the others, word n-gram hashes are appended, the row of {@code model_.wi_} for
 * every id is added to the accumulator, and the accumulator is multiplied by
 * {@code 1 / (number of ids)}.
 *
 * <p>NOTE(review): the accumulator is shared across all strings, so the scaling applied for a
 * later string also rescales the contributions of earlier ones - confirm this is intended.
 *
 * @param paragraph strings to process, in order
 * @return the accumulated vector (all zeros if the Vector constructor zero-fills and no string
 *     produced ids - TODO confirm initial contents of Vector)
 */
Vector textVectors(List<String> paragraph) {
  Vector result = new Vector(args_.dim);
  for (String text : paragraph) {
    IntVector ids = new IntVector();
    dict_.getLine(text, ids, model_.getRng());
    if (ids.size() != 0) {
      dict_.addWordNgramHashes(ids, args_.wordNgrams);
      // Snapshot the ids (words plus n-gram hashes) and add the matching input rows.
      int[] rowIds = ids.copyOf();
      for (int pos = 0; pos < rowIds.length; pos++) {
        result.addRow(model_.wi_, rowIds[pos]);
      }
      float scale = (float) (1.0 / ids.size());
      result.mul(scale);
    }
  }
  return result;
}
use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
the class FastText method test.
/**
 * Evaluates the model against a UTF-8 text file, one example per line.
 *
 * <p>Every line is split into word ids and label ids by {@code dict_.getLine}. Lines that yield
 * at least one word and one label are scored: the model's top-{@code k} predictions above
 * {@code threshold} are requested, and each prediction whose id is among the line's labels
 * counts as a hit.
 *
 * <p>The result carries {@code hits / (k * examples)} (precision at k) and
 * {@code hits / totalLabels} (recall). If no line is usable, both are float {@code 0/0}
 * divisions and evaluate to {@code NaN}.
 *
 * @param in path of the evaluation file
 * @param k number of predictions requested per example
 * @param threshold threshold passed to {@code model_.predict}
 * @return evaluation summary built from the two ratios, {@code k} and the example count
 * @throws IOException if the file cannot be opened or read
 */
public EvaluationResult test(Path in, int k, float threshold) throws IOException {
  int nexamples = 0, nlabels = 0;
  float precision = 0.0f;
  // try-with-resources: the reader is closed on normal completion and on exceptions alike.
  try (BufferedReader reader = Files.newBufferedReader(in, StandardCharsets.UTF_8)) {
    String lineStr;
    while ((lineStr = reader.readLine()) != null) {
      IntVector words = new IntVector(), labels = new IntVector();
      dict_.getLine(lineStr, words, labels);
      if (labels.size() > 0 && words.size() > 0) {
        List<Model.FloatIntPair> modelPredictions =
            model_.predict(words.copyOf(), threshold, k);
        for (Model.FloatIntPair pair : modelPredictions) {
          if (labels.contains(pair.second)) {
            precision += 1.0f;
          }
        }
        nexamples++;
        nlabels += labels.size();
      }
    }
  }
  return new EvaluationResult(precision / (k * nexamples), precision / nlabels, k, nexamples);
}
use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
the class Model method hierarchicalSoftmax.
/**
 * Sums the {@code binaryLogistic} loss over every entry of the path stored for {@code target}.
 *
 * <p>For position {@code i}, the node is {@code paths.get(target).get(i)} and the binary label
 * is {@code true} when {@code codes.get(target).get(i)} equals 1.
 *
 * @param grad_ gradient vector forwarded to {@code binaryLogistic}
 * @param hidden hidden vector forwarded to {@code binaryLogistic}
 * @param target index used to look up the code and path
 * @param lr learning rate forwarded to {@code binaryLogistic}
 * @return sum of the per-node losses; 0 when the path is empty
 */
private float hierarchicalSoftmax(Vector grad_, Vector hidden, int target, float lr) {
  IntVector code = hierarchicalSoftmax.codes.get(target);
  IntVector path = hierarchicalSoftmax.paths.get(target);
  float total = 0.0f;
  int step = 0;
  while (step < path.size()) {
    int node = path.get(step);
    boolean bitIsOne = code.get(step) == 1;
    total += binaryLogistic(grad_, hidden, node, bitIsOne, lr);
    step++;
  }
  return total;
}
use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
the class Dictionary method getLine.
/**
 * Tokenizes {@code line} and appends the resulting word and label ids to the given vectors.
 *
 * <p>Tokens starting with {@code "#"} are skipped and not counted. For every other token:
 * <ul>
 *   <li>a word token contributes its subword ids to {@code words} (via {@code addSubwords}) and
 *       its hash to the list later used for word n-grams;</li>
 *   <li>a label token that exists in the dictionary contributes {@code wid - nwords_} to
 *       {@code labels}. Labels absent from the dictionary ({@code wid < 0}) are ignored, since
 *       they would otherwise produce a negative, invalid label index.</li>
 * </ul>
 * Processing stops after the {@code EOS} token. Finally word n-gram hashes are appended to
 * {@code words} according to {@code args_.wordNgrams}.
 *
 * <p>The vectors are appended to, not cleared.
 *
 * @param line text to process
 * @param words receives word, subword and n-gram ids
 * @param labels receives label indexes relative to the first label
 * @return number of tokens processed (skipped {@code "#"} tokens excluded)
 */
int getLine(String line, IntVector words, IntVector labels) {
  IntVector wordHashes = new IntVector();
  int ntokens = 0;
  List<String> tokens = tokenizer.splitToList(line);
  for (String token : tokens) {
    if (token.startsWith("#")) {
      continue;
    }
    int h = hash(token);
    int wid = getId(token, h);
    // Unknown tokens have no entry, so their type is derived from the text itself.
    int type = wid < 0 ? getType(token) : getType(wid);
    ntokens++;
    if (type == TYPE_WORD) {
      addSubwords(words, token, wid);
      wordHashes.add(h);
    } else if (type == TYPE_LABEL && wid >= 0) {
      // Only labels known to the dictionary map to a valid (non-negative) label index.
      labels.add(wid - nwords_);
    }
    if (token.equals(EOS)) {
      break;
    }
  }
  addWordNgramHashes(words, wordHashes, args_.wordNgrams);
  return ntokens;
}
use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.
the class Dictionary method prune.
/**
 * Shrinks the dictionary to the entries referenced by {@code idx}.
 *
 * <p>Ids below {@code nwords_} are treated as word ids, all others as n-gram ids. The word ids
 * are sorted; the n-gram ids keep their input order and are registered in {@code pruneidx_}
 * as {@code (id - nwords_) -> position}. Afterwards {@code words_} is compacted in place so
 * that only the kept words and all label entries remain, {@code word2int_} is rebuilt for the
 * compacted positions, the size counters are updated and {@code initNgrams()} is called.
 *
 * @param idx ids to keep (word ids and n-gram ids mixed)
 * @return the sorted kept word ids followed by the kept n-gram ids
 */
int[] prune(int[] idx) {
  // Split the requested ids into words and n-grams, using the pre-prune word count.
  IntVector keptWords = new IntVector();
  IntVector keptNgrams = new IntVector();
  for (int p = 0; p < idx.length; p++) {
    int id = idx[p];
    if (id >= nwords_) {
      keptNgrams.add(id);
    } else {
      keptWords.add(id);
    }
  }
  keptWords.sort();
  IntVector result = new IntVector(keptWords.copyOf());

  int ngramCount = keptNgrams.size();
  if (ngramCount > 0) {
    // Map each kept n-gram bucket to its new consecutive slot.
    for (int slot = 0; slot < ngramCount; slot++) {
      pruneidx_.put(keptNgrams.get(slot) - nwords_, slot);
    }
    result.addAll(keptNgrams);
  }
  pruneidx_size_ = pruneidx_.size();

  // Rebuild the hash table while compacting words_ towards the front.
  Arrays.fill(word2int_, -1);
  int write = 0;
  for (int read = 0; read < words_.size(); read++) {
    // Drop entries that are neither labels nor the next kept word.
    if (getType(read) != TYPE_LABEL
        && (write >= keptWords.size() || keptWords.get(write) != read)) {
      continue;
    }
    words_.set(write, words_.get(read));
    word2int_[find(words_.get(write).word)] = write;
    write++;
  }

  nwords_ = keptWords.size();
  size_ = nwords_ + nlabels_;
  words_ = new ArrayList<>(words_.subList(0, size_));
  initNgrams();
  return result.copyOf();
}
Aggregations