Search in sources :

Example 16 with IntVector

use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.

the class FastText method predict.

/**
 * Predicts up to {@code k} labels for a single line of text.
 *
 * <p>The line is tokenized through the dictionary, word n-gram hashes are appended to the
 * token ids, and the model is then asked for its predictions on the resulting hidden vector.
 *
 * @param line the input text
 * @param k the prediction count passed on to the model
 * @return the predicted labels paired with their scores, or an empty list when the
 *     dictionary produces no input ids for {@code line}
 */
List<ScoredItem<String>> predict(String line, int k) {
    IntVector inputIds = new IntVector();
    IntVector labelIds = new IntVector();
    dict_.getLine(line, inputIds, labelIds, model_.getRng());
    dict_.addWordNgramHashes(inputIds, args_.wordNgrams);
    if (inputIds.isempty()) {
        return Collections.emptyList();
    }

    Vector outputLayer = new Vector(dict_.nlabels());
    Vector hiddenLayer = model_.computeHidden(inputIds.copyOf());
    List<Model.FloatIntPair> scored = model_.predict(k, hiddenLayer, outputLayer);

    // Translate each (score, label index) pair into (label string, score).
    List<ScoredItem<String>> labeled = new ArrayList<>(scored.size());
    for (Model.FloatIntPair scoreAndIndex : scored) {
        String label = dict_.getLabel(scoreAndIndex.second);
        labeled.add(new ScoredItem<>(label, scoreAndIndex.first));
    }
    return labeled;
}
Also used : IntVector(zemberek.core.collections.IntVector) ScoredItem(zemberek.core.ScoredItem) ArrayList(java.util.ArrayList) IntVector(zemberek.core.collections.IntVector)

Example 17 with IntVector

use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.

the class Model method buildTree.

/**
 * Builds the binary tree over the {@code osz_} output entries and records, for every
 * entry, the path of internal nodes up to the root together with the branch bit taken at
 * each step.
 *
 * <p>The first {@code osz_} slots of {@code tree} are the leaves (one per entry of
 * {@code counts}); the remaining {@code osz_ - 1} slots are internal nodes created by
 * repeatedly merging two nodes.
 *
 * <p>NOTE(review): this looks like a Huffman-tree construction, which would require
 * {@code counts} to be sorted in non-increasing order so that scanning leaves from the end
 * yields the smallest counts first. TODO confirm against the caller.
 *
 * @param counts per-entry counts; indices {@code 0 .. osz_ - 1} are read
 */
private void buildTree(long[] counts) {
    tree = new Node[2 * osz_ - 1];
    // Initialise every node as detached, with a very large placeholder count.
    for (int i = 0; i < 2 * osz_ - 1; i++) {
        tree[i] = new Node();
        tree[i].parent = -1;
        tree[i].left = -1;
        tree[i].right = -1;
        tree[i].count = (long) 1e15;
        tree[i].binary = false;
    }
    // Leaves take their counts from the input array.
    for (int i = 0; i < osz_; i++) {
        tree[i].count = counts[i];
    }
    // Two cursors: 'leaf' walks the leaves backwards from the last one,
    // 'node' walks the internal nodes forwards from the first one.
    int leaf = osz_ - 1;
    int node = osz_;
    for (int i = osz_; i < 2 * osz_ - 1; i++) {
        // Pick two children for internal node i: at each pick take the leaf cursor if a leaf
        // remains and its count is strictly smaller than the count at the node cursor,
        // otherwise take the node cursor.
        int[] mini = new int[2];
        for (int j = 0; j < 2; j++) {
            if (leaf >= 0 && tree[leaf].count < tree[node].count) {
                mini[j] = leaf--;
            } else {
                mini[j] = node++;
            }
        }
        tree[i].left = mini[0];
        tree[i].right = mini[1];
        tree[i].count = tree[mini[0]].count + tree[mini[1]].count;
        tree[mini[0]].parent = i;
        tree[mini[1]].parent = i;
        // The second-picked (right) child is the one flagged as binary = true.
        tree[mini[1]].binary = true;
    }
    // For each leaf, climb to the root (the node whose parent is -1) and record the route.
    for (int i = 0; i < osz_; i++) {
        IntVector path = new IntVector();
        IntVector code = new IntVector();
        int j = i;
        while (tree[j].parent != -1) {
            // Internal nodes are stored as offsets from osz_, i.e. 0-based internal-node index.
            path.add(tree[j].parent - osz_);
            // 1 when the current node carries the binary flag, 0 otherwise.
            code.add(tree[j].binary ? 1 : 0);
            j = tree[j].parent;
        }
        paths.add(path);
        codes.add(code);
    }
}
Also used : IntVector(zemberek.core.collections.IntVector)

Example 18 with IntVector

use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.

the class FastText method test.

/**
 * Runs the model over every line of a UTF-8 text file and logs the predictions against the
 * labels found on each line.
 *
 * <p>Lines for which the dictionary yields no labels or no input ids are skipped.
 *
 * @param in path of the file to read, decoded as UTF-8
 * @param k prediction count passed on to the model
 * @param threshold threshold passed on to the model
 * @param meter receives the labels and model predictions for each evaluated line
 * @throws IOException if the file cannot be opened or read
 */
public void test(Path in, int k, float threshold, Meter meter) throws IOException {
    // try-with-resources: the reader is closed on normal completion and also when reading,
    // prediction or logging throws (the original never closed it).
    try (BufferedReader reader = Files.newBufferedReader(in, StandardCharsets.UTF_8)) {
        String lineStr;
        while ((lineStr = reader.readLine()) != null) {
            IntVector words = new IntVector();
            IntVector labels = new IntVector();
            dict_.getLine(lineStr, words, labels);
            if (labels.size() > 0 && words.size() > 0) {
                List<Model.FloatIntPair> modelPredictions =
                        model_.predict(words.copyOf(), threshold, k);
                meter.log(labels, modelPredictions);
            }
        }
    }
}
Also used : IntVector(zemberek.core.collections.IntVector) BufferedReader(java.io.BufferedReader)

Example 19 with IntVector

use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.

the class FastText method getSentenceVector.

/**
 * Computes a vector for a piece of text.
 *
 * <p>For a supervised model, the input vectors of all ids the dictionary produces for the
 * text are summed and divided by the number of ids. Otherwise the text is split on
 * whitespace, each non-empty token's word vector is scaled to unit norm (tokens whose vector
 * has a non-positive norm are left out), and the scaled vectors are averaged.
 *
 * @param s the input text
 * @return a vector of dimension {@code args_.dim}; it is returned as constructed when
 *     nothing contributes to the average
 */
Vector getSentenceVector(String s) {
    Vector sentence = new Vector(args_.dim);

    if (args_.model == model_name.supervised) {
        IntVector ids = new IntVector();
        IntVector labels = new IntVector();
        dict_.getLine(s, ids, labels);
        for (int idx = 0; idx < ids.size(); idx++) {
            addInputVector(sentence, ids.get(idx));
        }
        if (!ids.isempty()) {
            sentence.mul(1.0f / ids.size());
        }
        return sentence;
    }

    int used = 0;
    for (String word : s.split("\\s+")) {
        if (word.length() != 0) {
            Vector wordVector = getWordVector(word);
            float length = wordVector.norm();
            // Only vectors with a strictly positive norm can be normalised and counted.
            if (length > 0) {
                wordVector.mul(1.0f / length);
                sentence.addVector(wordVector);
                used++;
            }
        }
    }
    if (used > 0) {
        sentence.mul(1.0f / used);
    }
    return sentence;
}
Also used : IntVector(zemberek.core.collections.IntVector) IntVector(zemberek.core.collections.IntVector)

Example 20 with IntVector

use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.

the class FastText method predict.

/**
 * Predicts labels for a single line of text.
 *
 * @param line the input text
 * @param k the prediction count passed on to the model
 * @param threshold the threshold passed on to the model
 * @return the predicted labels paired with their scores, or an empty list when the
 *     dictionary produces no input ids for {@code line}
 */
public List<ScoredItem<String>> predict(String line, int k, float threshold) {
    IntVector inputIds = new IntVector();
    IntVector labelIds = new IntVector();
    dict_.getLine(line, inputIds, labelIds);
    if (inputIds.isempty()) {
        return Collections.emptyList();
    }

    List<Model.FloatIntPair> scored = model_.predict(inputIds.copyOf(), threshold, k);

    // Translate each (score, label index) pair into (label string, score).
    List<ScoredItem<String>> labeled = new ArrayList<>(scored.size());
    for (Model.FloatIntPair scoreAndIndex : scored) {
        String label = dict_.getLabel(scoreAndIndex.second);
        labeled.add(new ScoredItem<>(label, scoreAndIndex.first));
    }
    return labeled;
}
Also used : IntVector(zemberek.core.collections.IntVector) ScoredItem(zemberek.core.ScoredItem) ArrayList(java.util.ArrayList)

Aggregations

IntVector (zemberek.core.collections.IntVector): 21, BufferedReader (java.io.BufferedReader): 3, ArrayList (java.util.ArrayList): 2, ScoredItem (zemberek.core.ScoredItem): 2