Search in sources :

Example 6 with IntVector

use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.

the class FastText method textVectors.

/**
 * Builds a single vector from the given paragraph entries.
 *
 * <p>Each entry is converted to an id list via {@code dict_.getLine}, extended with
 * word n-gram hashes, and the matching rows of {@code model_.wi_} are added to the
 * result. Entries that yield no ids are skipped.
 *
 * <p>Note: one accumulator is shared by all entries, and it is multiplied by
 * {@code 1 / idCount} after every non-empty entry, so that factor is also applied to
 * whatever was accumulated for earlier entries.
 *
 * @param paragraph the text entries to process, in order
 * @return the accumulated vector of dimension {@code args_.dim}
 */
Vector textVectors(List<String> paragraph) {
    Vector result = new Vector(args_.dim);
    for (String text : paragraph) {
        IntVector ids = new IntVector();
        dict_.getLine(text, ids, model_.getRng());
        if (ids.size() != 0) {
            dict_.addWordNgramHashes(ids, args_.wordNgrams);
            // Snapshot the ids once, then add the corresponding input rows.
            int[] rows = ids.copyOf();
            for (int r = 0; r < rows.length; r++) {
                result.addRow(model_.wi_, rows[r]);
            }
            float scale = (float) (1.0 / ids.size());
            result.mul(scale);
        }
    }
    return result;
}
Also used : IntVector(zemberek.core.collections.IntVector)

Example 7 with IntVector

use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.

the class FastText method test.

/**
 * Evaluates the model on a UTF-8 text file, one example per line.
 *
 * <p>Each line is split into word ids and label ids by {@code dict_.getLine}. Lines
 * that produce no words or no labels are ignored. For the remaining lines the model
 * is asked for predictions and every prediction whose id is among the line's labels
 * counts as a hit.
 *
 * <p>If no usable example is found, both ratios passed to the result are computed as
 * {@code 0f / 0} and are therefore {@code NaN}.
 *
 * @param in path of the evaluation file
 * @param k value forwarded to {@code model_.predict} and used as the per-example
 *     denominator of the first ratio
 * @param threshold value forwarded to {@code model_.predict}
 * @return an {@link EvaluationResult} built from {@code hits / (k * examples)},
 *     {@code hits / labels}, {@code k} and the number of examples
 * @throws IOException if the file cannot be opened or read
 */
public EvaluationResult test(Path in, int k, float threshold) throws IOException {
    int nexamples = 0, nlabels = 0;
    float precision = 0.0f;
    // try-with-resources guarantees the reader is closed on normal completion and
    // when reading or prediction throws; the original never closed it.
    try (BufferedReader reader = Files.newBufferedReader(in, StandardCharsets.UTF_8)) {
        String lineStr;
        while ((lineStr = reader.readLine()) != null) {
            IntVector words = new IntVector(), labels = new IntVector();
            dict_.getLine(lineStr, words, labels);
            if (labels.size() > 0 && words.size() > 0) {
                List<Model.FloatIntPair> modelPredictions =
                        model_.predict(words.copyOf(), threshold, k);
                for (Model.FloatIntPair pair : modelPredictions) {
                    if (labels.contains(pair.second)) {
                        precision += 1.0f;
                    }
                }
                nexamples++;
                nlabels += labels.size();
            }
        }
    }
    return new EvaluationResult(precision / (k * nexamples), precision / nlabels, k, nexamples);
}
Also used : IntVector(zemberek.core.collections.IntVector) BufferedReader(java.io.BufferedReader)

Example 8 with IntVector

use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.

the class Model method hierarchicalSoftmax.

/**
 * Sums the {@code binaryLogistic} results along the stored path of {@code target}.
 *
 * <p>For every position of the path looked up in {@code hierarchicalSoftmax.paths},
 * {@code binaryLogistic} is called with the node at that position and a flag that is
 * true when the entry of {@code hierarchicalSoftmax.codes} at the same position is 1.
 *
 * @param grad_ forwarded unchanged to {@code binaryLogistic}
 * @param hidden forwarded unchanged to {@code binaryLogistic}
 * @param target index used to look up the code and the path
 * @param lr forwarded unchanged to {@code binaryLogistic}
 * @return the sum of all {@code binaryLogistic} return values (0 for an empty path)
 */
private float hierarchicalSoftmax(Vector grad_, Vector hidden, int target, float lr) {
    IntVector code = hierarchicalSoftmax.codes.get(target);
    IntVector path = hierarchicalSoftmax.paths.get(target);
    float total = 0.0f;
    int step = 0;
    while (step < path.size()) {
        int node = path.get(step);
        boolean bit = code.get(step) == 1;
        total += binaryLogistic(grad_, hidden, node, bit, lr);
        step++;
    }
    return total;
}
Also used : IntVector(zemberek.core.collections.IntVector)

Example 9 with IntVector

use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.

the class Dictionary method getLine.

/**
 * Tokenizes {@code line} and appends the resulting word and label ids to the given
 * vectors.
 *
 * <p>Tokens starting with {@code "#"} are skipped and not counted. For every other
 * token the hash and dictionary id are looked up. Tokens of type {@code TYPE_WORD}
 * contribute their subwords to {@code words}. Tokens of type {@code TYPE_LABEL}
 * contribute {@code wid - nwords_} to {@code labels}, but only when the label is
 * present in the dictionary. Processing stops after a token equal to {@code EOS}.
 * Finally, word n-gram hashes derived from the collected word hashes are appended to
 * {@code words}.
 *
 * <p>Neither vector is cleared first; ids are appended to whatever they contain.
 *
 * @param line the raw text line
 * @param words receives word/subword ids and word n-gram hashes
 * @param labels receives label ids relative to {@code nwords_}
 * @return the number of tokens processed (skipped {@code "#"} tokens excluded)
 */
int getLine(String line, IntVector words, IntVector labels) {
    IntVector wordHashes = new IntVector();
    int ntokens = 0;
    List<String> tokens = tokenizer.splitToList(line);
    for (String token : tokens) {
        if (token.startsWith("#")) {
            continue;
        }
        int h = hash(token);
        int wid = getId(token, h);
        // Unknown tokens (wid < 0) are classified from their text instead of their id.
        int type = wid < 0 ? getType(token) : getType(wid);
        ntokens++;
        if (type == TYPE_WORD) {
            addSubwords(words, token, wid);
            wordHashes.add(h);
        } else if (type == TYPE_LABEL && wid >= 0) {
            // Only known labels are recorded: an unknown label has wid < 0, and
            // wid - nwords_ would then be a negative, invalid label index.
            labels.add(wid - nwords_);
        }
        if (token.equals(EOS)) {
            break;
        }
    }
    addWordNgramHashes(words, wordHashes, args_.wordNgrams);
    return ntokens;
}
Also used : IntVector(zemberek.core.collections.IntVector)

Example 10 with IntVector

use of zemberek.core.collections.IntVector in project zemberek-nlp by ahmetaa.

the class Dictionary method prune.

/**
 * Shrinks the dictionary to the entries referenced by {@code idx} and returns the
 * reordered index list.
 *
 * <p>Indices below {@code nwords_} are treated as words, all others as n-grams. The
 * returned array holds the word indices in ascending order followed by the n-gram
 * indices in their original order. As side effects this method fills
 * {@code pruneidx_}, resets and rebuilds {@code word2int_}, compacts and truncates
 * {@code words_}, updates {@code nwords_}, {@code size_} and {@code pruneidx_size_},
 * and calls {@code initNgrams()}.
 *
 * @param idx indices to keep
 * @return sorted word indices followed by the n-gram indices
 */
int[] prune(int[] idx) {
    IntVector words = new IntVector();
    IntVector ngrams = new IntVector();
    // Split the requested indices into word indices and n-gram indices.
    for (int i : idx) {
        if (i < nwords_) {
            words.add(i);
        } else {
            ngrams.add(i);
        }
    }
    // The compaction pass below walks words_ in ascending order and relies on this sort.
    words.sort();
    IntVector newIndexes = new IntVector(words.copyOf());
    if (ngrams.size() > 0) {
        int j = 0;
        // Map each kept n-gram (offset by the old nwords_) to its position among the kept n-grams.
        for (int k = 0; k < ngrams.size(); k++) {
            int ngram = ngrams.get(k);
            pruneidx_.put(ngram - nwords_, j);
            j++;
        }
        newIndexes.addAll(ngrams);
    }
    pruneidx_size_ = pruneidx_.size();
    // Invalidate every lookup slot; slots for surviving entries are rewritten below.
    Arrays.fill(word2int_, -1);
    int j = 0;
    // Compact words_ in place: keep every label entry and every word whose index was requested.
    // NOTE(review): the words.get(j) == i test presumably assumes labels are stored after
    // all words in words_, since j also advances for labels - confirm.
    for (int i = 0; i < words_.size(); i++) {
        if (getType(i) == TYPE_LABEL || (j < words.size() && words.get(j) == i)) {
            words_.set(j, words_.get(i));
            word2int_[find(words_.get(j).word)] = j;
            j++;
        }
    }
    nwords_ = words.size();
    size_ = nwords_ + nlabels_;
    // Drop the stale tail left behind by the in-place compaction.
    words_ = new ArrayList<>(words_.subList(0, size_));
    initNgrams();
    return newIndexes.copyOf();
}
Also used : IntVector(zemberek.core.collections.IntVector)

Aggregations

IntVector (zemberek.core.collections.IntVector): 21, BufferedReader (java.io.BufferedReader): 3, ArrayList (java.util.ArrayList): 2, ScoredItem (zemberek.core.ScoredItem): 2