Search in sources :

Example 46 with structures._SparseFeature

use of structures._SparseFeature in project IR_Base by Linda-sunshine.

the class Analyzer method setFeatureValues.

// Given the name of a weighting scheme, compute the feature values of every document in the corpus accordingly.
/**
 * Re-weights the sparse features of every document in the corpus, stores each
 * document's average IDF, and optionally normalizes the feature vectors.
 *
 * <p>In the formulas below, {@code value} is the value currently stored in the
 * feature, {@code DF} is {@code Utils.sumOfArray(stat.getDF())} for that
 * feature, and {@code N} is {@code m_TotalDF} when {@code m_isCVStatLoaded} is
 * set, otherwise the number of documents in the corpus. Supported schemes:
 * <ul>
 *   <li>{@code "TFIDF"}: {@code (value / docLength) * ln((N + 1) / DF)}; the
 *       stored value is first copied into the feature via {@code setTF}.</li>
 *   <li>{@code "TFIDF-sublinear"}: {@code (1 + log10(value)) * (1 + log10(N / DF))}.</li>
 *   <li>{@code "BM25"}: {@code IDF * value * (k1 + 1) / (k1 * (1 - b + b * n) + value)}
 *       with {@code IDF = ln((N - DF + 0.5) / (DF + 0.5))}.</li>
 *   <li>{@code "PLN"}: {@code (1 + ln(1 + ln(value))) / (1 - s + s * n) * ln((N + 1) / DF)}.</li>
 * </ul>
 * Here {@code n} is the document length divided by the average document length.
 * Any other scheme name (including {@code null}) leaves the stored values
 * untouched and only computes the average IDF.
 *
 * @param fValue name of the weighting scheme, see above
 * @param norm   1 applies {@code Utils.L1Normalization}, 2 applies
 *               {@code Utils.L2Normalization}; any other value skips normalization
 */
public void setFeatureValues(String fValue, int norm) {
    // Get the collection of all the documents.
    ArrayList<_Doc> docs = m_corpus.getCollection();
    // Corpus size used inside the IDF terms; it may differ from docs.size() when
    // statistics were loaded, so it must never be used to index into docs.
    int N = m_isCVStatLoaded ? m_TotalDF : docs.size();
    // Constant-first equals() lets a null scheme fall through to the raw-count branch.
    if ("TFIDF".equals(fValue)) {
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            double avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                // keep the stored value before it is overwritten below
                sf.setTF(sf.getValue());
                // normalized TF
                double TF = sf.getValue() / temp.getTotalDocLength();
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N + 1) / DF);
                sf.setValue(TF * IDF);
                avgIDF += IDF;
            }
            temp.setAvgIDF(meanIDF(avgIDF, sfs.length));
        }
    } else if ("TFIDF-sublinear".equals(fValue)) {
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            double avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                // sublinear TF
                double TF = 1 + Math.log10(sf.getValue());
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = 1 + Math.log10(N / DF);
                sf.setValue(TF * IDF);
                avgIDF += IDF;
            }
            temp.setAvgIDF(meanIDF(avgIDF, sfs.length));
        }
    } else if ("BM25".equals(fValue)) {
        // term-frequency saturation; conventionally chosen in [1.2, 2]
        double k1 = 1.5;
        // strength of the length normalization; conventionally chosen in [0, 1]
        double b = 0.75;
        double navg = averageDocLength(docs);
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            // document length relative to the average document length
            double n = temp.getTotalDocLength() / navg, avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                double TF = sf.getValue();
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N - DF + 0.5) / (DF + 0.5));
                double BM25 = IDF * TF * (k1 + 1) / (k1 * (1 - b + b * n) + TF);
                sf.setValue(BM25);
                avgIDF += IDF;
            }
            temp.setAvgIDF(meanIDF(avgIDF, sfs.length));
        }
    } else if ("PLN".equals(fValue)) {
        // slope of the pivoted length normalization, in [0, 1]
        double s = 0.5;
        double navg = averageDocLength(docs);
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            // document length relative to the average document length
            double n = temp.getTotalDocLength() / navg, avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                double TF = sf.getValue();
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N + 1) / DF);
                // The whole (1 + ln(1 + ln(tf))) term is divided by the length normalizer.
                double PLN = (1 + Math.log(1 + Math.log(TF))) / (1 - s + s * n) * IDF;
                sf.setValue(PLN);
                avgIDF += IDF;
            }
            temp.setAvgIDF(meanIDF(avgIDF, sfs.length));
        }
    } else {
        System.out.println("No feature value is set, keep the raw count of every feature in setFeatureValues().");
        // the stored feature values are left as they are; only the average IDF is computed
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            double avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                double DF = Utils.sumOfArray(stat.getDF());
                avgIDF += Math.log((N + 1) / DF);
            }
            temp.setAvgIDF(meanIDF(avgIDF, sfs.length));
        }
    }
    if (norm == 1) {
        for (_Doc d : docs) Utils.L1Normalization(d.getSparse());
    } else if (norm == 2) {
        for (_Doc d : docs) Utils.L2Normalization(d.getSparse());
    } else
        System.out.println("No normalizaiton is adopted here or wrong parameters in setFeatureValues()!");
    System.out.format("Text feature generated for %d documents...\n", m_corpus.getSize());
}

// Mean of getTotalDocLength() over the documents actually present (NaN for an empty list, which is then never used).
private static double averageDocLength(ArrayList<_Doc> docs) {
    double total = 0;
    for (_Doc d : docs) total += d.getTotalDocLength();
    return total / docs.size();
}

// Average IDF of a document; 0 instead of NaN when the document has no features.
private static double meanIDF(double idfSum, int featureCount) {
    return featureCount == 0 ? 0 : idfSum / featureCount;
}
Also used : structures._stat(structures._stat) structures._Doc(structures._Doc) structures._SparseFeature(structures._SparseFeature)

Example 47 with structures._SparseFeature

use of structures._SparseFeature in project IR_Base by Linda-sunshine.

the class MTCLRWithMMB method gradientByFunc.

/**
 * Accumulates into {@code g} the gradient contribution of one review, for both
 * the model of the review's cluster and the shared super model.
 *
 * Layout of {@code g} as used here: the cluster with index c starts at
 * {@code m_dim * c}, the super model starts at {@code m_dim * m_kBar}; within
 * each segment slot 0 is the bias and slot {@code featureIndex + 1} belongs to
 * that feature.
 *
 * @param u      adaptation structure of the user (not read by this method)
 * @param review the review; must be a {@code _Review}
 * @param weight multiplier applied to the prediction residual
 * @param g      gradient array that is updated in place
 */
@Override
protected void gradientByFunc(_AdaptStruct u, _Doc review, double weight, double[] g) {
    final _Review rev = (_Review) review;
    final int cluster = rev.getHDPThetaStar().getIndex();
    // An out-of-range cluster index is only reported; execution continues regardless.
    if (!(cluster >= 0 && cluster < m_kBar))
        System.err.println("Error,cannot find the theta star!");

    final int clusterBase = m_dim * cluster;
    final int superBase = m_dim * m_kBar;
    final double residual = weight * (rev.getYLabel() - logit(rev.getSparse(), rev));

    // Bias slots (x0 = 1): cluster model first, then the super model scaled by m_q.
    g[clusterBase] -= residual;
    g[superBase] -= m_q * residual;

    // One pass over the non-zero features updates both segments.
    for (_SparseFeature feature : review.getSparse()) {
        final int slot = feature.getIndex() + 1;
        final double step = residual * feature.getValue();
        g[clusterBase + slot] -= step;
        g[superBase + slot] -= step * m_q;
    }
}
Also used : structures._Review(structures._Review) structures._SparseFeature(structures._SparseFeature)

Example 48 with structures._SparseFeature

use of structures._SparseFeature in project IR_Base by Linda-sunshine.

the class MTCLinAdaptWithMMB method logit.

// The logit function differs from the one in the parent class.
/**
 * Computes the logistic output for a sparse feature vector, using the model of
 * the review's theta star to transform the super weights.
 *
 * For model array A, the bias contributes {@code A[0] * w_s(0) + A[m_dim]};
 * a feature in slot n (feature index + 1) that maps to group k through
 * {@code m_featureGroupMap} contributes
 * {@code (A[k] * w_s(n) + A[m_dim + k]) * value}.
 *
 * @param fvs sparse features to score
 * @param r   review whose theta star supplies the model array
 * @return {@code Utils.logistic} applied to the accumulated sum
 */
@Override
protected double logit(_SparseFeature[] fvs, _Review r) {
    final double[] model = r.getHDPThetaStar().getModel();
    // Bias term: w_s0*a0+b0.
    double activation = model[0] * getSupWeights(0) + model[m_dim];
    for (int i = 0; i < fvs.length; i++) {
        final int slot = fvs[i].getIndex() + 1;
        final int group = m_featureGroupMap[slot];
        final double adaptedWeight = model[group] * getSupWeights(slot) + model[m_dim + group];
        activation += adaptedWeight * fvs[i].getValue();
    }
    return Utils.logistic(activation);
}
Also used : structures._SparseFeature(structures._SparseFeature)

Example 49 with structures._SparseFeature

use of structures._SparseFeature in project IR_Base by Linda-sunshine.

the class _MMBAdaptStruct method evaluate.

/**
 * Combines the per-cluster predictions for a document, accumulating in log
 * space, and records the result on the document.
 *
 * For every entry of the review's cluster posterior a score is computed and
 * {@code posterior[k] + ln(logistic(score))} is merged into the running total
 * with {@code Utils.logSum}. When {@code m_dim == 0} the score is the dot
 * product with the cluster model (plus {@code CLRWithDP.m_q} times the dot
 * product with the super weights, if those are set and {@code m_q} is
 * non-zero); otherwise the cluster model transforms the super weights per
 * feature group.
 *
 * Side effects: increments {@code doc.m_pCount} and adds the exponential of
 * the result to {@code doc.m_prob}.
 *
 * @param doc the document; must be a {@code _Review}
 * @return the accumulated log-space value
 */
@Override
public double evaluate(_Doc doc) {
    final double[] posterior = ((_Review) doc).getCluPosterior();
    final boolean adaptationBased = m_dim != 0;
    double logProb = 0;

    for (int c = 0; c < posterior.length; c++) {
        final double[] model = CLRWithMMB.m_hdpThetaStars[c].getModel();
        double score;
        if (!adaptationBased) {
            // need to be fixed: here we assumed binary classification
            score = Utils.dotProduct(model, doc.getSparse(), 0);
            if (MTCLRWithMMB.m_supWeights != null && CLRWithDP.m_q != 0)
                score += CLRWithDP.m_q * Utils.dotProduct(MTCLRWithMMB.m_supWeights, doc.getSparse(), 0);
        } else {
            // Bias term: w_s0*a0+b0.
            score = model[0] * CLinAdaptWithMMB.m_supWeights[0] + model[m_dim];
            for (_SparseFeature feature : doc.getSparse()) {
                final int slot = feature.getIndex() + 1;
                final int group = m_featureGroupMap[slot];
                score += (model[group] * CLinAdaptWithMMB.m_supWeights[slot] + model[m_dim + group]) * feature.getValue();
            }
        }
        // Stay in log space to maintain numerical precision; the first term seeds the total.
        final double term = posterior[c] + Math.log(Utils.logistic(score));
        logProb = (c == 0) ? term : Utils.logSum(logProb, term);
    }

    // accumulate the prediction results during the sampling procedure
    doc.m_pCount++;
    doc.m_prob += Math.exp(logProb);
    return logProb;
}
Also used : structures._Review(structures._Review) structures._SparseFeature(structures._SparseFeature)

Example 50 with structures._SparseFeature

use of structures._SparseFeature in project IR_Base by Linda-sunshine.

the class MTRegLR method gradientByFunc.

/**
 * Adds the gradient contribution of one review to {@code m_g}, for both the
 * user's own segment and the global segment.
 *
 * Layout of {@code m_g} as used here: each segment has
 * {@code m_featureSize + 1} slots (slot 0 is the bias, slot
 * {@code featureIndex + 1} belongs to that feature); the user's segment is
 * selected by {@code user.getId()} and the global segment follows the
 * segments of all users in {@code m_userList}.
 *
 * @param user   adaptation structure of the user who owns the review
 * @param review the review contributing to the gradient
 * @param weight multiplier applied to the prediction residual
 */
@Override
protected void gradientByFunc(_AdaptStruct user, _Doc review, double weight) {
    final int userBase = (m_featureSize + 1) * user.getId();
    final int globalBase = (m_featureSize + 1) * m_userList.size();

    double residual = weight * (review.getYLabel() - logit(review.getSparse(), user));
    if (m_LNormFlag)
        residual /= getAdaptationSize(user);
    // The global part always receives the residual scaled by m_u.
    final double globalResidual = residual * m_u;

    // Bias slots (x0 = 1).
    m_g[userBase] -= residual;
    m_g[globalBase] -= globalResidual;

    // One pass over the non-zero features updates both segments.
    for (_SparseFeature feature : review.getSparse()) {
        final int slot = feature.getIndex() + 1;
        final double x = feature.getValue();
        m_g[userBase + slot] -= residual * x;
        m_g[globalBase + slot] -= globalResidual * x;
    }
}
Also used : structures._SparseFeature(structures._SparseFeature)

Aggregations

structures._SparseFeature (structures._SparseFeature)94 structures._ChildDoc (structures._ChildDoc)14 structures._Doc (structures._Doc)14 structures._Review (structures._Review)14 HashMap (java.util.HashMap)7 structures._ParentDoc (structures._ParentDoc)7 structures._Stn (structures._Stn)7 Feature (Classifier.supervised.liblinear.Feature)6 FeatureNode (Classifier.supervised.liblinear.FeatureNode)6 structures._RankItem (structures._RankItem)5 File (java.io.File)3 PrintWriter (java.io.PrintWriter)3 Classifier.supervised.modelAdaptation._AdaptStruct (Classifier.supervised.modelAdaptation._AdaptStruct)2 FileNotFoundException (java.io.FileNotFoundException)2 IOException (java.io.IOException)2 ArrayList (java.util.ArrayList)2 Map (java.util.Map)2 Entry (java.util.Map.Entry)2 structures._ChildDoc4BaseWithPhi (structures._ChildDoc4BaseWithPhi)2 structures._HDPThetaStar (structures._HDPThetaStar)2