Example 11 with structures._SparseFeature

use of structures._SparseFeature in project IR_Base by Linda-sunshine.

the class MultiThreadedLMAnalyzer method estimateGlobalLM.

// Estimate a global language model.
// We traverse all review documents instead of using the global TF
public double[] estimateGlobalLM() {
    double[] lm = new double[getLMFeatureSize()];
    double sum = 0;
    for (_User u : m_users) {
        for (_Review r : u.getReviews()) {
            for (_SparseFeature fv : r.getLMSparse()) {
                lm[fv.getIndex()] += fv.getValue();
                sum += fv.getValue();
            }
        }
    }
    for (int i = 0; i < lm.length; i++) {
        lm[i] /= sum;
        if (lm[i] == 0)
            lm[i] = 0.0001;
    }
    return lm;
}
Also used : structures._Review(structures._Review) structures._User(structures._User) structures._SparseFeature(structures._SparseFeature)
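
Note that the division happens before the zero check, so the returned vector is a smoothed distribution that can sum to slightly more than one. A minimal self-contained sketch of the same normalize-and-floor step on a plain count array (the method name and signature here are illustrative, not part of IR_Base):

// Normalize raw term counts into probabilities, then floor zero entries
// so that later log-likelihood computations never evaluate log(0).
static double[] toSmoothedLM(double[] counts) {
    double sum = 0;
    for (double c : counts)
        sum += c;
    double[] lm = new double[counts.length];
    for (int i = 0; i < counts.length; i++) {
        lm[i] = counts[i] / sum;
        if (lm[i] == 0)
            // same small floor used in estimateGlobalLM above
            lm[i] = 0.0001;
    }
    return lm;
}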

Example 12 with structures._SparseFeature

use of structures._SparseFeature in project IR_Base by Linda-sunshine.

the class VctAnalyzer method LoadDoc.

@Override
public void LoadDoc(String filename) {
    try {
        BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(filename), "UTF-8"));
        String line;
        String[] container, entry;
        _SparseFeature[] spVct;
        _Doc doc;
        int maxFvIndex = 0, index;
        while ((line = reader.readLine()) != null) {
            container = line.split(" ");
            if (container.length <= m_lengthThreshold) // || Math.random() < 0.65
                continue;
            doc = new _Doc(m_corpus.getSize(), null, Integer.valueOf(container[0]));
            if (!line.contains("#"))
                spVct = new _SparseFeature[container.length - 1];
            else
                // exclude the comment
                spVct = new _SparseFeature[container.length - 2];
            for (int i = 1; i < container.length; i++) {
                if (container[i].startsWith("#")) {
                    // parse the comment part for this review
                    entry = container[i].split("-");
                    doc.setItemID(entry[0].substring(1));
                    doc.setName(entry[1]);
                } else {
                    entry = container[i].split(":");
                    // the loaded feature index starts from 1
                    index = Integer.valueOf(entry[0]) - 1;
                    spVct[i - 1] = new _SparseFeature(index, Double.valueOf(entry[1]));
                    if (index > maxFvIndex)
                        maxFvIndex = index;
                }
            }
            doc.setSpVct(spVct);
            m_corpus.addDoc(doc);
            m_classMemberNo[doc.getYLabel()]++;
        }
        reader.close();
        reviseCV(maxFvIndex);
        System.out.format("Loading %d vector files with %d features from %s...\n", m_corpus.getSize(), m_featureNames.size(), filename);
    } catch (IOException e) {
        System.err.format("[Error]Failed to open file %s!!", filename);
    }
}
Also used : InputStreamReader(java.io.InputStreamReader) structures._Doc(structures._Doc) BufferedReader(java.io.BufferedReader) structures._SparseFeature(structures._SparseFeature) IOException(java.io.IOException) FileInputStream(java.io.FileInputStream)
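
Each input line follows a libSVM-style layout: a label, then 1-based index:value pairs, optionally followed by a trailing "#itemID-name" token. A hedged sketch of parsing a single such line into 0-based _SparseFeature entries (the helper name and the example line are assumptions for illustration):

// Parse one vector line such as "1 3:0.5 17:2.0 #B0001-rev42" into
// 0-based sparse features; the trailing #-token is skipped here.
static _SparseFeature[] parseVctLine(String line) {
    String[] container = line.split(" ");
    int size = container.length - (line.contains("#") ? 2 : 1);
    _SparseFeature[] spVct = new _SparseFeature[size];
    int k = 0;
    for (int i = 1; i < container.length; i++) {
        if (container[i].startsWith("#"))
            continue; // the comment token carries itemID and name
        String[] entry = container[i].split(":");
        // loaded feature indices start from 1, so shift to 0-based
        spVct[k++] = new _SparseFeature(Integer.valueOf(entry[0]) - 1, Double.valueOf(entry[1]));
    }
    return spVct;
}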

Example 13 with structures._SparseFeature

use of structures._SparseFeature in project IR_Base by Linda-sunshine.

the class LinearSVMMetricLearning method createLinearFeature_diff.

// Calculate the new sample according to two documents.
// Since cross-product will be symmetric, we don't need to store the whole matrix
Feature[] createLinearFeature_diff(_Doc d1, _Doc d2) {
    _SparseFeature[] fv1 = d1.getProjectedFv(), fv2 = d2.getProjectedFv();
    if (fv1 == null || fv2 == null)
        return null;
    _SparseFeature[] diffVct = Utils.diffVector(fv1, fv2);
    Feature[] features = new Feature[diffVct.length * (diffVct.length + 1) / 2];
    int pi, pj, spIndex = 0;
    double value = 0;
    for (int i = 0; i < diffVct.length; i++) {
        pi = diffVct[i].getIndex();
        for (int j = 0; j < i; j++) {
            pj = diffVct[j].getIndex();
            // Currently, we use one dimension array to represent V*V features
            // this might be too small to count
            value = 2 * diffVct[i].getValue() * diffVct[j].getValue();
            features[spIndex++] = new FeatureNode(getIndex(pi, pj), value);
        }
        // this might be too small to count
        value = diffVct[i].getValue() * diffVct[i].getValue();
        features[spIndex++] = new FeatureNode(getIndex(pi, pi), value);
    }
    return features;
}
Also used : FeatureNode(Classifier.supervised.liblinear.FeatureNode) structures._SparseFeature(structures._SparseFeature) Feature(Classifier.supervised.liblinear.Feature)
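
getIndex(pi, pj) is not shown in this excerpt; since the cross-product is symmetric, it presumably maps an unordered index pair onto a single slot of a flattened triangular matrix. A hedged sketch of one common way to do that (an assumption about its behavior, not the project's actual implementation):

// Map an unordered pair (i, j) onto a unique 1-based position in a
// flattened upper-triangular matrix; liblinear feature indices start at 1.
static int triangularIndex(int i, int j) {
    int hi = Math.max(i, j), lo = Math.min(i, j);
    return hi * (hi + 1) / 2 + lo + 1;
}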

Example 14 with structures._SparseFeature

use of structures._SparseFeature in project IR_Base by Linda-sunshine.

the class NaiveBayesEM method MStep.

void MStep(Collection<_Doc> trainSet, int iter) {
    super.init();
    for (_Doc doc : trainSet) {
        if (doc.getSourceType() == 2) {
            // labeled data
            int label = doc.getYLabel();
            m_pY[label]++;
            for (_SparseFeature sf : doc.getSparse()) m_Pxy[label][sf.getIndex()] += m_presence ? 1.0 : sf.getValue();
        } else if (iter > 0 && doc.getSourceType() == 1) {
            // unlabeled data
            double[] label = doc.m_sstat;
            for (int i = 0; i < m_classNo; i++) {
                m_pY[i] += label[i];
                for (_SparseFeature sf : doc.getSparse()) m_Pxy[i][sf.getIndex()] += (m_presence ? 1.0 : sf.getValue()) * label[i];
            }
        }
    }
    // normalization
    double sumY = Math.log(Utils.sumOfArray(m_pY) + m_deltaY * m_classNo);
    for (int i = 0; i < m_classNo; i++) {
        m_pY[i] = Math.log(m_pY[i] + m_deltaY) - sumY;
        double sumX = Math.log(Utils.sumOfArray(m_Pxy[i]) + m_featureSize * m_deltaXY);
        for (int j = 0; j < m_featureSize; j++) m_Pxy[i][j] = Math.log(m_deltaXY + m_Pxy[i][j]) - sumX;
    }
}
Also used : structures._Doc(structures._Doc) structures._SparseFeature(structures._SparseFeature)
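
The normalization block at the end turns the accumulated counts into log probabilities with additive smoothing (priors m_deltaY and m_deltaXY). A minimal standalone sketch of the class-prior part (names here are illustrative, not from IR_Base):

// Convert class counts into log prior probabilities with additive smoothing,
// mirroring the m_pY normalization at the end of MStep.
static double[] toLogPrior(double[] classCounts, double deltaY) {
    double sum = 0;
    for (double c : classCounts)
        sum += c;
    double logSum = Math.log(sum + deltaY * classCounts.length);
    double[] logPrior = new double[classCounts.length];
    for (int i = 0; i < classCounts.length; i++)
        logPrior[i] = Math.log(classCounts[i] + deltaY) - logSum;
    return logPrior;
}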

Example 15 with structures._SparseFeature

use of structures._SparseFeature in project IR_Base by Linda-sunshine.

the class NaiveBayes method train.

// Train the data set.
@Override
public double train(Collection<_Doc> trainSet) {
    init();
    for (_Doc doc : trainSet) {
        int label = doc.getYLabel();
        m_pY[label]++;
        for (_SparseFeature sf : doc.getSparse()) m_Pxy[label][sf.getIndex()] += m_presence ? 1.0 : sf.getValue();
    }
    // normalization
    for (int i = 0; i < m_classNo; i++) {
        // up to a constant since normalization of this is not important
        m_pY[i] = Math.log(m_pY[i] + m_deltaY);
        double sum = Math.log(Utils.sumOfArray(m_Pxy[i]) + m_featureSize * m_deltaXY);
        for (int j = 0; j < m_featureSize; j++) m_Pxy[i][j] = Math.log(m_deltaXY + m_Pxy[i][j]) - sum;
    }
    // we should compute the log-likelihood
    return 0;
}
Also used : structures._Doc(structures._Doc) structures._SparseFeature(structures._SparseFeature)
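
After training, the learned log probabilities are typically combined at prediction time by adding the class prior to a feature-weighted sum of log likelihoods. A hedged sketch of such a scoring step (illustrative only; the project's actual prediction method may differ):

// Score a document for one class under the trained model: log prior plus
// the (presence- or value-weighted) sum of per-feature log likelihoods.
double score(_Doc doc, int label) {
    double logProb = m_pY[label];
    for (_SparseFeature sf : doc.getSparse())
        logProb += (m_presence ? 1.0 : sf.getValue()) * m_Pxy[label][sf.getIndex()];
    return logProb;
}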

Aggregations

structures._SparseFeature (structures._SparseFeature) 94
structures._ChildDoc (structures._ChildDoc) 14
structures._Doc (structures._Doc) 14
structures._Review (structures._Review) 14
HashMap (java.util.HashMap) 7
structures._ParentDoc (structures._ParentDoc) 7
structures._Stn (structures._Stn) 7
Feature (Classifier.supervised.liblinear.Feature) 6
FeatureNode (Classifier.supervised.liblinear.FeatureNode) 6
structures._RankItem (structures._RankItem) 5
File (java.io.File) 3
PrintWriter (java.io.PrintWriter) 3
Classifier.supervised.modelAdaptation._AdaptStruct (Classifier.supervised.modelAdaptation._AdaptStruct) 2
FileNotFoundException (java.io.FileNotFoundException) 2
IOException (java.io.IOException) 2
ArrayList (java.util.ArrayList) 2
Map (java.util.Map) 2
Entry (java.util.Map.Entry) 2
structures._ChildDoc4BaseWithPhi (structures._ChildDoc4BaseWithPhi) 2
structures._HDPThetaStar (structures._HDPThetaStar) 2