Search in sources :

Example 1 with structures._stat

use of structures._stat in project IR_Base by Linda-sunshine.

the class AspectAnalyzer method expandKeywordsByChi.

/**
 * Assigns every feature in {@code m_featureStat} to at most one aspect by
 * chi-square score and then asks each aspect to expand its keyword list.
 *
 * For each feature, aspects are scanned in index order; an aspect becomes
 * the current selection only when its chi-square value exceeds
 * {@code ratio} times the best value recorded so far for that feature.
 *
 * @param ratio multiplier applied to the current best chi-square value
 *              before comparing a new aspect against it
 * @return {@code true} if {@code expandKeywords()} returned {@code true}
 *         for at least one aspect
 */
boolean expandKeywordsByChi(double ratio) {
    final int aspectCount = m_aspects.size();
    final int totalCount = Utils.sumOfArray(m_aspectDist);

    // pick an aspect for every known feature
    for (Map.Entry<String, _stat> featureEntry : m_featureStat.entrySet()) {
        int[] dfPerAspect = featureEntry.getValue().getDF();
        int featureDF = Utils.sumOfArray(dfPerAspect);

        double bestChi = 0.0;
        int bestAspect = -1;
        for (int a = 0; a < aspectCount; a++) {
            double chi = Utils.ChiSquare(totalCount, featureDF, dfPerAspect[a], m_aspectDist[a]);
            if (chi > ratio * bestChi) {
                bestChi = chi;
                bestAspect = a;
            }
        }

        if (bestAspect < 0) {
            // no aspect passed the threshold for this feature
            continue;
        }
        // register the feature as a candidate keyword of the chosen aspect
        m_aspects.get(bestAspect).addCandidateKeyword(m_featureNameIndex.get(featureEntry.getKey()), bestChi);
    }

    // let every aspect expand; all aspects are visited even after one reports a change
    boolean extended = false;
    for (int a = 0; a < aspectCount; a++) {
        if (m_aspects.get(a).expandKeywords()) {
            extended = true;
        }
    }
    return extended;
}
Also used : structures._stat(structures._stat) HashMap(java.util.HashMap) Map(java.util.Map)

Example 2 with structures._stat

use of structures._stat in project IR_Base by Linda-sunshine.

the class FeatureSelector method IG.

/**
 * Feature selection by information gain (IG).
 *
 * For every feature t whose total document frequency lies strictly between
 * {@code m_minDF} and {@code m_maxDF}, computes
 * <pre>
 *   G(t) = - sum_i P(ci) log P(ci)
 *          + P(t)  * sum_i P(ci|t)  log P(ci|t)
 *          + P(~t) * sum_i P(ci|~t) log P(ci|~t)
 * </pre>
 * and appends a {@code _RankItem(feature, G(t))} to {@code m_selectedFeatures},
 * which is cleared first.
 *
 * @param featureStat   per-feature statistics; {@code getDF()} is read as the
 *                      per-class document frequency of the feature
 * @param classMemberNo number of documents in each class
 */
public void IG(HashMap<String, _stat> featureStat, int[] classMemberNo) {
    m_selectedFeatures.clear();
    final int classNo = classMemberNo.length;
    final double classMemberSum = Utils.sumOfArray(classMemberNo);

    // Term I: class entropy, - sum_i P(ci) log P(ci)
    double[] PrCi = new double[classNo];
    double PrCiSum = 0;
    for (int i = 0; i < classNo; i++) {
        PrCi[i] = classMemberNo[i] / classMemberSum;
        if (PrCi[i] != 0) {
            PrCiSum -= PrCi[i] * Math.log(PrCi[i]);
        }
    }

    for (String f : featureStat.keySet()) {
        _stat temp = featureStat.get(f);
        int[] DF = temp.getDF();
        int sumDF = Utils.sumOfArray(DF);
        // Filter the features whose DF falls outside (m_minDF, m_maxDF).
        if (sumDF <= m_minDF || sumDF >= m_maxDF) {
            continue;
        }

        double Prt = sumDF / classMemberSum;
        double PrtNot = 1 - Prt;
        double PrCitSum = 0, PrCitNotSum = 0;
        for (int i = 0; i < classNo; i++) {
            if (classMemberNo[i] == 0) {
                // An empty class has P(ci) = 0 and contributes nothing; skipping it
                // also avoids the 0/0 = NaN the division below would produce.
                continue;
            }
            // Term II: P(ci|t) = P(t|ci) P(ci) / P(t)
            if (Prt > 0) {
                double PrCit = ((double) DF[i] / classMemberNo[i]) * PrCi[i] / Prt;
                if (PrCit > 0) {
                    PrCitSum += PrCit * Math.log(PrCit);
                }
            }
            // Term III: P(ci|~t) = P(~t|ci) P(ci) / P(~t); undefined when the
            // feature occurs in every document (P(~t) = 0), so it is skipped then.
            if (PrtNot > 0) {
                double PrCitNot = ((double) (classMemberNo[i] - DF[i]) / classMemberNo[i]) * PrCi[i] / PrtNot;
                if (PrCitNot > 0) {
                    // log of the same probability that weights it (was log(PrCi[i]))
                    PrCitNotSum += PrCitNot * Math.log(PrCitNot);
                }
            }
        }

        // The conditional sums are weighted by P(t) and P(~t) as in the IG definition.
        double Gt = PrCiSum + Prt * PrCitSum + PrtNot * PrCitNotSum;
        m_selectedFeatures.add(new _RankItem(f, Gt));
    }
}
Also used : structures._stat(structures._stat) structures._RankItem(structures._RankItem)

Example 3 with structures._stat

use of structures._stat in project IR_Base by Linda-sunshine.

the class FeatureSelector method MI.

/**
 * Feature selection by (class-averaged) mutual information.
 *
 * For every feature t whose total document frequency lies strictly between
 * {@code m_minDF} and {@code m_maxDF}, computes
 * <pre>
 *   I(t, ci) = log( A_i * N / (n_i * DF_t) )
 *   Iavg(t)  = sum_i P(ci) * I(t, ci)
 * </pre>
 * where A_i is the feature's document frequency in class i, n_i the size of
 * class i, DF_t the feature's total document frequency and N the total number
 * of documents. A {@code _RankItem(feature, Iavg)} is appended to
 * {@code m_selectedFeatures}, which is cleared first.
 *
 * NOTE: when a feature never occurs in a non-empty class (A_i = 0) the
 * logarithm is -Infinity and so is the resulting score.
 *
 * @param featureStat   per-feature statistics; {@code getDF()} is read as the
 *                      per-class document frequency of the feature
 * @param classMemberNo number of documents in each class
 */
public void MI(HashMap<String, _stat> featureStat, int[] classMemberNo) {
    m_selectedFeatures.clear();
    final int classNo = classMemberNo.length;
    final double N = Utils.sumOfArray(classMemberNo);

    double[] PrCi = new double[classNo];
    for (int i = 0; i < classNo; i++) {
        PrCi[i] = classMemberNo[i] / N;
    }

    for (String f : featureStat.keySet()) {
        int[] DF = featureStat.get(f).getDF();
        int sumDF = Utils.sumOfArray(DF);
        // Filter the features whose DF falls outside (m_minDF, m_maxDF).
        if (sumDF > m_minDF && sumDF < m_maxDF) {
            double Iavg = 0;
            for (int i = 0; i < classNo; i++) {
                if (classMemberNo[i] == 0) {
                    // Empty class: weight P(ci) is 0, and the ratio would be 0/0 = NaN.
                    continue;
                }
                double A = DF[i];
                // Parenthesised denominator: the original "A * N / n_i * DF_t"
                // multiplied by DF_t instead of dividing by it. The cast keeps
                // the product out of int arithmetic.
                double ItCi = Math.log(A * N / ((double) classMemberNo[i] * sumDF));
                Iavg += ItCi * PrCi[i];
            }
            m_selectedFeatures.add(new _RankItem(f, Iavg));
        }
    }
}
Also used : structures._stat(structures._stat) structures._RankItem(structures._RankItem)

Example 4 with structures._stat

use of structures._stat in project IR_Base by Linda-sunshine.

the class Analyzer method setFeatureValues.

/**
 * Re-weights the sparse features of every document in the corpus and then
 * optionally normalizes each document vector.
 *
 * Supported weighting options ({@code fValue}):
 * <ul>
 *   <li>{@code "TFIDF"}: (value / document length) * ln((N + 1) / DF); the raw
 *       value is first stored through {@code setTF}</li>
 *   <li>{@code "TFIDF-sublinear"}: (1 + log10(value)) * (1 + log10(N / DF))</li>
 *   <li>{@code "BM25"}: BM25 with k1 = 1.5 and b = 0.75</li>
 *   <li>{@code "PLN"}: pivoted length normalization with s = 0.5</li>
 *   <li>anything else (including {@code null}): feature values are left
 *       untouched</li>
 * </ul>
 * In every case the average IDF over a document's features is stored through
 * {@code setAvgIDF}. N is {@code m_TotalDF} when {@code m_isCVStatLoaded} is
 * set, otherwise the number of documents in the corpus.
 *
 * @param fValue name of the weighting scheme
 * @param norm   1 for L1 normalization, 2 for L2 normalization, anything else
 *               for none
 */
public void setFeatureValues(String fValue, int norm) {
    // Get the collection of all the documents.
    ArrayList<_Doc> docs = m_corpus.getCollection();
    int N = m_isCVStatLoaded ? m_TotalDF : docs.size();
    // Constant-first equals() so a null option falls through to the raw-count branch
    // instead of throwing a NullPointerException.
    if ("TFIDF".equals(fValue)) {
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            double avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                // keep the raw value before it is overwritten below
                sf.setTF(sf.getValue());
                // normalized TF
                double TF = sf.getValue() / temp.getTotalDocLength();
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N + 1) / DF);
                double TFIDF = TF * IDF;
                sf.setValue(TFIDF);
                avgIDF += IDF;
            }
            // compute average IDF
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    } else if ("TFIDF-sublinear".equals(fValue)) {
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            double avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                // sublinear TF
                double TF = 1 + Math.log10(sf.getValue());
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = 1 + Math.log10(N / DF);
                double TFIDF = TF * IDF;
                sf.setValue(TFIDF);
                avgIDF += IDF;
            }
            // compute average IDF
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    } else if ("BM25".equals(fValue)) {
        // [1.2, 2]
        double k1 = 1.5;
        // (0, 1000]
        double b = 0.75;
        // Average document length over the documents actually in the corpus.
        double navg = averageDocLength(docs);
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            // document length relative to the average
            double n = temp.getTotalDocLength() / navg, avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                double TF = sf.getValue();
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N - DF + 0.5) / (DF + 0.5));
                double BM25 = IDF * TF * (k1 + 1) / (k1 * (1 - b + b * n) + TF);
                sf.setValue(BM25);
                avgIDF += IDF;
            }
            // compute average IDF
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    } else if ("PLN".equals(fValue)) {
        // [0, 1]
        double s = 0.5;
        // Average document length over the documents actually in the corpus.
        double navg = averageDocLength(docs);
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            // document length relative to the average
            double n = temp.getTotalDocLength() / navg, avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                double TF = sf.getValue();
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N + 1) / DF);
                double PLN = (1 + Math.log(1 + Math.log(TF)) / (1 - s + s * n)) * IDF;
                sf.setValue(PLN);
                avgIDF += IDF;
            }
            // compute average IDF
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    } else {
        System.out.println("No feature value is set, keep the raw count of every feature in setFeatureValues().");
        // the original feature is raw TF; only the average IDF is computed
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            double avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N + 1) / DF);
                avgIDF += IDF;
            }
            // compute average IDF
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    }
    if (norm == 1) {
        for (_Doc d : docs) Utils.L1Normalization(d.getSparse());
    } else if (norm == 2) {
        for (_Doc d : docs) Utils.L2Normalization(d.getSparse());
    } else
        System.out.println("No normalizaiton is adopted here or wrong parameters in setFeatureValues()!");
    System.out.format("Text feature generated for %d documents...\n", m_corpus.getSize());
}

// Mean of getTotalDocLength() over the given documents. Uses docs.size() rather than
// the IDF population N: N may come from m_TotalDF, and indexing docs up to that value
// can run past the end of the list or cover only a prefix of it.
private double averageDocLength(ArrayList<_Doc> docs) {
    double total = 0;
    for (_Doc d : docs) {
        total += d.getTotalDocLength();
    }
    return total / docs.size();
}
Also used : structures._stat(structures._stat) structures._Doc(structures._Doc) structures._SparseFeature(structures._SparseFeature)

Example 5 with structures._stat

use of structures._stat in project IR_Base by Linda-sunshine.

the class Analyzer method rollBack.

/**
 * Undoes the statistics update made for one document.
 *
 * When no controlled vocabulary is loaded ({@code !m_isCVLoaded}), a feature
 * whose total DF is 1 is removed from {@code m_featureNameIndex},
 * {@code m_featureStat} and {@code m_featureNames}; any other feature has its
 * DF decremented by one and its TTF decremented by the value recorded in
 * {@code spVct} for class {@code y}.
 *
 * When a vocabulary is loaded, nothing happens if its statistics were loaded
 * too ({@code m_isCVStatLoaded}); otherwise DF and TTF are decremented for
 * every index in {@code spVct}.
 *
 * @param spVct feature index mapped to the feature's value in the document
 * @param y     class label of the document being rolled back
 */
void rollBack(HashMap<Integer, Double> spVct, int y) {
    if (!m_isCVLoaded) {
        // Visit indices from largest to smallest: m_featureNames.remove(int) shifts every
        // later element down, so removing a higher position first leaves the lower
        // positions still to be processed untouched.
        // NOTE(review): assumes features with DF == 1 were appended by this very
        // document and therefore sit at the tail of m_featureNames -- confirm with caller.
        for (int index : new java.util.TreeSet<Integer>(spVct.keySet()).descendingSet()) {
            // Bounds check on the position. The original contains(index) compared an
            // Integer against the stored names and so could never be true.
            if (index < 0 || index >= m_featureNames.size()) {
                continue;
            }
            String token = m_featureNames.get(index);
            _stat stat = m_featureStat.get(token);
            if (Utils.sumOfArray(stat.getDF()) == 1) {
                // The feature only occurs in this document: drop it entirely.
                // m_featureNameIndex is looked up by feature name, so remove by token.
                m_featureNameIndex.remove(token);
                m_featureStat.remove(token);
                m_featureNames.remove(index);
            } else {
                // The feature also occurs elsewhere: just take this document's counts back.
                stat.minusOneDF(y);
                stat.minusNTTF(y, spVct.get(index));
            }
        }
    } else {
        // If CV is loaded and CV's statistics are loaded from file, no need to change it
        if (m_isCVStatLoaded)
            return;
        // otherwise, we can minus the DF and TTF directly.
        for (int index : spVct.keySet()) {
            String token = m_featureNames.get(index);
            _stat stat = m_featureStat.get(token);
            stat.minusOneDF(y);
            stat.minusNTTF(y, spVct.get(index));
        }
    }
}
Also used : structures._stat(structures._stat)

Aggregations

structures._stat (structures._stat): 9, structures._RankItem (structures._RankItem): 3, File (java.io.File): 1, FileNotFoundException (java.io.FileNotFoundException): 1, PrintWriter (java.io.PrintWriter): 1, ArrayList (java.util.ArrayList): 1, HashMap (java.util.HashMap): 1, Map (java.util.Map): 1, structures._Doc (structures._Doc): 1, structures._SparseFeature (structures._SparseFeature): 1