Use of structures._stat in project IR_Base by Linda-sunshine.
Class AspectAnalyzer, method expandKeywordsByChi.
boolean expandKeywordsByChi(double ratio) {
    int selID = -1, aspectSize = m_aspects.size(), N = Utils.sumOfArray(m_aspectDist), DF;
    double maxChi, chiV;
    int[] DFarray;
    _Aspect aspect;
    Iterator<Map.Entry<String, _stat>> it = m_featureStat.entrySet().iterator();
    while (it.hasNext()) {
        // set aspect assignment for each word
        Map.Entry<String, _stat> entry = it.next();
        _stat temp = entry.getValue();
        DFarray = temp.getDF();
        DF = Utils.sumOfArray(DFarray);
        maxChi = 0.0;
        selID = -1;
        for (int i = 0; i < aspectSize; i++) {
            chiV = Utils.ChiSquare(N, DF, DFarray[i], m_aspectDist[i]);
            if (chiV > ratio * maxChi) {
                maxChi = chiV;
                selID = i;
            }
        }
        if (selID > -1) {
            // expand candidate keyword list in the selected aspect
            aspect = m_aspects.get(selID);
            aspect.addCandidateKeyword(m_featureNameIndex.get(entry.getKey()), maxChi);
        }
    }
    boolean extended = false;
    for (int i = 0; i < aspectSize; i++) {
        // expand each aspect accordingly
        aspect = m_aspects.get(i);
        extended |= aspect.expandKeywords();
    }
    return extended;
}
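Utils.ChiSquare(N, DF, DFarray[i], m_aspectDist[i]) scores how strongly a word is associated with aspect i. As a rough sketch of what such a statistic computes — the helper below is hypothetical and the actual Utils.ChiSquare in IR_Base may be implemented differently — a chi-square over the 2x2 word/aspect contingency table looks like:

// Hedged sketch: chi-square over the 2x2 word/aspect contingency table.
// chiSquare() is a hypothetical stand-in, not the IR_Base implementation.
static double chiSquare(int N, int DF, int dfInAspect, int aspectSize) {
    double A = dfInAspect;     // docs in the aspect that contain the word
    double B = DF - A;         // docs outside the aspect that contain the word
    double C = aspectSize - A; // docs in the aspect without the word
    double D = N - DF - C;     // docs outside the aspect without the word
    double denom = (A + B) * (C + D) * (A + C) * (B + D);
    return denom == 0 ? 0 : N * (A * D - B * C) * (A * D - B * C) / denom;
}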
Use of structures._stat in project IR_Base by Linda-sunshine.
Class FeatureSelector, method IG.
// Feature Selection -- IG.
public void IG(HashMap<String, _stat> featureStat, int[] classMemberNo) {
    m_selectedFeatures.clear();
    double classMemberSum = Utils.sumOfArray(classMemberNo);
    double[] PrCi = new double[classMemberNo.length];     // Pr(c_i)
    double[] PrCit = new double[classMemberNo.length];    // Pr(c_i | t)
    double[] PrCitNot = new double[classMemberNo.length]; // Pr(c_i | ~t)
    double Prt = 0, PrtNot = 0;
    double Gt = 0; // IG score of the current term
    double PrCiSum = 0, PrCitSum = 0, PrCitNotSum = 0;
    // -\sum_i Pr(c_i) * log Pr(c_i)
    for (int i = 0; i < classMemberNo.length; i++) {
        PrCi[i] = classMemberNo[i] / classMemberSum;
        if (PrCi[i] != 0) {
            PrCiSum -= PrCi[i] * Math.log(PrCi[i]);
        }
    }
    for (String f : featureStat.keySet()) {
        // Filter out features whose total DF falls outside (m_minDF, m_maxDF).
        int sumDF = Utils.sumOfArray(featureStat.get(f).getDF());
        if (sumDF > m_minDF && sumDF < m_maxDF) {
            _stat temp = featureStat.get(f);
            Prt = Utils.sumOfArray(temp.getDF()) / classMemberSum;
            PrtNot = 1 - Prt;
            PrCitSum = 0;
            PrCitNotSum = 0;
            for (int i = 0; i < classMemberNo.length; i++) {
                // Pr(c_i | t) = Pr(t | c_i) * Pr(c_i) / Pr(t), estimated from the per-class DF
                PrCit[i] = ((double) temp.getDF()[i] / classMemberNo[i]) * PrCi[i] / Prt;
                PrCitNot[i] = ((double) (classMemberNo[i] - temp.getDF()[i]) / classMemberNo[i]) * PrCi[i] / PrtNot;
                if (PrCit[i] != 0) {
                    PrCitSum += PrCit[i] * Math.log(PrCit[i]);
                }
                if (PrCitNot[i] != 0) {
                    PrCitNotSum += PrCitNot[i] * Math.log(PrCitNot[i]);
                }
            }
            // weight the class-conditional sums by Pr(t) and Pr(~t)
            Gt = PrCiSum + Prt * PrCitSum + PrtNot * PrCitNotSum;
            m_selectedFeatures.add(new _RankItem(f, Gt));
        }
    }
}
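For reference, this is the standard information-gain criterion from the text-categorization feature-selection literature (Yang & Pedersen, 1997):

G(t) = -\sum_i \Pr(c_i) \log \Pr(c_i) + \Pr(t) \sum_i \Pr(c_i \mid t) \log \Pr(c_i \mid t) + \Pr(\bar{t}) \sum_i \Pr(c_i \mid \bar{t}) \log \Pr(c_i \mid \bar{t})

The loop estimates \Pr(c_i \mid t) by Bayes' rule from the per-class document frequencies stored in _stat: \Pr(c_i \mid t) = (DF_i(t) / |c_i|) \cdot \Pr(c_i) / \Pr(t), and analogously for \Pr(c_i \mid \bar{t}).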
Use of structures._stat in project IR_Base by Linda-sunshine.
Class FeatureSelector, method MI.
// Feature Selection -- MI.
public void MI(HashMap<String, _stat> featureStat, int[] classMemberNo) {
    m_selectedFeatures.clear();
    double[] PrCi = new double[classMemberNo.length];
    double[] ItCi = new double[classMemberNo.length];
    double N = Utils.sumOfArray(classMemberNo); // total number of documents
    double Iavg = 0;
    for (int i = 0; i < classMemberNo.length; i++)
        PrCi[i] = classMemberNo[i] / N;
    for (String f : featureStat.keySet()) {
        // Filter out features whose total DF falls outside (m_minDF, m_maxDF).
        int sumDF = Utils.sumOfArray(featureStat.get(f).getDF());
        if (sumDF > m_minDF && sumDF < m_maxDF) {
            _stat temp = featureStat.get(f);
            Iavg = 0;
            for (int i = 0; i < classMemberNo.length; i++) {
                double A = temp.getDF()[i]; // docs of class i that contain the term
                // I(t, c_i) = log( A * N / (|c_i| * DF(t)) )
                ItCi[i] = Math.log(A * N / ((double) classMemberNo[i] * Utils.sumOfArray(temp.getDF())));
                Iavg += ItCi[i] * PrCi[i];
            }
            m_selectedFeatures.add(new _RankItem(f, Iavg));
        }
    }
}
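In contingency-table form, with A = temp.getDF()[i] the number of class-i documents containing term t, A + C = classMemberNo[i] the class size, and A + B = DF(t) the term's total document frequency, the loop computes the usual approximation and its class-weighted average:

I(t, c_i) \approx \log \frac{A \cdot N}{(A + C)(A + B)}, \qquad I_{avg}(t) = \sum_i \Pr(c_i) \, I(t, c_i)

Note that A = 0 drives I(t, c_i) to negative infinity; the method applies no smoothing, so a feature absent from any class sinks to the bottom of the ranking.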
Use of structures._stat in project IR_Base by Linda-sunshine.
Class Analyzer, method setFeatureValues.
// Given the option specifying how feature values should be computed, calculate the feature value of every document in the corpus.
public void setFeatureValues(String fValue, int norm) {
    // Get the collection of all the documents.
    ArrayList<_Doc> docs = m_corpus.getCollection();
    int N = m_isCVStatLoaded ? m_TotalDF : docs.size();
    if (fValue.equals("TFIDF")) {
        for (int i = 0; i < docs.size(); i++) {
            _Doc temp = docs.get(i);
            _SparseFeature[] sfs = temp.getSparse();
            double avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                sf.setTF(sf.getValue());
                // length-normalized TF
                double TF = sf.getValue() / temp.getTotalDocLength();
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N + 1) / DF);
                double TFIDF = TF * IDF;
                sf.setValue(TFIDF);
                avgIDF += IDF;
            }
            // compute average IDF
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    } else if (fValue.equals("TFIDF-sublinear")) {
        for (int i = 0; i < docs.size(); i++) {
            _Doc temp = docs.get(i);
            _SparseFeature[] sfs = temp.getSparse();
            double avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                // sf.setTF(sf.getValue());
                // sublinear TF
                double TF = 1 + Math.log10(sf.getValue());
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = 1 + Math.log10(N / DF);
                double TFIDF = TF * IDF;
                sf.setValue(TFIDF);
                avgIDF += IDF;
            }
            // compute average IDF
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    } else if (fValue.equals("BM25")) {
        double k1 = 1.5; // typically in [1.2, 2]
        double b = 0.75; // length-normalization strength, in [0, 1]
        // Iterate over all the documents to get the average document length.
        double navg = 0;
        for (int k = 0; k < docs.size(); k++)
            navg += docs.get(k).getTotalDocLength();
        navg /= docs.size();
        for (int i = 0; i < docs.size(); i++) {
            _Doc temp = docs.get(i);
            _SparseFeature[] sfs = temp.getSparse();
            double n = temp.getTotalDocLength() / navg, avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                // sf.setTF(sf.getValue());
                double TF = sf.getValue();
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N - DF + 0.5) / (DF + 0.5));
                double BM25 = IDF * TF * (k1 + 1) / (k1 * (1 - b + b * n) + TF);
                sf.setValue(BM25);
                avgIDF += IDF;
            }
            // compute average IDF
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    } else if (fValue.equals("PLN")) {
        double s = 0.5; // pivot parameter, in [0, 1]
        // Iterate over all the documents to get the average document length.
        double navg = 0;
        for (int k = 0; k < docs.size(); k++)
            navg += docs.get(k).getTotalDocLength();
        navg /= docs.size();
        for (int i = 0; i < docs.size(); i++) {
            _Doc temp = docs.get(i);
            _SparseFeature[] sfs = temp.getSparse();
            double n = temp.getTotalDocLength() / navg, avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                // sf.setTF(sf.getValue());
                double TF = sf.getValue();
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N + 1) / DF);
                // pivoted length normalization: (1 + log(1 + log(TF))) / (1 - s + s * n) * IDF
                double PLN = (1 + Math.log(1 + Math.log(TF))) / (1 - s + s * n) * IDF;
                sf.setValue(PLN);
                avgIDF += IDF;
            }
            // compute average IDF
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    } else {
        System.out.println("No feature value is set; keeping the raw count of every feature in setFeatureValues().");
        // the original feature value is the raw TF
        for (int i = 0; i < docs.size(); i++) {
            _Doc temp = docs.get(i);
            _SparseFeature[] sfs = temp.getSparse();
            double avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                // sf.setTF(sf.getValue());
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N + 1) / DF);
                avgIDF += IDF;
            }
            // compute average IDF
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    }
    // Collections.sort(m_corpus.getCollection());
    if (norm == 1) {
        for (_Doc d : docs)
            Utils.L1Normalization(d.getSparse());
    } else if (norm == 2) {
        for (_Doc d : docs)
            Utils.L2Normalization(d.getSparse());
    } else
        System.out.println("No normalization is adopted, or a wrong parameter was passed to setFeatureValues()!");
    System.out.format("Text feature generated for %d documents...\n", m_corpus.getSize());
}
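Pulled out of the loop, the per-term weight the BM25 branch assigns is the classic Okapi BM25 form. A minimal sketch, assuming a hypothetical helper bm25Weight that is not part of IR_Base:

// Hedged sketch of the per-term BM25 weight computed above.
// tf: raw term frequency; df: document frequency; N: #documents;
// lenRatio: docLength / avgDocLength; k1, b: the usual BM25 constants.
static double bm25Weight(double tf, double df, int N, double lenRatio, double k1, double b) {
    double idf = Math.log((N - df + 0.5) / (df + 0.5));
    return idf * tf * (k1 + 1) / (k1 * (1 - b + b * lenRatio) + tf);
}

With k1 = 1.5 and b = 0.75 as in the snippet, bm25Weight(sf.getValue(), DF, N, n, 1.5, 0.75) reproduces the value assigned by sf.setValue(BM25).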
Use of structures._stat in project IR_Base by Linda-sunshine.
Class Analyzer, method rollBack.
void rollBack(HashMap<Integer, Double> spVct, int y) {
    if (!m_isCVLoaded) {
        for (int index : spVct.keySet()) {
            if (index < m_featureNames.size()) {
                String token = m_featureNames.get(index);
                _stat stat = m_featureStat.get(token);
                if (Utils.sumOfArray(stat.getDF()) == 1) {
                    // The feature appears only in this document: remove it from the feature set entirely.
                    m_featureNameIndex.remove(token);
                    m_featureStat.remove(token);
                    m_featureNames.remove(index);
                } else {
                    // The feature appears elsewhere too: just subtract this document's contribution.
                    stat.minusOneDF(y);
                    stat.minusNTTF(y, spVct.get(index));
                }
            }
        }
    } else {
        // If the CV and its statistics are loaded from file, there is nothing to roll back.
        if (m_isCVStatLoaded)
            return;
        // Otherwise, subtract the DF and TTF directly.
        for (int index : spVct.keySet()) {
            String token = m_featureNames.get(index);
            _stat stat = m_featureStat.get(token);
            stat.minusOneDF(y);
            stat.minusNTTF(y, spVct.get(index));
        }
    }
}
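These snippets exercise only a small surface of structures._stat. A minimal sketch of the class shape implied by the call sites alone — the real class in IR_Base almost certainly carries more state and methods (e.g., TTF getters and the increment counterparts) — might look like:

// Hedged sketch of structures._stat, reconstructed from the call sites above only.
public class _stat {
    private int[] m_DF;     // per-class document frequency of the feature
    private double[] m_TTF; // per-class total term frequency of the feature

    public _stat(int classNo) {
        m_DF = new int[classNo];
        m_TTF = new double[classNo];
    }

    public int[] getDF() { return m_DF; }
    public void minusOneDF(int y) { m_DF[y]--; }                      // used by rollBack
    public void minusNTTF(int y, double value) { m_TTF[y] -= value; } // used by rollBack
}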