use of structures._SparseFeature in project IR_Base by Linda-sunshine.
the class Analyzer method setFeatureValues.
/**
 * Re-weights every document's sparse feature vector according to the chosen
 * scheme, stores the per-document average IDF, and optionally normalizes the
 * vectors.
 *
 * <p>The collection size N used inside the IDF terms is m_TotalDF when
 * m_isCVStatLoaded is set, otherwise the number of documents in the corpus.
 * The average document length needed by BM25 and PLN is always taken over the
 * documents actually held by the corpus, so it stays well defined even when
 * m_TotalDF differs from the corpus size.
 *
 * @param fValue "TFIDF", "TFIDF-sublinear", "BM25" or "PLN"; any other value
 *               leaves the stored feature values untouched (only the average
 *               IDF is computed)
 * @param norm   1 applies Utils.L1Normalization, 2 applies
 *               Utils.L2Normalization; any other value skips normalization
 */
public void setFeatureValues(String fValue, int norm) {
    // Get the collection of all the documents.
    ArrayList<_Doc> docs = m_corpus.getCollection();
    int N = m_isCVStatLoaded ? m_TotalDF : docs.size();

    if (fValue.equals("TFIDF")) {
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            double avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                // Keep the current value as TF before it is overwritten below.
                sf.setTF(sf.getValue());
                // TF normalized by the document length.
                double TF = sf.getValue() / temp.getTotalDocLength();
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N + 1) / DF);
                sf.setValue(TF * IDF);
                avgIDF += IDF;
            }
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    } else if (fValue.equals("TFIDF-sublinear")) {
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            double avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                // Sublinear (log-scaled) TF.
                double TF = 1 + Math.log10(sf.getValue());
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = 1 + Math.log10(N / DF);
                sf.setValue(TF * IDF);
                avgIDF += IDF;
            }
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    } else if (fValue.equals("BM25")) {
        // k1 scales the TF saturation term; b weights the length ratio n below.
        double k1 = 1.5;
        double b = 0.75;
        // Average length over the documents we actually hold; N may be
        // m_TotalDF and must not be used to index into docs.
        double navg = averageDocLength(docs);
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            double n = temp.getTotalDocLength() / navg, avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                double TF = sf.getValue();
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N - DF + 0.5) / (DF + 0.5));
                double BM25 = IDF * TF * (k1 + 1) / (k1 * (1 - b + b * n) + TF);
                sf.setValue(BM25);
                avgIDF += IDF;
            }
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    } else if (fValue.equals("PLN")) {
        // s weights the length ratio n in the denominator below.
        double s = 0.5;
        // Average length over the documents we actually hold; N may be
        // m_TotalDF and must not be used to index into docs.
        double navg = averageDocLength(docs);
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            double n = temp.getTotalDocLength() / navg, avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                double TF = sf.getValue();
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N + 1) / DF);
                double PLN = (1 + Math.log(1 + Math.log(TF)) / (1 - s + s * n)) * IDF;
                sf.setValue(PLN);
                avgIDF += IDF;
            }
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    } else {
        System.out.println("No feature value is set, keep the raw count of every feature in setFeatureValues().");
        // Feature values stay as they are; only the average IDF is recorded.
        for (_Doc temp : docs) {
            _SparseFeature[] sfs = temp.getSparse();
            double avgIDF = 0;
            for (_SparseFeature sf : sfs) {
                String featureName = m_featureNames.get(sf.getIndex());
                _stat stat = m_featureStat.get(featureName);
                double DF = Utils.sumOfArray(stat.getDF());
                double IDF = Math.log((N + 1) / DF);
                avgIDF += IDF;
            }
            temp.setAvgIDF(avgIDF / sfs.length);
        }
    }

    if (norm == 1) {
        for (_Doc d : docs)
            Utils.L1Normalization(d.getSparse());
    } else if (norm == 2) {
        for (_Doc d : docs)
            Utils.L2Normalization(d.getSparse());
    } else
        System.out.println("No normalizaiton is adopted here or wrong parameters in setFeatureValues()!");
    System.out.format("Text feature generated for %d documents...\n", m_corpus.getSize());
}

// Mean of getTotalDocLength() over exactly the documents in the given list.
private double averageDocLength(ArrayList<_Doc> docs) {
    double total = 0;
    for (_Doc d : docs)
        total += d.getTotalDocLength();
    return total / docs.size();
}
use of structures._SparseFeature in project IR_Base by Linda-sunshine.
the class MTCLRWithMMB method gradientByFunc.
/**
 * Subtracts one review's contribution from the gradient vector g, both in the
 * slice of the review's cluster and in the super-model slice that starts at
 * m_dim * m_kBar.
 *
 * <p>An out-of-range cluster index is only reported on stderr; execution
 * continues afterwards.
 *
 * @param u      adaptation structure (not read by this implementation)
 * @param review the document, cast to _Review
 * @param weight multiplier applied to the prediction residual
 * @param g      gradient array updated in place
 */
@Override
protected void gradientByFunc(_AdaptStruct u, _Doc review, double weight, double[] g) {
    final _Review rev = (_Review) review;
    final int clusterIdx = rev.getHDPThetaStar().getIndex();
    if (!(clusterIdx >= 0 && clusterIdx < m_kBar))
        System.err.println("Error,cannot find the theta star!");

    // Start positions of the cluster slice and of the super-model slice.
    final int clusterBase = m_dim * clusterIdx;
    final int superBase = m_dim * m_kBar;
    final double residual = weight * (rev.getYLabel() - logit(rev.getSparse(), rev));

    // Bias term (x0 = 1): cluster model first, then super model scaled by m_q.
    g[clusterBase] -= residual;
    g[superBase] -= m_q * residual;

    // Remaining dimensions; feature index i is stored at position i + 1.
    for (_SparseFeature fv : review.getSparse()) {
        final int pos = fv.getIndex() + 1;
        final double x = fv.getValue();
        g[clusterBase + pos] -= residual * x;
        g[superBase + pos] -= residual * x * m_q;
    }
}
use of structures._SparseFeature in project IR_Base by Linda-sunshine.
the class MTCLinAdaptWithMMB method logit.
/**
 * Logistic prediction for a review; overrides the parent-class version.
 *
 * <p>The model array of the review's theta star is read as two halves: entry k
 * multiplies the super weight and entry m_dim + k is added to it, where k is
 * the feature group looked up in m_featureGroupMap.
 *
 * @param fvs sparse features of the review
 * @param r   review whose theta-star model is used
 * @return Utils.logistic applied to the accumulated linear score
 */
@Override
protected double logit(_SparseFeature[] fvs, _Review r) {
    final double[] adapt = r.getHDPThetaStar().getModel();

    // Bias term: w_s0 * a0 + b0.
    double score = adapt[0] * getSupWeights(0) + adapt[m_dim];

    for (int i = 0; i < fvs.length; i++) {
        final _SparseFeature fv = fvs[i];
        final int featIdx = fv.getIndex() + 1;
        final int group = m_featureGroupMap[featIdx];
        score += (adapt[group] * getSupWeights(featIdx) + adapt[m_dim + group]) * fv.getValue();
    }
    return Utils.logistic(score);
}
use of structures._SparseFeature in project IR_Base by Linda-sunshine.
the class _MMBAdaptStruct method evaluate.
/**
 * Combines the per-cluster logistic predictions for a document, weighting each
 * by the matching entry of the review's cluster posterior, and accumulating in
 * log space.
 *
 * <p>Side effects: increments doc.m_pCount and adds exp(result) to doc.m_prob.
 *
 * @param doc the document, cast to _Review to read its cluster posterior
 * @return the accumulated log-space value (0 when the posterior array is empty)
 */
@Override
public double evaluate(_Doc doc) {
    final _Review rev = (_Review) doc;
    final double[] cluWeights = rev.getCluPosterior();
    double logProb = 0;

    for (int c = 0; c < cluWeights.length; c++) {
        double score;
        if (m_dim == 0) {
            // Not adaptation based: plain dot product with the cluster model.
            // Known limitation noted by the authors: assumes binary classification.
            score = Utils.dotProduct(CLRWithMMB.m_hdpThetaStars[c].getModel(), doc.getSparse(), 0);
            if (MTCLRWithMMB.m_supWeights != null && CLRWithDP.m_q != 0)
                score += CLRWithDP.m_q * Utils.dotProduct(MTCLRWithMMB.m_supWeights, doc.getSparse(), 0);
        } else {
            final double[] model = CLRWithMMB.m_hdpThetaStars[c].getModel();
            // Bias term: w_s0 * a0 + b0.
            score = model[0] * CLinAdaptWithMMB.m_supWeights[0] + model[m_dim];
            for (_SparseFeature fv : doc.getSparse()) {
                final int idx = fv.getIndex() + 1;
                final int grp = m_featureGroupMap[idx];
                score += (model[grp] * CLinAdaptWithMMB.m_supWeights[idx] + model[m_dim + grp]) * fv.getValue();
            }
        }

        // Stay in log space for numerical precision: the first cluster seeds
        // the accumulator, later ones are merged with Utils.logSum.
        final double term = cluWeights[c] + Math.log(Utils.logistic(score));
        logProb = (c == 0) ? term : Utils.logSum(logProb, term);
    }

    // Accumulate the prediction results during the sampling procedure.
    doc.m_pCount++;
    doc.m_prob += Math.exp(logProb);
    return logProb;
}
use of structures._SparseFeature in project IR_Base by Linda-sunshine.
the class MTRegLR method gradientByFunc.
/**
 * Subtracts one review's contribution from m_g, both in the slice belonging to
 * the given user and in the global slice located after all user slices.
 *
 * @param user   adaptation structure; its id selects the user slice
 * @param review document supplying the label and sparse features
 * @param weight multiplier applied to the prediction residual
 */
@Override
protected void gradientByFunc(_AdaptStruct user, _Doc review, double weight) {
    // Each slice holds one bias entry plus m_featureSize feature entries.
    final int userBase = (m_featureSize + 1) * user.getId();
    final int globalBase = (m_featureSize + 1) * m_userList.size();

    double residual = weight * (review.getYLabel() - logit(review.getSparse(), user));
    if (m_LNormFlag)
        residual /= getAdaptationSize(user);

    // Bias term (x0 = 1): user part, then global part scaled by m_u.
    m_g[userBase] -= residual;
    m_g[globalBase] -= m_u * residual;

    // Remaining dimensions; feature index i is stored at position i + 1.
    for (_SparseFeature fv : review.getSparse()) {
        final int pos = fv.getIndex() + 1;
        final double x = fv.getValue();
        m_g[userBase + pos] -= residual * x;
        m_g[globalBase + pos] -= residual * m_u * x;
    }
}
Aggregations