Use of structures._SparseFeature in the project IR_Base by Linda-sunshine:
the method estimateGlobalLM of the class MultiThreadedLMAnalyzer.
// Estimate a global language model.
// We traverse all review documents instead of using the global TF.
// Returns a probability vector over the LM vocabulary; zero-count terms are
// floored at 0.0001 so downstream log() calls never see zero (note the vector
// is therefore no longer exactly normalized after flooring).
public double[] estimateGlobalLM() {
	double[] lm = new double[getLMFeatureSize()];
	double sum = 0;

	// Accumulate raw term counts over every review of every user.
	for (_User u : m_users) {
		for (_Review r : u.getReviews()) {
			for (_SparseFeature fv : r.getLMSparse()) {
				lm[fv.getIndex()] += fv.getValue();
				sum += fv.getValue();
			}
		}
	}

	for (int i = 0; i < lm.length; i++) {
		// Guard against an empty corpus: dividing by sum == 0 would turn
		// every entry into NaN; skipping the division leaves the zero
		// counts to be floored below.
		if (sum > 0)
			lm[i] /= sum;
		if (lm[i] == 0)
			lm[i] = 0.0001;
	}
	return lm;
}
Use of structures._SparseFeature in the project IR_Base by Linda-sunshine:
the method LoadDoc of the class VctAnalyzer.
@Override
public void LoadDoc(String filename) {
	// Load a pre-vectorized corpus file: one document per line, formatted as
	// "<label> <fvIdx>:<value> ... [#<itemID>-<name>]". Feature indices in the
	// file start from 1 and are shifted to 0-based internally.
	// try-with-resources: the original closed the reader only on the success
	// path and leaked it when an exception was thrown mid-parse.
	try (BufferedReader reader = new BufferedReader(
			new InputStreamReader(new FileInputStream(filename), "UTF-8"))) {
		String line;
		String[] container, entry;
		_SparseFeature[] spVct;
		_Doc doc;
		int maxFvIndex = 0, index;
		while ((line = reader.readLine()) != null) {
			container = line.split(" ");
			// Skip documents that are too short.
			if (// || Math.random() < 0.65
			container.length <= m_lengthThreshold)
				continue;
			doc = new _Doc(m_corpus.getSize(), null, Integer.valueOf(container[0]));
			if (!line.contains("#"))
				spVct = new _SparseFeature[container.length - 1];
			else
				// exclude the comment entry from the feature vector
				spVct = new _SparseFeature[container.length - 2];
			for (int i = 1; i < container.length; i++) {
				if (container[i].startsWith("#")) {
					// parse the comment part ("#itemID-name") for this review
					entry = container[i].split("-");
					doc.setItemID(entry[0].substring(1));
					doc.setName(entry[1]);
				} else {
					entry = container[i].split(":");
					// the loaded feature index starts from 1
					index = Integer.valueOf(entry[0]) - 1;
					spVct[i - 1] = new _SparseFeature(index, Double.valueOf(entry[1]));
					if (index > maxFvIndex)
						maxFvIndex = index;
				}
			}
			doc.setSpVct(spVct);
			m_corpus.addDoc(doc);
			m_classMemberNo[doc.getYLabel()]++;
		}
		// Grow the controlled vocabulary to cover the largest index seen.
		reviseCV(maxFvIndex);
		System.out.format("Loading %d vector files with %d features from %s...\n", m_corpus.getSize(), m_featureNames.size(), filename);
	} catch (IOException e) {
		System.err.format("[Error]Failed to open file %s!!", filename);
	}
}
Use of structures._SparseFeature in the project IR_Base by Linda-sunshine:
the method createLinearFeature_diff of the class LinearSVMMetricLearning.
// Build a training sample for metric learning from a pair of documents.
// The sample is the outer product of the documents' projected difference
// vector with itself; since that matrix is symmetric, only the lower
// triangle (including the diagonal) is materialized.
Feature[] createLinearFeature_diff(_Doc d1, _Doc d2) {
	_SparseFeature[] proj1 = d1.getProjectedFv();
	_SparseFeature[] proj2 = d2.getProjectedFv();
	if (proj1 == null || proj2 == null)
		return null;

	_SparseFeature[] diff = Utils.diffVector(proj1, proj2);
	int n = diff.length;
	// Lower triangle with diagonal: n*(n+1)/2 entries in total.
	Feature[] sample = new Feature[n * (n + 1) / 2];
	int pos = 0;
	for (int i = 0; i < n; i++) {
		int rowIdx = diff[i].getIndex();
		double vi = diff[i].getValue();
		for (int j = 0; j < i; j++) {
			// Off-diagonal cells occur twice in the full matrix, hence the
			// factor of 2. NOTE(review): getIndex maps a V*V pair into one
			// int — looks prone to overflow for large vocabularies; confirm.
			double crossValue = 2 * vi * diff[j].getValue();
			sample[pos++] = new FeatureNode(getIndex(rowIdx, diff[j].getIndex()), crossValue);
		}
		// Diagonal cell.
		sample[pos++] = new FeatureNode(getIndex(rowIdx, rowIdx), vi * vi);
	}
	return sample;
}
Use of structures._SparseFeature in the project IR_Base by Linda-sunshine:
the method MStep of the class NaiveBayesEM.
// M-step of semi-supervised Naive Bayes EM: re-estimate the class prior
// m_pY and the class-conditional feature table m_Pxy from labeled documents
// (hard counts) and, after the first iteration, from unlabeled documents
// (soft counts weighted by the E-step posterior stored in doc.m_sstat).
// Both tables end up in log space with additive smoothing applied.
void MStep(Collection<_Doc> trainSet, int iter) {
	super.init();

	// Accumulate sufficient statistics.
	for (_Doc doc : trainSet) {
		int sourceType = doc.getSourceType();
		if (sourceType == 2) {
			// Labeled document: a hard count for its observed label.
			int y = doc.getYLabel();
			m_pY[y]++;
			for (_SparseFeature sf : doc.getSparse())
				m_Pxy[y][sf.getIndex()] += m_presence ? 1.0 : sf.getValue();
		} else if (sourceType == 1 && iter > 0) {
			// Unlabeled document: soft counts over all classes.
			double[] posterior = doc.m_sstat;
			for (int y = 0; y < m_classNo; y++) {
				double weight = posterior[y];
				m_pY[y] += weight;
				for (_SparseFeature sf : doc.getSparse())
					m_Pxy[y][sf.getIndex()] += (m_presence ? 1.0 : sf.getValue()) * weight;
			}
		}
	}

	// Normalize in log space with additive smoothing (m_deltaY, m_deltaXY).
	double logSumY = Math.log(Utils.sumOfArray(m_pY) + m_deltaY * m_classNo);
	for (int y = 0; y < m_classNo; y++) {
		m_pY[y] = Math.log(m_pY[y] + m_deltaY) - logSumY;
		double logSumX = Math.log(Utils.sumOfArray(m_Pxy[y]) + m_featureSize * m_deltaXY);
		for (int j = 0; j < m_featureSize; j++)
			m_Pxy[y][j] = Math.log(m_Pxy[y][j] + m_deltaXY) - logSumX;
	}
}
Use of structures._SparseFeature in the project IR_Base by Linda-sunshine:
the method train of the class NaiveBayes.
// Train Naive Bayes on the given document collection: count class priors
// and per-class feature statistics, then convert both to smoothed log
// probabilities in place (m_pY, m_Pxy).
@Override
public double train(Collection<_Doc> trainSet) {
	init();

	// Accumulate counts from every training document.
	for (_Doc doc : trainSet) {
		int y = doc.getYLabel();
		m_pY[y]++;
		for (_SparseFeature sf : doc.getSparse())
			m_Pxy[y][sf.getIndex()] += m_presence ? 1.0 : sf.getValue();
	}

	// Convert counts to log space with additive smoothing.
	for (int y = 0; y < m_classNo; y++) {
		// The prior is left unnormalized (up to a constant); normalization
		// does not change the argmax used at prediction time.
		m_pY[y] = Math.log(m_pY[y] + m_deltaY);
		double logSum = Math.log(Utils.sumOfArray(m_Pxy[y]) + m_featureSize * m_deltaXY);
		for (int j = 0; j < m_featureSize; j++)
			m_Pxy[y][j] = Math.log(m_Pxy[y][j] + m_deltaXY) - logSum;
	}

	// TODO: the training log-likelihood is not computed yet.
	return 0;
}
Aggregations