Use of structures._RankItem in project IR_Base by Linda-sunshine.
From the class L2RMetricLearning, method createTrainingCorpus.
/**
 * Builds the learning-to-rank training corpus from the labeled training set.
 *
 * <p>Each training document is tried as a query. Its candidate list consists of the
 * documents kept by the similarity ranker plus randomly injected documents. A query is
 * skipped when its candidates contain no same-label or no different-label document, or
 * when its label is 1 and {@code negQ < 1.1 * posQ}. For every accepted query one
 * {@code _QUPair} per candidate is created and the ranking pairs are generated.
 * Finally the features are normalized.
 *
 * @return the total number of ranking pairs created over all accepted queries
 */
protected int createTrainingCorpus() {
    // pre-compute the similarity between labeled documents
    calcLabeledSimilarities();

    MyPriorityQueue<_RankItem> simRanker = new MyPriorityQueue<_RankItem>(m_topK);
    ArrayList<_Doc> neighbors = new ArrayList<_Doc>();
    _Query q;
    _Doc di, dj;
    int posQ = 0, negQ = 0, pairSize = 0;
    int relevant, irrelevant;

    for (int i = 0; i < m_trainSet.size(); i++) {
        // candidate query document
        di = m_trainSet.get(i);
        relevant = 0;
        irrelevant = 0;

        // using content similarity to construct initial ranking
        for (int j = 0; j < m_trainSet.size(); j++) {
            if (i == j)
                continue;
            simRanker.add(new _RankItem(j, m_LabeledCache[getIndex(i, j)]));
        }

        // find the top K similar documents by default similarity measure
        for (_RankItem it : simRanker) {
            dj = m_trainSet.get(it.m_index);
            neighbors.add(dj);
            if (di.getYLabel() == dj.getYLabel())
                relevant++;
            else
                irrelevant++;
        }

        // inject some random neighbors
        // The budget is capped at the number of documents other than the query itself;
        // without the cap the loop below could never reach its target on a small
        // training set and would spin forever.
        // NOTE(review): the cap assumes neighbors.contains() treats distinct documents
        // as unequal (i.e. _Doc uses identity equality) - confirm.
        double neighborBudget = Math.min((1.0 + m_noiseRatio) * m_topK, m_trainSet.size() - 1);
        int j = 0;
        while (neighbors.size() < neighborBudget) {
            if (i != j) {
                dj = m_trainSet.get(j);
                if (Math.random() < 0.02 && !neighbors.contains(dj)) {
                    neighbors.add(dj);
                    if (di.getYLabel() == dj.getYLabel())
                        relevant++;
                    else
                        irrelevant++;
                }
            }
            // keep cycling through the training set until the random budget is used up
            j = (j + 1) % m_trainSet.size();
        }

        if (relevant == 0 || irrelevant == 0 || (di.getYLabel() == 1 && negQ < 1.1 * posQ)) {
            // reject the query; clear the cache for next query
            simRanker.clear();
            neighbors.clear();
            continue;
        } else if (di.getYLabel() == 1)
            posQ++;
        else
            negQ++;

        // accept the query
        q = new _Query();
        m_queries.add(q);

        // construct features for the most similar documents with respect to the query di
        for (_Doc d : neighbors)
            q.addQUPair(new _QUPair(d.getYLabel() == di.getYLabel() ? 1 : 0, genRankingFV(di, d)));
        pairSize += q.createRankingPairs();

        // clear the cache for next query
        simRanker.clear();
        neighbors.clear();
    }

    // normalize the features by z-score
    normalize();
    System.out.format("Generate %d(%d:%d) ranking pairs for L2R model training...\n", pairSize, posQ, negQ);
    return pairSize;
}
Use of structures._RankItem in project IR_Base by Linda-sunshine.
From the class DCMLDA4AC_test, method printWordTopicDistribution.
/**
 * Writes, for one document, the ranked words of every topic to
 * {@code <document name>.txt} inside the given folder.
 *
 * <p>Each output line starts with the topic index and {@code d.m_topics[i]}, followed by
 * the words kept by a ranker of capacity {@code k} over {@code m_wordTopic_prob[i]}.
 * Values are exponentiated before printing when {@code m_logSpace} is set.
 *
 * @param d the document to dump; it is cast to {@code _ParentDoc4DCM}
 * @param wordTopicDistributionFolder the directory that receives the output file
 * @param k the capacity of the per-topic ranker
 */
protected void printWordTopicDistribution(_Doc d, File wordTopicDistributionFolder, int k) {
    _ParentDoc4DCM pDoc = (_ParentDoc4DCM) d;
    String wordTopicDistributionFile = pDoc.getName() + ".txt";
    PrintWriter pw = null;
    try {
        pw = new PrintWriter(new File(wordTopicDistributionFolder, wordTopicDistributionFile));
        for (int i = 0; i < number_of_topics; i++) {
            MyPriorityQueue<_RankItem> fVector = new MyPriorityQueue<_RankItem>(k);
            for (int v = 0; v < vocabulary_size; v++) {
                String featureName = m_corpus.getFeature(v);
                double wordProb = pDoc.m_wordTopic_prob[i][v];
                fVector.add(new _RankItem(featureName, wordProb));
            }
            pw.format("Topic %d(%.5f):\t", i, d.m_topics[i]);
            for (_RankItem it : fVector)
                pw.format("%s(%.5f)\t", it.m_name, m_logSpace ? Math.exp(it.m_value) : it.m_value);
            pw.write("\n");
        }
        pw.flush();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } finally {
        // close the writer even when a runtime exception escapes the loop above
        if (pw != null) {
            pw.close();
        }
    }
}
Use of structures._RankItem in project IR_Base by Linda-sunshine.
From the class DCMLDA4AC_test, method printTopBeta.
/**
 * Writes the ranked entries of every row of {@code m_beta} to a file.
 *
 * <p>Each output line starts with the topic index and {@code m_sstat[i]}, followed by the
 * features kept by a ranker of capacity {@code k}. Values are exponentiated before
 * printing when {@code m_logSpace} is set. Failures are reported on standard error and
 * do not propagate to the caller.
 *
 * @param k the capacity of the per-topic ranker
 * @param topWordPath the path of the output file
 */
public void printTopBeta(int k, String topWordPath) {
    PrintWriter topWordWriter = null;
    try {
        topWordWriter = new PrintWriter(new File(topWordPath));
        for (int i = 0; i < m_beta.length; i++) {
            MyPriorityQueue<_RankItem> fVector = new MyPriorityQueue<_RankItem>(k);
            for (int j = 0; j < vocabulary_size; j++)
                fVector.add(new _RankItem(m_corpus.getFeature(j), m_beta[i][j]));

            topWordWriter.format("Topic %d(%.5f):\t", i, m_sstat[i]);
            for (_RankItem it : fVector)
                topWordWriter.format("%s(%.5f)\t", it.m_name, m_logSpace ? Math.exp(it.m_value) : it.m_value);
            topWordWriter.write("\n");
        }
    } catch (Exception ex) {
        // still best-effort, but report what actually failed instead of always
        // claiming the file was not found
        System.err.println("Failed to write top beta to " + topWordPath + ": " + ex);
    } finally {
        // close the writer on every path so the file handle is not leaked
        if (topWordWriter != null) {
            topWordWriter.close();
        }
    }
}
Use of structures._RankItem in project IR_Base by Linda-sunshine.
From the class DCMLDA4AC_test, method printTopWordsDistribution.
/**
 * Prints the ranked words of every topic both to a file and to standard output.
 *
 * <p>{@code m_sstat} is first reset and refilled with the per-topic sum of
 * {@code d.m_topics} over the training set (exponentiated when {@code m_logSpace} is
 * set) and then L1-normalized. Each output line starts with the topic index and
 * {@code m_sstat[i]}, followed by the features kept by a ranker of capacity
 * {@code topK} over {@code m_topic_word_prob[i]}. Failures are reported on standard
 * error and do not propagate to the caller.
 *
 * @param topK the capacity of the per-topic ranker
 * @param topWordFile the path of the output file
 */
protected void printTopWordsDistribution(int topK, String topWordFile) {
    Arrays.fill(m_sstat, 0);
    System.out.println("print top words");
    for (_Doc d : m_trainSet) {
        for (int i = 0; i < number_of_topics; i++)
            m_sstat[i] += m_logSpace ? Math.exp(d.m_topics[i]) : d.m_topics[i];
    }
    Utils.L1Normalization(m_sstat);

    PrintWriter betaOut = null;
    try {
        System.out.println("top word file");
        betaOut = new PrintWriter(new File(topWordFile));
        for (int i = 0; i < m_topic_word_prob.length; i++) {
            MyPriorityQueue<_RankItem> fVector = new MyPriorityQueue<_RankItem>(topK);
            for (int j = 0; j < vocabulary_size; j++)
                fVector.add(new _RankItem(m_corpus.getFeature(j), m_topic_word_prob[i][j]));

            betaOut.format("Topic %d(%.3f):\t", i, m_sstat[i]);
            for (_RankItem it : fVector) {
                double shown = m_logSpace ? Math.exp(it.m_value) : it.m_value;
                betaOut.format("%s(%.3f)\t", it.m_name, shown);
                System.out.format("%s(%.3f)\t", it.m_name, shown);
            }
            betaOut.println();
            System.out.println();
        }
        betaOut.flush();
    } catch (Exception ex) {
        // still best-effort, but report what actually failed instead of always
        // claiming the file was not found
        System.err.println("Failed to write top words to " + topWordFile + ": " + ex);
    } finally {
        // close the writer on every path so the file handle is not leaked
        if (betaOut != null) {
            betaOut.close();
        }
    }
}
Use of structures._RankItem in project IR_Base by Linda-sunshine.
From the class LDAGibbs4AC_test, method printTopWords.
/**
 * Reports the final log likelihood, triggers the debug output, and prints the ranked
 * words of every topic both to a file and to standard output.
 *
 * <p>The debug output prefix is {@code betaFile} with the substring
 * {@code "topWords.txt"} removed. {@code m_sstat} is reset and refilled with the
 * per-topic sum of {@code d.m_topics} over the training set (exponentiated when
 * {@code m_logSpace} is set) and then L1-normalized. Each output line starts with the
 * topic index and {@code m_sstat[i]}, followed by the features kept by a ranker of
 * capacity {@code k} over {@code topic_term_probabilty[i]}. Failures while writing are
 * reported on standard error and do not propagate to the caller.
 *
 * @param k the capacity of the per-topic ranker
 * @param betaFile the path of the output file
 */
public void printTopWords(int k, String betaFile) {
    double loglikelihood = calculate_log_likelihood();
    System.out.format("Final Log Likelihood %.3f\t", loglikelihood);

    String filePrefix = betaFile.replace("topWords.txt", "");
    debugOutput(filePrefix);

    Arrays.fill(m_sstat, 0);
    System.out.println("print top words");
    for (_Doc d : m_trainSet) {
        for (int i = 0; i < number_of_topics; i++)
            m_sstat[i] += m_logSpace ? Math.exp(d.m_topics[i]) : d.m_topics[i];
    }
    Utils.L1Normalization(m_sstat);

    PrintWriter betaOut = null;
    try {
        System.out.println("beta file");
        betaOut = new PrintWriter(new File(betaFile));
        for (int i = 0; i < topic_term_probabilty.length; i++) {
            MyPriorityQueue<_RankItem> fVector = new MyPriorityQueue<_RankItem>(k);
            for (int j = 0; j < vocabulary_size; j++)
                fVector.add(new _RankItem(m_corpus.getFeature(j), topic_term_probabilty[i][j]));

            betaOut.format("Topic %d(%.3f):\t", i, m_sstat[i]);
            for (_RankItem it : fVector) {
                double shown = m_logSpace ? Math.exp(it.m_value) : it.m_value;
                betaOut.format("%s(%.3f)\t", it.m_name, shown);
                System.out.format("%s(%.3f)\t", it.m_name, shown);
            }
            betaOut.println();
            System.out.println();
        }
        betaOut.flush();
    } catch (Exception ex) {
        // still best-effort, but report what actually failed instead of always
        // claiming the file was not found
        System.err.println("Failed to write top words to " + betaFile + ": " + ex);
    } finally {
        // close the writer on every path so the file handle is not leaked
        if (betaOut != null) {
            betaOut.close();
        }
    }
}
Aggregations