use of structures._Word in project IR_Base by Linda-sunshine.
the class sparseClusterDCMLDA method sampleTopicAssignment.
/**
 * Redraws the topic of every word in {@code DCMDoc}.
 *
 * <p>For each word the current assignment is removed from the document-level
 * counts (and, when {@code m_collectCorpusStats} is set, from the corpus- and
 * cluster-level counts), a weight
 * {@code topicInDocProb(k, ...) * wordTopicProb(k, wid, cluster)} is computed for
 * every topic whose {@code m_topicIndicator} flag is on, a topic is drawn with
 * probability proportional to those weights, and the counts are restored for
 * the drawn topic.
 *
 * <p>If no topic can be drawn (all weights are zero, the random draw is exactly
 * zero, or the total is NaN) the word keeps its previous topic, so the count
 * arrays are never indexed with -1.
 *
 * @param DCMDoc document whose word-topic assignments are resampled in place
 */
protected void sampleTopicAssignment(_Doc4SparseDCMLDA DCMDoc) {
    int clusterIndex = DCMDoc.m_clusterIndicator;

    for (_Word w : DCMDoc.getWords()) {
        int wid = w.getIndex();
        int oldTid = w.getTopic();

        // Take the word's current assignment out of every count it contributes to.
        DCMDoc.m_sstat[oldTid]--;
        DCMDoc.m_wordTopic_stat[oldTid][wid]--;
        if (m_collectCorpusStats) {
            word_topic_sstat[oldTid][wid]--;
            m_clusterTopicStats[clusterIndex][oldTid]--;
            m_clusterTopicWordStats[clusterIndex][oldTid][wid]--;
        }

        double denominator = DCMDoc.m_alphaDoc;
        denominator += Utils.sumOfArray(DCMDoc.m_sstat);

        // Unnormalised weight per topic; disabled topics keep weight 0.
        double p = 0;
        for (int k = 0; k < number_of_topics; k++) {
            m_topicProbCache[k] = 0;
            if (!DCMDoc.m_topicIndicator[k]) {
                continue;
            }

            // Each factor is evaluated once and reused for both the diagnostic
            // and the weight (assumes both helpers are side-effect free).
            double docTerm = topicInDocProb(k, denominator, DCMDoc);
            if (docTerm < 0) {
                System.out.println("negative1\t" + docTerm);
            }
            double wordTerm = wordTopicProb(k, wid, clusterIndex);
            if (wordTerm < 0) {
                System.out.println("negative2\t" + wordTerm);
            }

            m_topicProbCache[k] = docTerm * wordTerm;
            p += m_topicProbCache[k];
        }

        p *= m_rand.nextDouble();
        if (p <= 0) {
            System.out.println(p + "\t" + DCMDoc.getName() + "\t" + DCMDoc.m_indicatorTrue_stat);
        }

        // Walk the cumulative weights until the draw is used up. The topic that
        // absorbs the draw always has positive weight; remembering the last
        // positive-weight topic also covers floating-point residue at the end.
        int newTid = -1;
        for (int k = 0; k < number_of_topics && p > 0; k++) {
            p -= m_topicProbCache[k];
            if (m_topicProbCache[k] > 0) {
                newTid = k;
            }
        }
        if (newTid < 0) {
            // Nothing could be drawn: keep the previous assignment instead of
            // writing topic -1 and indexing the count arrays out of bounds.
            newTid = oldTid;
        }

        w.setTopic(newTid);
        DCMDoc.m_sstat[newTid]++;
        DCMDoc.m_wordTopic_stat[newTid][wid]++;
        if (m_collectCorpusStats) {
            word_topic_sstat[newTid][wid]++;
            m_clusterTopicStats[clusterIndex][newTid]++;
            m_clusterTopicWordStats[clusterIndex][newTid][wid]++;
        }
    }
}
use of structures._Word in project IR_Base by Linda-sunshine.
the class sparseClusterDCMLDA method sampleClusterIndex.
/**
 * Redraws the cluster assignment of document {@code d}.
 *
 * <p>The document (its cluster count and the per-word cluster/topic counts) is
 * first removed from its current cluster. Each cluster {@code c} is then scored
 * as {@code wordByClusterProb(d, c) + clusterProb(c)}; the scores are treated as
 * log-scale values: the maximum is subtracted and the result exponentiated, so
 * the best cluster gets weight 1 and no weight can overflow. A cluster is drawn
 * with probability proportional to these weights and the document's counts are
 * added to it.
 *
 * <p>If the draw falls through every cluster (floating-point residue or NaN
 * scores), the diagnostics are printed and the last cluster with positive weight
 * is used, or the previous cluster when none has positive weight, so the count
 * arrays are never indexed at {@code m_clusterNum}.
 *
 * @param d document to reassign; must be a {@code _Doc4SparseDCMLDA}
 */
public void sampleClusterIndex(_Doc d) {
    _Doc4SparseDCMLDA DCMDoc = (_Doc4SparseDCMLDA) d;
    Arrays.fill(m_clusterSamplingCache, 0);

    // Remove the document from its current cluster before scoring.
    int previousCluster = DCMDoc.m_clusterIndicator;
    m_clusterStats[previousCluster]--;
    for (_Word w : DCMDoc.getWords()) {
        int wid = w.getIndex();
        int tid = w.getTopic();
        m_clusterTopicWordStats[previousCluster][tid][wid]--;
        m_clusterTopicStats[previousCluster][tid]--;
    }

    // Score every cluster once and track the maximum for stabilisation.
    double maxClusterProb = 0;
    for (int c = 0; c < m_clusterNum; c++) {
        m_clusterSamplingCache[c] = wordByClusterProb(d, c) + clusterProb(c);
        if (c == 0 || m_clusterSamplingCache[c] > maxClusterProb) {
            maxClusterProb = m_clusterSamplingCache[c];
        }
    }

    // exp(score - max) lies in [0, 1] (or is NaN), so no overflow guard is needed.
    double p = 0;
    for (int c = 0; c < m_clusterNum; c++) {
        m_clusterSamplingCache[c] = Math.exp(m_clusterSamplingCache[c] - maxClusterProb);
        p += m_clusterSamplingCache[c];
    }

    p *= m_rand.nextDouble();

    int sampledCluster = -1;
    int lastPositiveCluster = -1;
    for (int c = 0; c < m_clusterNum; c++) {
        if (m_clusterSamplingCache[c] > 0) {
            lastPositiveCluster = c;
        }
        p -= m_clusterSamplingCache[c];
        if (p <= 0) {
            sampledCluster = c;
            break;
        }
    }

    if (sampledCluster < 0) {
        // The draw was not absorbed by any cluster: report it, then pick a
        // valid index instead of running off the end of the arrays.
        System.out.println("p\t" + p);
        for (int c = 0; c < m_clusterNum; c++) {
            System.out.println("c\t" + m_clusterSamplingCache[c]);
        }
        sampledCluster = lastPositiveCluster >= 0 ? lastPositiveCluster : previousCluster;
    }

    // Add the document to the drawn cluster.
    DCMDoc.m_clusterIndicator = sampledCluster;
    m_clusterStats[sampledCluster]++;
    for (_Word w : DCMDoc.getWords()) {
        int wid = w.getIndex();
        int tid = w.getTopic();
        m_clusterTopicWordStats[sampledCluster][tid][wid]++;
        m_clusterTopicStats[sampledCluster][tid]++;
    }
}
use of structures._Word in project IR_Base by Linda-sunshine.
the class sparseClusterDCMLDA_test method printParentTopicAssignment.
/**
 * Writes the cluster and per-word topic assignments of {@code d} to
 * {@code <document name>.txt} inside {@code topicFolder}.
 *
 * <p>The first line is {@code cluster<TAB><cluster index>}; it is followed by one
 * tab-terminated {@code feature:topic} entry per word, all on a single line.
 * The writer is closed (and thereby flushed) even if writing fails part-way.
 * A file that cannot be opened is reported via a stack trace and otherwise
 * ignored.
 *
 * @param d           document to dump; must be a {@code _Doc4SparseDCMLDA}
 * @param topicFolder directory that receives the output file
 */
protected void printParentTopicAssignment(_Doc d, File topicFolder) {
    _Doc4SparseDCMLDA DCMDoc = (_Doc4SparseDCMLDA) d;
    String topicAssignmentFile = DCMDoc.getName() + ".txt";

    // try-with-resources closes the writer on every exit path.
    try (PrintWriter pw = new PrintWriter(new File(topicFolder, topicAssignmentFile))) {
        pw.println("cluster\t" + DCMDoc.m_clusterIndicator);
        for (_Word w : DCMDoc.getWords()) {
            int index = w.getIndex();
            int topic = w.getTopic();
            String featureName = m_corpus.getFeature(index);
            pw.print(featureName + ":" + topic + "\t");
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    }
}
use of structures._Word in project IR_Base by Linda-sunshine.
the class sparseDCMLDA method sampleTopicAssignment.
/**
 * Redraws the topic of every word in {@code DCMDoc}.
 *
 * <p>For each word the current assignment is removed from the document-level
 * counts (and from {@code word_topic_sstat} when {@code m_collectCorpusStats}
 * is set), a weight {@code topicInDocProb(k, ...) * wordTopicProb(k, wid, DCMDoc)}
 * is computed for every topic whose {@code m_topicIndicator} flag is on, a topic
 * is drawn with probability proportional to those weights, and the counts are
 * restored for the drawn topic.
 *
 * <p>The drawn topic is the one whose weight absorbs the random draw. If no
 * topic can be drawn (all weights are zero, the random draw is exactly zero, or
 * the total is NaN) the word keeps its previous topic.
 *
 * @param DCMDoc document whose word-topic assignments are resampled in place
 */
protected void sampleTopicAssignment(_Doc4SparseDCMLDA DCMDoc) {
    for (_Word w : DCMDoc.getWords()) {
        int wid = w.getIndex();
        int oldTid = w.getTopic();

        // Take the word's current assignment out of the counts.
        DCMDoc.m_sstat[oldTid]--;
        DCMDoc.m_wordTopic_stat[oldTid][wid]--;
        if (m_collectCorpusStats) {
            word_topic_sstat[oldTid][wid]--;
        }

        double denominator = DCMDoc.m_alphaDoc;
        denominator += Utils.sumOfArray(DCMDoc.m_sstat);

        // Unnormalised weight per topic; disabled topics keep weight 0.
        double p = 0;
        for (int k = 0; k < number_of_topics; k++) {
            m_topicProbCache[k] = 0;
            if (!DCMDoc.m_topicIndicator[k]) {
                continue;
            }
            m_topicProbCache[k] = topicInDocProb(k, denominator, DCMDoc) * wordTopicProb(k, wid, DCMDoc);
            if (m_topicProbCache[k] < 0) {
                System.out.println("negative\t" + m_topicProbCache[k]);
            }
            p += m_topicProbCache[k];
        }

        p *= m_rand.nextDouble();

        // Subtract first, then record the topic: the topic whose weight uses up
        // the draw is the one selected. Incrementing the index after the
        // subtraction would shift every choice by one topic. Remembering the
        // last positive-weight topic also covers floating-point residue.
        int newTid = -1;
        for (int k = 0; k < number_of_topics && p > 0; k++) {
            p -= m_topicProbCache[k];
            if (m_topicProbCache[k] > 0) {
                newTid = k;
            }
        }
        if (newTid < 0) {
            // Nothing could be drawn: keep the previous assignment.
            newTid = oldTid;
        }

        w.setTopic(newTid);
        DCMDoc.m_sstat[newTid]++;
        DCMDoc.m_wordTopic_stat[newTid][wid]++;
        if (m_collectCorpusStats) {
            word_topic_sstat[newTid][wid]++;
        }
    }
}
use of structures._Word in project IR_Base by Linda-sunshine.
the class sparseDCMLDA_test method printParentTopicAssignment.
/**
 * Writes the per-word topic assignments of {@code d} to
 * {@code <document name>.txt} inside {@code topicFolder}.
 *
 * <p>The output is one tab-terminated {@code feature:topic} entry per word, all
 * on a single line. The writer is closed (and thereby flushed) even if writing
 * fails part-way. A file that cannot be opened is reported via a stack trace
 * and otherwise ignored.
 *
 * @param d           document to dump
 * @param topicFolder directory that receives the output file
 */
protected void printParentTopicAssignment(_Doc d, File topicFolder) {
    String topicAssignmentFile = d.getName() + ".txt";

    // try-with-resources closes the writer on every exit path.
    try (PrintWriter pw = new PrintWriter(new File(topicFolder, topicAssignmentFile))) {
        for (_Word w : d.getWords()) {
            int index = w.getIndex();
            int topic = w.getTopic();
            String featureName = m_corpus.getFeature(index);
            pw.print(featureName + ":" + topic + "\t");
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    }
}
Aggregations