use of structures._ParentDoc4DCM in project IR_Base by Linda-sunshine.
the class DCMCorrLDA method updateBeta.
/**
 * Re-estimates the word prior {@code m_beta[tid]} of one topic by repeating a
 * multiplicative update until the largest per-word change in a pass is no
 * greater than {@code m_newtonConverge}.
 *
 * <p>Each pass accumulates digamma differences over the parent documents in
 * {@code m_trainSet}: one sum for the topic total (denominator) and one per
 * word (numerator). Every entry is then replaced by
 * {@code m_beta[tid][v] * numerator[v] / denominator + d_beta}. A word whose
 * accumulated {@code m_wordTopic_stat[tid][v]} count is zero gets a ratio of 0,
 * so its new value is exactly {@code d_beta}. If the topic has no counts at
 * all, {@code m_beta[tid]} is left untouched and the method returns.
 *
 * <p>NOTE(review): {@code m_totalBeta[tid]} is refreshed only at the start of a
 * pass, so on return it holds the sum from before the last update of
 * {@code m_beta[tid]} - confirm that callers recompute it if they need it.
 *
 * @param tid index of the topic whose prior is updated
 */
protected void updateBeta(int tid) {
    double diff;

    do {
        diff = 0;

        double wordNum4Tid = 0;
        double[] wordNum4Tid4V = new double[vocabulary_size];
        double totalBetaDenominator = 0;
        double[] totalBetaNumerator = new double[vocabulary_size];

        m_totalBeta[tid] = Utils.sumOfArray(m_beta[tid]);
        double digBeta4Tid = Utils.digamma(m_totalBeta[tid]);

        // m_beta[tid] does not change while the documents are scanned, so its
        // digamma values are computed once per pass instead of once per document.
        double[] digBeta4V = new double[vocabulary_size];
        for (int v = 0; v < vocabulary_size; v++) {
            digBeta4V[v] = Utils.digamma(m_beta[tid][v]);
        }

        for (_Doc d : m_trainSet) {
            if (!(d instanceof _ParentDoc)) {
                continue;
            }

            _ParentDoc4DCM pDoc = (_ParentDoc4DCM) d;
            totalBetaDenominator += Utils.digamma(m_totalBeta[tid] + pDoc.m_topic_stat[tid]) - digBeta4Tid;

            for (int v = 0; v < vocabulary_size; v++) {
                wordNum4Tid += pDoc.m_wordTopic_stat[tid][v];
                wordNum4Tid4V[v] += pDoc.m_wordTopic_stat[tid][v];

                totalBetaNumerator[v] += Utils.digamma(m_beta[tid][v] + pDoc.m_wordTopic_stat[tid][v]);
                totalBetaNumerator[v] -= digBeta4V[v];
            }
        }

        // No word is assigned to this topic anywhere: nothing to estimate.
        if (wordNum4Tid == 0) {
            return;
        }

        for (int v = 0; v < vocabulary_size; v++) {
            double deltaBeta = 0;
            if (wordNum4Tid4V[v] != 0) {
                deltaBeta = totalBetaNumerator[v] / totalBetaDenominator;
            }

            double newBeta = m_beta[tid][v] * deltaBeta + d_beta;

            // Track the largest absolute change of this pass for the stop test.
            double t_diff = Math.abs(m_beta[tid][v] - newBeta);
            if (t_diff > diff) {
                diff = t_diff;
            }

            m_beta[tid][v] = newBeta;
        }
    } while (diff > m_newtonConverge);
}
use of structures._ParentDoc4DCM in project IR_Base by Linda-sunshine.
the class DCMCorrLDA method sampleInChildDoc.
/**
 * Draws a new topic for every word of a child document.
 *
 * <p>For each word the current assignment is first removed from the parent
 * document's {@code m_wordTopic_stat} and {@code m_topic_stat} and from the
 * child's {@code m_sstat}. An unnormalised weight
 * {@code childWordByTopicProb * childTopicInDocProb} is then stored in
 * {@code m_topicProbCache} for each topic, a topic is picked by scanning the
 * cumulative weights against a uniform draw from {@code m_rand}, and the same
 * three counters are incremented for the chosen topic.
 *
 * @param d child document to resample; its {@code m_parentDoc} is cast to
 *          {@code _ParentDoc4DCM}
 */
protected void sampleInChildDoc(_ChildDoc d) {
    _ParentDoc4DCM parent = (_ParentDoc4DCM) d.m_parentDoc;

    for (_Word word : d.getWords()) {
        int oldTopic = word.getTopic();
        int wordIndex = word.getIndex();

        // Take the word's current assignment out of all counts.
        parent.m_wordTopic_stat[oldTopic][wordIndex]--;
        parent.m_topic_stat[oldTopic]--;
        d.m_sstat[oldTopic]--;

        // Unnormalised weight of every topic for this word.
        double mass = 0;
        for (int k = 0; k < number_of_topics; k++) {
            double weight = childWordByTopicProb(k, wordIndex, parent) * childTopicInDocProb(k, d, parent);
            m_topicProbCache[k] = weight;
            mass += weight;
        }

        // Walk the cumulative weights until the scaled draw is used up; if
        // that never happens, fall back to the last cache slot.
        double remaining = mass * m_rand.nextDouble();
        int newTopic = m_topicProbCache.length - 1;
        for (int k = 0; k < m_topicProbCache.length; k++) {
            remaining -= m_topicProbCache[k];
            if (remaining <= 0) {
                newTopic = k;
                break;
            }
        }

        // Record the new assignment.
        word.setTopic(newTopic);
        d.m_sstat[newTopic]++;
        parent.m_topic_stat[newTopic]++;
        parent.m_wordTopic_stat[newTopic][wordIndex]++;
    }
}
use of structures._ParentDoc4DCM in project IR_Base by Linda-sunshine.
the class DCMCorrLDA_multi_E_test method printWordTopicDistribution.
/**
 * Writes the per-topic word distribution of one document to
 * {@code <document name>.txt} inside the given folder.
 *
 * <p>One line is written per topic: the topic index and {@code m_topics[i]},
 * followed by the entries of a {@code MyPriorityQueue} created with capacity
 * argument {@code k} and fed every vocabulary word with its
 * {@code m_wordTopic_prob[i][v]} value (presumably keeping the k
 * highest-valued words - confirm against MyPriorityQueue). Values are passed
 * through {@code Math.exp} when {@code m_logSpace} is set.
 *
 * <p>The writer is closed in a {@code finally} block so the file handle is
 * released even if a runtime exception is thrown while writing. A
 * {@code FileNotFoundException} on opening the file is printed and otherwise
 * ignored, as before.
 *
 * @param d document to dump; cast to {@code _ParentDoc4DCM}
 * @param wordTopicDistributionFolder directory that receives the output file
 * @param k capacity argument passed to each per-topic priority queue
 */
protected void printWordTopicDistribution(_Doc d, File wordTopicDistributionFolder, int k) {
    _ParentDoc4DCM pDoc = (_ParentDoc4DCM) d;
    String wordTopicDistributionFile = pDoc.getName() + ".txt";

    try {
        PrintWriter pw = new PrintWriter(new File(wordTopicDistributionFolder, wordTopicDistributionFile));
        try {
            for (int i = 0; i < number_of_topics; i++) {
                MyPriorityQueue<_RankItem> fVector = new MyPriorityQueue<_RankItem>(k);
                for (int v = 0; v < vocabulary_size; v++) {
                    String featureName = m_corpus.getFeature(v);
                    double wordProb = pDoc.m_wordTopic_prob[i][v];
                    fVector.add(new _RankItem(featureName, wordProb));
                }

                pw.format("Topic %d(%.5f):\t", i, pDoc.m_topics[i]);
                for (_RankItem it : fVector) {
                    pw.format("%s(%.5f)\t", it.m_name, m_logSpace ? Math.exp(it.m_value) : it.m_value);
                }
                pw.write("\n");
            }
        } finally {
            // close() also flushes, so no separate flush() is needed.
            pw.close();
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    }
}
use of structures._ParentDoc4DCM in project IR_Base by Linda-sunshine.
the class DCMLDA4AC method sampleInParentDoc.
/**
 * Draws a new topic for every word of a parent document.
 *
 * <p>For each word the current assignment is removed from the document's
 * {@code m_sstat}, {@code m_topic_stat} and {@code m_wordTopic_stat}, and from
 * {@code word_topic_sstat} when {@code m_collectCorpusStats} is false. Each
 * topic then receives the unnormalised weight
 * {@code wordTopicProb * topicInDocProb} in {@code m_topicProbCache}, a topic
 * is picked by scanning the cumulative weights against a uniform draw from
 * {@code m_rand}, and the same counters are incremented for the chosen topic.
 *
 * @param d document to resample; cast to {@code _ParentDoc4DCM}
 */
protected void sampleInParentDoc(_Doc d) {
    _ParentDoc4DCM parent = (_ParentDoc4DCM) d;

    for (_Word word : parent.getWords()) {
        int oldTopic = word.getTopic();
        int wordIndex = word.getIndex();

        // Take the word's current assignment out of all counts.
        parent.m_sstat[oldTopic]--;
        parent.m_topic_stat[oldTopic]--;
        parent.m_wordTopic_stat[oldTopic][wordIndex]--;
        if (!m_collectCorpusStats) {
            word_topic_sstat[oldTopic][wordIndex]--;
        }

        // Unnormalised weight of every topic for this word.
        double mass = 0;
        for (int k = 0; k < number_of_topics; k++) {
            double weight = wordTopicProb(k, wordIndex, parent) * topicInDocProb(k, parent);
            m_topicProbCache[k] = weight;
            mass += weight;
        }

        // Walk the cumulative weights until the scaled draw is used up; if
        // that never happens, fall back to the last topic.
        double remaining = mass * m_rand.nextDouble();
        int newTopic = number_of_topics - 1;
        for (int k = 0; k < number_of_topics; k++) {
            remaining -= m_topicProbCache[k];
            if (remaining <= 0) {
                newTopic = k;
                break;
            }
        }

        // Record the new assignment.
        word.setTopic(newTopic);
        parent.m_sstat[newTopic]++;
        parent.m_topic_stat[newTopic]++;
        parent.m_wordTopic_stat[newTopic][wordIndex]++;
        if (!m_collectCorpusStats) {
            word_topic_sstat[newTopic][wordIndex]++;
        }
    }
}
use of structures._ParentDoc4DCM in project IR_Base by Linda-sunshine.
the class DCMLDA4AC method sampleInChildDoc.
/**
 * Draws a new topic for every word of a child document.
 *
 * <p>For each word the current assignment is removed from the parent
 * document's {@code m_wordTopic_stat} and {@code m_topic_stat}, from the
 * child's {@code m_sstat}, and from {@code word_topic_sstat} when
 * {@code m_collectCorpusStats} is false. Each topic then receives the
 * unnormalised weight {@code wordTopicProb * topicInDocProb} in
 * {@code m_topicProbCache}, a topic is picked by scanning the cumulative
 * weights against a uniform draw from {@code m_rand}, and every counter that
 * was decremented is incremented again for the chosen topic.
 *
 * @param d child document to resample; its {@code m_parentDoc} is cast to
 *          {@code _ParentDoc4DCM}
 */
protected void sampleInChildDoc(_ChildDoc d) {
    int wid, tid;
    double normalizedProb;
    _ParentDoc4DCM pDoc = (_ParentDoc4DCM) d.m_parentDoc;

    for (_Word w : d.getWords()) {
        tid = w.getTopic();
        wid = w.getIndex();

        // Take the word's current assignment out of all counts.
        pDoc.m_wordTopic_stat[tid][wid]--;
        pDoc.m_topic_stat[tid]--;
        d.m_sstat[tid]--;
        if (!m_collectCorpusStats) {
            word_topic_sstat[tid][wid]--;
        }

        // Unnormalised weight of every topic for this word.
        normalizedProb = 0;
        for (tid = 0; tid < number_of_topics; tid++) {
            double pWordTopic = wordTopicProb(tid, wid, pDoc);
            double pTopic = topicInDocProb(tid, d);

            m_topicProbCache[tid] = pWordTopic * pTopic;
            normalizedProb += m_topicProbCache[tid];
        }

        // Scan only the slots filled above (same bound as sampleInParentDoc),
        // so a cache longer than number_of_topics can never yield a stale topic.
        normalizedProb *= m_rand.nextDouble();
        for (tid = 0; tid < number_of_topics; tid++) {
            normalizedProb -= m_topicProbCache[tid];
            if (normalizedProb <= 0) {
                break;
            }
        }
        if (tid == number_of_topics) {
            tid--;
        }

        // Record the new assignment.
        w.setTopic(tid);
        d.m_sstat[tid]++;
        pDoc.m_topic_stat[tid]++;
        pDoc.m_wordTopic_stat[tid][wid]++;
        if (!m_collectCorpusStats) {
            // Must mirror the decrement above; the previous code decremented a
            // second time here, so the corpus counts lost 2 per sampled word.
            word_topic_sstat[tid][wid]++;
        }
    }
}
Aggregations