Usage of structures._Doc4DCMLDA in the IR_Base project by Linda-sunshine: method initialize_probability of class DCMLDA.
@Override
protected void initialize_probability(Collection<_Doc> collection) {
    // Dirichlet hyper-parameters: one alpha per topic, one beta row per topic.
    m_alpha = new double[number_of_topics];
    m_totalAlpha = 0;
    m_alphaAuxilary = new double[number_of_topics];
    m_beta = new double[number_of_topics][vocabulary_size];
    m_totalBeta = new double[number_of_topics];

    // Allocate each document's sufficient statistics and randomize its
    // initial topic assignments for Gibbs sampling.
    for (_Doc doc : collection) {
        _Doc4DCMLDA dcmDoc = (_Doc4DCMLDA) doc;
        dcmDoc.setTopics4Gibbs(number_of_topics, 0, vocabulary_size);
    }

    // Moment-match alpha/beta from the initial counts, then apply any prior.
    initialAlphaBeta();
    imposePrior();
}
Usage of structures._Doc4DCMLDA in the IR_Base project by Linda-sunshine: method calculate_log_likelihood of class DCMLDA.
/**
 * Log-likelihood of one document under the DCM-LDA model: the sum of the
 * log Dirichlet-multinomial compound probabilities for the document's
 * topic proportions (alpha) and for each topic's word counts (beta).
 */
protected double calculate_log_likelihood(_Doc d) {
    _Doc4DCMLDA dcmDoc = (_Doc4DCMLDA) d;
    double logLikelihood = 0.0;

    // Topic-proportion part: prod_k Gamma(n_k + alpha_k) / Gamma(alpha_k)
    for (int k = 0; k < number_of_topics; k++) {
        logLikelihood += Utils.lgamma(d.m_sstat[k] + m_alpha[k]);
        logLikelihood -= Utils.lgamma(m_alpha[k]);
    }
    // Normalizer: Gamma(sum alpha) / Gamma(N + sum alpha)
    logLikelihood += Utils.lgamma(m_totalAlpha);
    logLikelihood -= Utils.lgamma(d.getTotalDocLength() + m_totalAlpha);

    // Topic-word part, one Dirichlet-multinomial term per topic.
    for (int k = 0; k < number_of_topics; k++) {
        for (int v = 0; v < vocabulary_size; v++) {
            logLikelihood += Utils.lgamma(dcmDoc.m_wordTopic_stat[k][v] + m_beta[k][v]);
            logLikelihood -= Utils.lgamma(m_beta[k][v]);
        }
        logLikelihood += Utils.lgamma(m_totalBeta[k]);
        logLikelihood -= Utils.lgamma(d.m_sstat[k] + m_totalBeta[k]);
    }

    return logLikelihood;
}
Usage of structures._Doc4DCMLDA in the IR_Base project by Linda-sunshine: method initialAlphaBeta of class DCMLDA.
// Moment-matching initialization of the Dirichlet hyper-parameters alpha
// and beta from the current topic assignments of the training set.
protected void initialAlphaBeta() {
// Reuse m_sstat as the running mean of per-document topic proportions,
// m_alphaAuxilary as the mean of their squares (second moment),
// and topic_term_probabilty as the mean per-topic word distribution.
Arrays.fill(m_sstat, 0);
Arrays.fill(m_alphaAuxilary, 0);
for (int k = 0; k < number_of_topics; k++) Arrays.fill(topic_term_probabilty[k], 0);
for (_Doc d : m_trainSet) {
_Doc4DCMLDA DCMDoc = (_Doc4DCMLDA) d;
for (int k = 0; k < number_of_topics; k++) {
// Proportion of this document's words assigned to topic k.
// NOTE(review): assumes m_sstat holds doubles (or getTotalDocLength()
// promotes the division to floating point) — confirm against _Doc.
double tempProb = d.m_sstat[k] / d.getTotalDocLength();
m_sstat[k] += tempProb;
m_alphaAuxilary[k] += tempProb * tempProb;
// Skip empty topics to avoid dividing by a zero topic count.
if (DCMDoc.m_sstat[k] == 0)
continue;
for (int v = 0; v < vocabulary_size; v++) {
// Fraction of topic k's words in this document that are word v.
tempProb = DCMDoc.m_wordTopic_stat[k][v] / DCMDoc.m_sstat[k];
topic_term_probabilty[k][v] += tempProb;
}
}
}
// Turn the accumulated sums into averages over the training set.
int trainSetSize = m_trainSet.size();
for (int k = 0; k < number_of_topics; k++) {
m_sstat[k] /= trainSetSize;
m_alphaAuxilary[k] /= trainSetSize;
for (int v = 0; v < vocabulary_size; v++) {
topic_term_probabilty[k][v] /= trainSetSize;
}
}
// Initialize each hyper-parameter as the empirical mean plus a smoothing
// offset (d_alpha / d_beta). m_alphaAuxilary (the second moment) is
// computed but not used here — presumably consumed by a later
// hyper-parameter update step; verify against the callers.
for (int k = 0; k < number_of_topics; k++) {
m_alpha[k] = m_sstat[k] + d_alpha;
for (int v = 0; v < vocabulary_size; v++) {
m_beta[k][v] = topic_term_probabilty[k][v] + d_beta;
}
}
// Cache the Dirichlet normalizers: sum of alphas, and per-topic beta sums.
m_totalAlpha = Utils.sumOfArray(m_alpha);
for (int k = 0; k < number_of_topics; k++) m_totalBeta[k] = Utils.sumOfArray(m_beta[k]);
}
Usage of structures._Doc4DCMLDA in the IR_Base project by Linda-sunshine: method initTestDoc of class DCMLDA.
@Override
public void initTestDoc(_Doc d) {
    _Doc4DCMLDA dcmDoc = (_Doc4DCMLDA) d;

    // Clear the per-document topic-word probability estimates.
    for (int k = 0; k < number_of_topics; k++)
        Arrays.fill(dcmDoc.m_wordTopic_prob[k], 0);

    // Hold out a fraction of the document's words for perplexity
    // evaluation and randomly assign topics to the rest.
    int testLength = (int) (m_testWord4PerplexityProportion * dcmDoc.getTotalDocLength());
    dcmDoc.setTopics4GibbsTest(number_of_topics, 0, testLength);

    // Rebuild the topic-word counts from the fresh assignments.
    // NOTE(review): assumes setTopics4GibbsTest resets m_wordTopic_stat
    // before this accumulation — confirm in _Doc4DCMLDA.
    for (_Word word : dcmDoc.getWords()) {
        dcmDoc.m_wordTopic_stat[word.getTopic()][word.getIndex()]++;
    }
}
Usage of structures._Doc4DCMLDA in the IR_Base project by Linda-sunshine: method calculate_E_step of class DCMLDA.
/**
 * One sweep of collapsed Gibbs sampling over the document's words: for each
 * word, remove its current topic from the counts, sample a new topic from the
 * full conditional, and add it back.
 */
@Override
public double calculate_E_step(_Doc d) {
    _Doc4DCMLDA DCMDoc = (_Doc4DCMLDA) d;
    DCMDoc.permutation(); // visit words in a random order

    double p;
    int wid, tid;
    for (_Word w : DCMDoc.getWords()) {
        wid = w.getIndex();
        tid = w.getTopic();

        // remove the word's current topic assignment from all counts
        DCMDoc.m_sstat[tid]--;
        DCMDoc.m_wordTopic_stat[tid][wid]--;
        if (m_collectCorpusStats)
            word_topic_sstat[tid][wid]--;

        // unnormalized full conditional p(z = k | rest) for every topic
        p = 0;
        for (tid = 0; tid < number_of_topics; tid++) {
            m_topicProbCache[tid] = topicInDocProb(tid, DCMDoc) * wordTopicProb(tid, wid, DCMDoc);
            p += m_topicProbCache[tid];
        }

        // Draw a topic proportional to m_topicProbCache.
        // BUGFIX: the previous loop (tid = 0; while (p > 0 ...) { p -= cache[tid]; tid++; })
        // subtracted before testing, so it exited with the index shifted up by
        // one: topic 0 was chosen only when nextDouble() returned exactly 0.
        // Increment first, then subtract, starting from -1.
        p *= m_rand.nextDouble();
        tid = -1;
        while (p >= 0 && tid < number_of_topics - 1) {
            tid++;
            p -= m_topicProbCache[tid];
        }

        // assign the sampled topic and restore the counts
        w.setTopic(tid);
        DCMDoc.m_sstat[tid]++;
        DCMDoc.m_wordTopic_stat[tid][wid]++;
        if (m_collectCorpusStats)
            word_topic_sstat[tid][wid]++;
    }
    return 0;
}
Aggregations