Example usage of structures._Word in project IR_Base by Linda-sunshine:
class LDA_Gibbs, method initialize_probability.
@Override
protected void initialize_probability(Collection<_Doc> collection) {
// Reset the corpus-level sufficient statistics to their Dirichlet priors:
// each topic-word count starts at beta, each topic total at beta * V.
for (int k = 0; k < number_of_topics; k++)
Arrays.fill(word_topic_sstat[k], d_beta);
Arrays.fill(m_sstat, d_beta * vocabulary_size);
Arrays.fill(m_topicProbCache, 0);

// Give every word a random topic assignment and accumulate the
// resulting counts into the topic-word statistics, i.e. p(w|z).
for (_Doc doc : collection) {
doc.setTopics4Gibbs(number_of_topics, d_alpha); // allocate memory and randomize it
for (_Word word : doc.getWords()) {
word_topic_sstat[word.getTopic()][word.getIndex()]++;
m_sstat[word.getTopic()]++;
}
}

imposePrior();
}
Example usage of structures._Word in project IR_Base by Linda-sunshine:
class LDA_Gibbs, method calculate_log_likelihood.
@Override
protected double calculate_log_likelihood(_Doc d) {
// Start from the document-level topic-proportion likelihood, then add
// the log-probability of each observed word under the current counts.
double logLikelihood = docThetaLikelihood(d);
double docSum = Utils.sumOfArray(d.m_sstat);
for (_Word word : d.getWords()) {
int wordId = word.getIndex();
int topicId = word.getTopic();
// p(w,z|d) = p(z|d) * p(w|z), both estimated from sufficient statistics
logLikelihood += Math.log(d.m_sstat[topicId] / docSum * word_topic_sstat[topicId][wordId] / m_sstat[topicId]);
}
return logLikelihood;
}
Example usage of structures._Word in project IR_Base by Linda-sunshine:
class LDA_Gibbs, method calculate_E_step.
@Override
public double calculate_E_step(_Doc d) {
// Visit the document's words in a fresh random order each sweep.
d.permutation();

for (_Word w : d.getWords()) {
int wid = w.getIndex();
int tid = w.getTopic();

// Exclude this word's current assignment from all statistics.
d.m_sstat[tid]--;
if (m_collectCorpusStats) {
word_topic_sstat[tid][wid]--;
m_sstat[tid]--;
}

// Build the unnormalized conditional p(z | w, d) over all topics.
double cumulative = 0;
for (int k = 0; k < number_of_topics; k++) {
m_topicProbCache[k] = topicInDocProb(k, d) * wordByTopicProb(k, wid);
cumulative += m_topicProbCache[k];
}

// Inverse-CDF sampling: draw a point in [0, cumulative) and walk
// the cache until the residual mass is exhausted.
double residual = cumulative * m_rand.nextDouble();
int newTid = -1;
while (residual > 0 && newTid < number_of_topics - 1) {
newTid++;
residual -= m_topicProbCache[newTid];
}

// Commit the sampled topic and restore the statistics.
w.setTopic(newTid);
d.m_sstat[newTid]++;
if (m_collectCorpusStats) {
word_topic_sstat[newTid][wid]++;
m_sstat[newTid]++;
}
}
return 0;
}
Example usage of structures._Word in project IR_Base by Linda-sunshine:
class LDA_Gibbs_test, method cal_logLikelihood_partial.
// Held-out log-likelihood over the document's test words:
// log p(w|d) with p(w|d) = sum_k theta_{d,k} * phi_{k,w}.
protected double cal_logLikelihood_partial(_Doc d) {
double docLogLikelihood = 0;
for (_Word word : d.getTestWords()) {
int wordId = word.getIndex();
// Marginalize the word probability over all topics.
double wordProb = 0;
for (int k = 0; k < number_of_topics; k++)
wordProb += d.m_topics[k] * topic_term_probabilty[k][wordId];
docLogLikelihood += Math.log(wordProb);
}
return docLogLikelihood;
}
Example usage of structures._Word in project IR_Base by Linda-sunshine:
class sparseLDA, method sampleTopicAssignment.
/**
 * Resamples the topic assignment of every word in {@code DCMDoc},
 * restricting the proposal to topics currently switched on by the
 * document's topic indicators (sparse LDA).
 */
protected void sampleTopicAssignment(_Doc4SparseDCMLDA DCMDoc) {
for (_Word w : DCMDoc.getWords()) {
int wid = w.getIndex();
int tid = w.getTopic();

// Exclude this word's current assignment from all statistics.
DCMDoc.m_sstat[tid]--;
if (m_collectCorpusStats) {
word_topic_sstat[tid][wid]--;
m_sstat[tid]--;
}

// This denominator is NOT a constant across words: it depends on
// DCMDoc.m_sstat, which was just decremented above, so it must be
// recomputed for every word.
double denominator = DCMDoc.m_alphaDoc + Utils.sumOfArray(DCMDoc.m_sstat);

// Build the unnormalized conditional over the ACTIVE topics only;
// inactive topics keep zero mass in the cache.
double p = 0;
for (int k = 0; k < number_of_topics; k++) {
m_topicProbCache[k] = 0;
if (!DCMDoc.m_topicIndicator[k])
continue;
m_topicProbCache[k] = topicInDocProb(k, denominator, DCMDoc) * wordByTopicProb(k, wid);
p += m_topicProbCache[k];
}

// Inverse-CDF sampling. BUGFIX: the previous walk incremented tid
// AFTER subtracting the mass (tid = 0; p -= cache[tid]; tid++;), so
// the selected topic was shifted one past the topic whose mass
// exhausted the draw — potentially landing on an inactive topic.
// Stop on the exhausting topic itself, as calculate_E_step does.
p *= m_rand.nextDouble();
tid = -1;
do {
tid++;
p -= m_topicProbCache[tid];
} while (p > 0 && tid < number_of_topics - 1);

// Commit the sampled topic and restore the statistics.
w.setTopic(tid);
DCMDoc.m_sstat[tid]++;
if (m_collectCorpusStats) {
word_topic_sstat[tid][wid]++;
m_sstat[tid]++;
}
}
}
Aggregations