Search in sources :

Example 6 with structures._Doc4DCMLDA

use of structures._Doc4DCMLDA in project IR_Base by Linda-sunshine.

the class DCMLDA method initialize_probability.

/**
 * Allocates the model's alpha/beta parameter storage, sets up the per-document
 * Gibbs sampling state, and then derives the initial parameter values.
 *
 * @param collection documents to initialise; every element is cast to
 *                   {@code _Doc4DCMLDA}
 */
@Override
protected void initialize_probability(Collection<_Doc> collection) {
    // Fresh parameter storage: one alpha per topic, one beta per (topic, term).
    m_totalAlpha = 0;
    m_alpha = new double[number_of_topics];
    m_alphaAuxilary = new double[number_of_topics];
    m_totalBeta = new double[number_of_topics];
    m_beta = new double[number_of_topics][vocabulary_size];

    // Let each document set up its own topic bookkeeping for Gibbs sampling.
    for (_Doc doc : collection) {
        _Doc4DCMLDA dcmDoc = (_Doc4DCMLDA) doc;
        dcmDoc.setTopics4Gibbs(number_of_topics, 0, vocabulary_size);
    }

    // Parameters are derived from the documents' state only after all of them are set up.
    initialAlphaBeta();
    imposePrior();
}
Also used : structures._Doc4DCMLDA(structures._Doc4DCMLDA) structures._Doc(structures._Doc)

Example 7 with structures._Doc4DCMLDA

use of structures._Doc4DCMLDA in project IR_Base by Linda-sunshine.

the class DCMLDA method calculate_log_likelihood.

/**
 * Accumulates the log-gamma terms of a single document's log-likelihood under
 * the current alpha and beta parameters.
 *
 * <p>The first part sums, over topics, {@code lgamma(m_sstat[k] + alpha[k]) - lgamma(alpha[k])}
 * and normalises with the alpha totals; the second part does the same per topic
 * over the vocabulary with the document's word-topic counts and beta.
 *
 * @param d the document to score; cast to {@code _Doc4DCMLDA}
 * @return the accumulated log-likelihood value for {@code d}
 */
protected double calculate_log_likelihood(_Doc d) {
    _Doc4DCMLDA dcmDoc = (_Doc4DCMLDA) d;
    double logLik = 0.0;

    // Topic-level terms driven by alpha.
    for (int topic = 0; topic < number_of_topics; topic++) {
        logLik += Utils.lgamma(d.m_sstat[topic] + m_alpha[topic]);
        logLik -= Utils.lgamma(m_alpha[topic]);
    }
    logLik += Utils.lgamma(m_totalAlpha);
    logLik -= Utils.lgamma(d.getTotalDocLength() + m_totalAlpha);

    // Word-level terms driven by beta, one pass per topic.
    for (int topic = 0; topic < number_of_topics; topic++) {
        double[] betaRow = m_beta[topic];
        for (int term = 0; term < vocabulary_size; term++) {
            logLik += Utils.lgamma(dcmDoc.m_wordTopic_stat[topic][term] + betaRow[term]);
            logLik -= Utils.lgamma(betaRow[term]);
        }
        logLik += Utils.lgamma(m_totalBeta[topic]);
        logLik -= Utils.lgamma(d.m_sstat[topic] + m_totalBeta[topic]);
    }

    return logLik;
}
Also used : structures._Doc4DCMLDA(structures._Doc4DCMLDA)

Example 8 with structures._Doc4DCMLDA

use of structures._Doc4DCMLDA in project IR_Base by Linda-sunshine.

the class DCMLDA method initialAlphaBeta.

/**
 * Derives initial alpha and beta values from the training documents' current
 * topic assignments.
 *
 * <p>Per topic, the documents' topic shares ({@code m_sstat[k] / doc length}) and
 * their squares are averaged over the training set; per (topic, term), the
 * documents' word shares within the topic are averaged likewise. Alpha and beta
 * are those averages offset by {@code d_alpha} and {@code d_beta}; the totals
 * {@code m_totalAlpha} and {@code m_totalBeta} are then recomputed.
 */
protected void initialAlphaBeta() {
    // Reset every accumulator before summing over the training set.
    Arrays.fill(m_sstat, 0);
    Arrays.fill(m_alphaAuxilary, 0);
    for (int topic = 0; topic < number_of_topics; topic++) {
        Arrays.fill(topic_term_probabilty[topic], 0);
    }

    for (_Doc doc : m_trainSet) {
        _Doc4DCMLDA dcmDoc = (_Doc4DCMLDA) doc;
        for (int topic = 0; topic < number_of_topics; topic++) {
            double topicShare = doc.m_sstat[topic] / doc.getTotalDocLength();
            m_sstat[topic] += topicShare;
            m_alphaAuxilary[topic] += topicShare * topicShare;

            // A topic with no words in this document contributes nothing to the word shares.
            if (dcmDoc.m_sstat[topic] != 0) {
                for (int term = 0; term < vocabulary_size; term++) {
                    double wordShare = dcmDoc.m_wordTopic_stat[topic][term] / dcmDoc.m_sstat[topic];
                    topic_term_probabilty[topic][term] += wordShare;
                }
            }
        }
    }

    // Turn the sums into averages and offset them to obtain alpha and beta.
    int docCount = m_trainSet.size();
    for (int topic = 0; topic < number_of_topics; topic++) {
        m_sstat[topic] /= docCount;
        m_alphaAuxilary[topic] /= docCount;
        m_alpha[topic] = m_sstat[topic] + d_alpha;

        for (int term = 0; term < vocabulary_size; term++) {
            topic_term_probabilty[topic][term] /= docCount;
            m_beta[topic][term] = topic_term_probabilty[topic][term] + d_beta;
        }
    }

    m_totalAlpha = Utils.sumOfArray(m_alpha);
    for (int topic = 0; topic < number_of_topics; topic++) {
        m_totalBeta[topic] = Utils.sumOfArray(m_beta[topic]);
    }
}
Also used : structures._Doc4DCMLDA(structures._Doc4DCMLDA) structures._Doc(structures._Doc)

Example 9 with structures._Doc4DCMLDA

use of structures._Doc4DCMLDA in project IR_Base by Linda-sunshine.

the class DCMLDA method initTestDoc.

/**
 * Prepares a test document for Gibbs sampling: clears its word-topic
 * probabilities, initialises its test-time topic assignments, and counts the
 * assigned (topic, word) pairs.
 *
 * @param d the test document; cast to {@code _Doc4DCMLDA}
 */
@Override
public void initTestDoc(_Doc d) {
    _Doc4DCMLDA testDoc = (_Doc4DCMLDA) d;

    // NOTE(review): this clears m_wordTopic_prob while the loop below increments
    // m_wordTopic_stat — presumably setTopics4GibbsTest resets the latter; confirm.
    for (int topic = 0; topic < number_of_topics; topic++) {
        Arrays.fill(testDoc.m_wordTopic_prob[topic], 0);
    }

    // The number of words passed on is a fixed proportion of the document length.
    testDoc.setTopics4GibbsTest(
            number_of_topics,
            0,
            (int) (m_testWord4PerplexityProportion * testDoc.getTotalDocLength()));

    // Record the assignment of every word returned by getWords().
    for (_Word word : testDoc.getWords()) {
        final int termId = word.getIndex();
        final int topicId = word.getTopic();
        testDoc.m_wordTopic_stat[topicId][termId]++;
    }
}
Also used : structures._Doc4DCMLDA(structures._Doc4DCMLDA) structures._Word(structures._Word)

Example 10 with structures._Doc4DCMLDA

use of structures._Doc4DCMLDA in project IR_Base by Linda-sunshine.

the class DCMLDA method calculate_E_step.

/**
 * Runs one Gibbs sampling sweep over the words of a document.
 *
 * <p>For each word the current topic assignment is removed from the counts, a
 * new topic is drawn with probability proportional to
 * {@code topicInDocProb(tid, doc) * wordTopicProb(tid, wid, doc)}, and the counts
 * are updated with the new assignment. Corpus-level counts in
 * {@code word_topic_sstat} are kept in sync only when
 * {@code m_collectCorpusStats} is set.
 *
 * @param d the document to resample; cast to {@code _Doc4DCMLDA}
 * @return always {@code 0}
 */
@Override
public double calculate_E_step(_Doc d) {
    _Doc4DCMLDA DCMDoc = (_Doc4DCMLDA) d;
    DCMDoc.permutation();
    double p;
    int wid, tid;
    for (_Word w : DCMDoc.getWords()) {
        wid = w.getIndex();
        tid = w.getTopic();
        // remove the word's topic assignment
        DCMDoc.m_sstat[tid]--;
        DCMDoc.m_wordTopic_stat[tid][wid]--;
        if (m_collectCorpusStats)
            word_topic_sstat[tid][wid]--;
        // compute the unnormalised sampling weight of every topic
        p = 0;
        for (tid = 0; tid < number_of_topics; tid++) {
            m_topicProbCache[tid] = topicInDocProb(tid, DCMDoc) * wordTopicProb(tid, wid, DCMDoc);
            p += m_topicProbCache[tid];
        }
        // Draw a point in [0, total weight) and walk the cumulative weights.
        // The topic index must be advanced BEFORE subtracting the next weight:
        // incrementing after the subtraction (as the previous code did) returns
        // the successor of the topic whose interval contains the draw, so topic 0
        // was never selected and every other topic inherited its predecessor's mass.
        p *= m_rand.nextDouble();
        tid = 0;
        p -= m_topicProbCache[tid];
        while (p > 0 && tid < number_of_topics - 1) {
            tid++;
            p -= m_topicProbCache[tid];
        }
        // assign the selected topic to word
        w.setTopic(tid);
        DCMDoc.m_sstat[tid]++;
        DCMDoc.m_wordTopic_stat[tid][wid]++;
        if (m_collectCorpusStats)
            word_topic_sstat[tid][wid]++;
    }
    return 0;
}
Also used : structures._Doc4DCMLDA(structures._Doc4DCMLDA) structures._Word(structures._Word)

Aggregations

structures._Doc4DCMLDA (structures._Doc4DCMLDA): 12, structures._Doc (structures._Doc): 4, structures._Word (structures._Word): 4, File (java.io.File): 2, FileNotFoundException (java.io.FileNotFoundException): 2, PrintWriter (java.io.PrintWriter): 2, MyPriorityQueue (structures.MyPriorityQueue): 2, structures._RankItem (structures._RankItem): 2