Search in sources:

Example 6 with structures._Word

use of structures._Word in project IR_Base by Linda-sunshine.

the class sparseClusterDCMLDA method sampleTopicAssignment.

/**
 * Resamples the topic assignment of every word in {@code DCMDoc}.
 *
 * <p>For each word the current assignment is removed from the document-level
 * counters (and, when {@code m_collectCorpusStats} is set, from the corpus- and
 * cluster-level counters), an unnormalized weight
 * {@code topicInDocProb * wordTopicProb} is computed for every topic whose
 * {@code m_topicIndicator} entry is true, a topic is drawn proportionally to
 * those weights, and the counters are incremented for the drawn topic.
 *
 * <p>If no topic can be drawn (total weight is zero, negative or NaN) the word
 * keeps its previous topic, so the counters stay consistent instead of being
 * indexed with -1.
 *
 * @param DCMDoc document whose word-topic assignments are resampled in place
 */
protected void sampleTopicAssignment(_Doc4SparseDCMLDA DCMDoc) {
    int clusterIndex = DCMDoc.m_clusterIndicator;
    for (_Word w : DCMDoc.getWords()) {
        int wid = w.getIndex();
        int previousTid = w.getTopic();

        // Take the word out of all counters before computing the conditional.
        DCMDoc.m_sstat[previousTid]--;
        DCMDoc.m_wordTopic_stat[previousTid][wid]--;
        if (m_collectCorpusStats) {
            word_topic_sstat[previousTid][wid]--;
            m_clusterTopicStats[clusterIndex][previousTid]--;
            m_clusterTopicWordStats[clusterIndex][previousTid][wid]--;
        }

        double denominator = DCMDoc.m_alphaDoc;
        denominator += Utils.sumOfArray(DCMDoc.m_sstat);

        double p = 0;
        for (int k = 0; k < number_of_topics; k++) {
            m_topicProbCache[k] = 0;
            // Topics switched off for this document keep zero weight.
            if (!DCMDoc.m_topicIndicator[k]) {
                continue;
            }
            // Each factor is evaluated once and reused for both the sanity
            // check and the cached weight.
            double docTerm = topicInDocProb(k, denominator, DCMDoc);
            if (docTerm < 0) {
                System.out.println("negative1\t" + docTerm);
            }
            double wordTerm = wordTopicProb(k, wid, clusterIndex);
            if (wordTerm < 0) {
                System.out.println("negative2\t" + wordTerm);
            }
            m_topicProbCache[k] = docTerm * wordTerm;
            p += m_topicProbCache[k];
        }

        p *= m_rand.nextDouble();
        if (p <= 0) {
            System.out.println(p + "\t" + DCMDoc.getName() + "\t" + DCMDoc.m_indicatorTrue_stat);
        }

        // Walk the cumulative weights until the scaled draw is exhausted.
        int tid = -1;
        while (p > 0 && tid < number_of_topics - 1) {
            tid++;
            p -= m_topicProbCache[tid];
        }
        if (tid < 0) {
            // Nothing could be sampled (p <= 0 or NaN): keep the old assignment
            // rather than indexing the counters with -1.
            tid = previousTid;
        }

        w.setTopic(tid);
        DCMDoc.m_sstat[tid]++;
        DCMDoc.m_wordTopic_stat[tid][wid]++;
        if (m_collectCorpusStats) {
            word_topic_sstat[tid][wid]++;
            m_clusterTopicStats[clusterIndex][tid]++;
            m_clusterTopicWordStats[clusterIndex][tid][wid]++;
        }
    }
}
Also used : structures._Word(structures._Word)

Example 7 with structures._Word

use of structures._Word in project IR_Base by Linda-sunshine.

the class sparseClusterDCMLDA method sampleClusterIndex.

/**
 * Resamples the cluster indicator of document {@code d}.
 *
 * <p>The document's words are first removed from the counters of its current
 * cluster. For every cluster a log-scale score
 * {@code wordByClusterProb(d, c) + clusterProb(c)} is computed, the scores are
 * shifted by their maximum and exponentiated, and a cluster is drawn
 * proportionally to the resulting weights. Finally the document's words are
 * added to the counters of the drawn cluster.
 *
 * <p>If the draw falls through every cluster (for example because of
 * floating-point round-off or NaN scores) the diagnostics are printed and the
 * document stays in its previous cluster, so the counters are never indexed
 * out of bounds.
 *
 * @param d document to reassign; must be a {@code _Doc4SparseDCMLDA}
 */
public void sampleClusterIndex(_Doc d) {
    _Doc4SparseDCMLDA DCMDoc = (_Doc4SparseDCMLDA) d;
    int previousCluster = DCMDoc.m_clusterIndicator;

    // Remove the document from its current cluster before scoring.
    m_clusterStats[previousCluster]--;
    for (_Word w : DCMDoc.getWords()) {
        int wid = w.getIndex();
        int tid = w.getTopic();
        m_clusterTopicWordStats[previousCluster][tid][wid]--;
        m_clusterTopicStats[previousCluster][tid]--;
    }

    // Score every cluster once and track the maximum for a stable exp().
    Arrays.fill(m_clusterSamplingCache, 0);
    double maxClusterProb = 0;
    for (int c = 0; c < m_clusterNum; c++) {
        m_clusterSamplingCache[c] = wordByClusterProb(d, c) + clusterProb(c);
        if (c == 0 || m_clusterSamplingCache[c] > maxClusterProb) {
            maxClusterProb = m_clusterSamplingCache[c];
        }
    }

    // After subtracting the maximum every exponent is <= 0, so each weight
    // lies in [0, 1]; no overflow or underflow special-casing is required.
    double p = 0;
    for (int c = 0; c < m_clusterNum; c++) {
        m_clusterSamplingCache[c] = Math.exp(m_clusterSamplingCache[c] - maxClusterProb);
        p += m_clusterSamplingCache[c];
    }

    p *= m_rand.nextDouble();
    int clusterIndex;
    for (clusterIndex = 0; clusterIndex < m_clusterNum; clusterIndex++) {
        p -= m_clusterSamplingCache[clusterIndex];
        if (p <= 0) {
            break;
        }
    }
    if (clusterIndex >= m_clusterNum) {
        System.out.println("p\t" + p);
        for (int c = 0; c < m_clusterNum; c++) {
            System.out.println("c\t" + m_clusterSamplingCache[c]);
        }
        // No cluster absorbed the draw: keep the previous assignment instead
        // of using an out-of-range index below.
        clusterIndex = previousCluster;
    }

    // Register the document with the selected cluster.
    DCMDoc.m_clusterIndicator = clusterIndex;
    m_clusterStats[clusterIndex]++;
    for (_Word w : DCMDoc.getWords()) {
        int wid = w.getIndex();
        int tid = w.getTopic();
        m_clusterTopicWordStats[clusterIndex][tid][wid]++;
        m_clusterTopicStats[clusterIndex][tid]++;
    }
}
Also used : structures._Doc4SparseDCMLDA(structures._Doc4SparseDCMLDA) structures._Word(structures._Word)

Example 8 with structures._Word

use of structures._Word in project IR_Base by Linda-sunshine.

the class sparseClusterDCMLDA_test method printParentTopicAssignment.

/**
 * Writes the cluster indicator and the per-word topic assignments of
 * {@code d} to {@code <document name>.txt} inside {@code topicFolder}.
 *
 * <p>The first line is {@code cluster\t<index>}; it is followed by one
 * tab-terminated {@code feature:topic} entry per word. If the file cannot be
 * opened the stack trace is printed and nothing is written.
 *
 * @param d           document to dump; must be a {@code _Doc4SparseDCMLDA}
 * @param topicFolder directory that receives the output file
 */
protected void printParentTopicAssignment(_Doc d, File topicFolder) {
    _Doc4SparseDCMLDA DCMDoc = (_Doc4SparseDCMLDA) d;
    String topicAssignmentFile = DCMDoc.getName() + ".txt";
    PrintWriter pw = null;
    try {
        pw = new PrintWriter(new File(topicFolder, topicAssignmentFile));
        pw.println("cluster\t" + DCMDoc.m_clusterIndicator);
        for (_Word w : DCMDoc.getWords()) {
            int index = w.getIndex();
            int topic = w.getTopic();
            String featureName = m_corpus.getFeature(index);
            pw.print(featureName + ":" + topic + "\t");
        }
        pw.flush();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } finally {
        // Close in finally so the file handle is released even if the loop
        // above throws an unchecked exception.
        if (pw != null) {
            pw.close();
        }
    }
}
Also used : FileNotFoundException(java.io.FileNotFoundException) structures._Doc4SparseDCMLDA(structures._Doc4SparseDCMLDA) structures._Word(structures._Word) File(java.io.File) PrintWriter(java.io.PrintWriter)

Example 9 with structures._Word

use of structures._Word in project IR_Base by Linda-sunshine.

the class sparseDCMLDA method sampleTopicAssignment.

/**
 * Resamples the topic assignment of every word in {@code DCMDoc}.
 *
 * <p>For each word the current assignment is removed from the document-level
 * counters (and from {@code word_topic_sstat} when
 * {@code m_collectCorpusStats} is set), an unnormalized weight
 * {@code topicInDocProb * wordTopicProb} is computed for every topic whose
 * {@code m_topicIndicator} entry is true, a topic is drawn proportionally to
 * those weights, and the counters are incremented for the drawn topic.
 *
 * <p>If no topic can be drawn (total weight is zero, negative or NaN) the word
 * keeps its previous topic.
 *
 * @param DCMDoc document whose word-topic assignments are resampled in place
 */
protected void sampleTopicAssignment(_Doc4SparseDCMLDA DCMDoc) {
    for (_Word w : DCMDoc.getWords()) {
        int wid = w.getIndex();
        int previousTid = w.getTopic();

        // Take the word out of the counters before computing the conditional.
        DCMDoc.m_sstat[previousTid]--;
        DCMDoc.m_wordTopic_stat[previousTid][wid]--;
        if (m_collectCorpusStats) {
            word_topic_sstat[previousTid][wid]--;
        }

        double denominator = DCMDoc.m_alphaDoc;
        denominator += Utils.sumOfArray(DCMDoc.m_sstat);

        double p = 0;
        for (int k = 0; k < number_of_topics; k++) {
            m_topicProbCache[k] = 0;
            // Topics switched off for this document keep zero weight.
            if (!DCMDoc.m_topicIndicator[k]) {
                continue;
            }
            m_topicProbCache[k] = topicInDocProb(k, denominator, DCMDoc) * wordTopicProb(k, wid, DCMDoc);
            if (m_topicProbCache[k] < 0) {
                System.out.println("negative\t" + m_topicProbCache[k]);
            }
            p += m_topicProbCache[k];
        }

        p *= m_rand.nextDouble();

        // Advance to a topic first, then subtract its weight: the loop stops
        // on the topic whose interval contains the draw. (Subtracting before
        // incrementing would return the topic after the one that was hit.)
        int tid = -1;
        while (p > 0 && tid < number_of_topics - 1) {
            tid++;
            p -= m_topicProbCache[tid];
        }
        if (tid < 0) {
            // Nothing could be sampled (p <= 0 or NaN): keep the old assignment.
            tid = previousTid;
        }

        w.setTopic(tid);
        DCMDoc.m_sstat[tid]++;
        DCMDoc.m_wordTopic_stat[tid][wid]++;
        if (m_collectCorpusStats) {
            word_topic_sstat[tid][wid]++;
        }
    }
}
Also used : structures._Word(structures._Word)

Example 10 with structures._Word

use of structures._Word in project IR_Base by Linda-sunshine.

the class sparseDCMLDA_test method printParentTopicAssignment.

/**
 * Writes the per-word topic assignments of {@code d} to
 * {@code <document name>.txt} inside {@code topicFolder}.
 *
 * <p>The output is one tab-terminated {@code feature:topic} entry per word.
 * If the file cannot be opened the stack trace is printed and nothing is
 * written.
 *
 * @param d           document whose word-topic assignments are dumped
 * @param topicFolder directory that receives the output file
 */
protected void printParentTopicAssignment(_Doc d, File topicFolder) {
    String topicAssignmentFile = d.getName() + ".txt";
    PrintWriter pw = null;
    try {
        pw = new PrintWriter(new File(topicFolder, topicAssignmentFile));
        for (_Word w : d.getWords()) {
            int index = w.getIndex();
            int topic = w.getTopic();
            String featureName = m_corpus.getFeature(index);
            pw.print(featureName + ":" + topic + "\t");
        }
        pw.flush();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } finally {
        // Close in finally so the file handle is released even if the loop
        // above throws an unchecked exception.
        if (pw != null) {
            pw.close();
        }
    }
}
Also used : FileNotFoundException(java.io.FileNotFoundException) structures._Word(structures._Word) File(java.io.File) PrintWriter(java.io.PrintWriter)

Aggregations

structures._Word (structures._Word): 69, structures._ChildDoc (structures._ChildDoc): 18, File (java.io.File): 16, FileNotFoundException (java.io.FileNotFoundException): 15, PrintWriter (java.io.PrintWriter): 15, structures._ParentDoc (structures._ParentDoc): 14, structures._Doc (structures._Doc): 12, structures._Stn (structures._Stn): 11, structures._ParentDoc4DCM (structures._ParentDoc4DCM): 10, structures._ChildDoc4BaseWithPhi (structures._ChildDoc4BaseWithPhi): 9, HashMap (java.util.HashMap): 5, structures._Doc4DCMLDA (structures._Doc4DCMLDA): 4, structures._Doc4SparseDCMLDA (structures._Doc4SparseDCMLDA): 4, structures._SparseFeature (structures._SparseFeature): 3, Feature (Classifier.supervised.liblinear.Feature): 1, FeatureNode (Classifier.supervised.liblinear.FeatureNode): 1, Model (Classifier.supervised.liblinear.Model): 1, Parameter (Classifier.supervised.liblinear.Parameter): 1, Problem (Classifier.supervised.liblinear.Problem): 1, SolverType (Classifier.supervised.liblinear.SolverType): 1