Search in sources :

Example 6 with structures._ParentDoc

use of structures._ParentDoc in project IR_Base by Linda-sunshine.

the class LDAGibbs4AC_test method rankChild4ParentByLikelihood.

/**
 * Scores a child document against a parent document.
 *
 * For every sparse feature of the parent (feature index {@code wid}), a mixture
 * is accumulated over all topics {@code k}:
 * {@code (word_topic_sstat[k][wid] / m_sstat[k])
 *        * ((cDoc.m_sstat[k] + d_alpha) / (d_alpha * number_of_topics + cDocLen))}.
 * The returned score is the sum over features of
 * {@code featureValue * log(mixture)}.
 *
 * @param cDoc child document supplying the per-topic statistics
 *             ({@code m_sstat}) and the total document length
 * @param pDoc parent document whose sparse features are scored
 * @return the accumulated, value-weighted log of the per-feature mixtures
 */
protected double rankChild4ParentByLikelihood(_ChildDoc cDoc, _ParentDoc pDoc) {
    // The denominator of the child-side factor does not depend on the topic,
    // so it is computed once up front.
    double childNormalizer = d_alpha * number_of_topics + cDoc.getTotalDocLength();

    double logLikelihood = 0;
    for (_SparseFeature feature : pDoc.getSparse()) {
        int wid = feature.getIndex();

        double mixture = 0;
        for (int topic = 0; topic < number_of_topics; topic++) {
            double wordFactor = word_topic_sstat[topic][wid] / m_sstat[topic];
            double childFactor = (cDoc.m_sstat[topic] + d_alpha) / childNormalizer;
            mixture += wordFactor * childFactor;
        }

        logLikelihood += feature.getValue() * Math.log(mixture);
    }
    return logLikelihood;
}
Also used : structures._SparseFeature(structures._SparseFeature)

Example 7 with structures._ParentDoc

use of structures._ParentDoc in project IR_Base by Linda-sunshine.

the class LDAGibbs4AC_test method discoverSpecificComments.

/**
 * Writes topic similarities between each parent document in the training set
 * and its child documents to a tab-separated text file.
 *
 * One line is written per parent document:
 * the parent name, a tab, then for every child document
 * {@code childName:docSimilarity} followed by
 * {@code :sentenceIndex+1:sentenceSimilarity} for each sentence of the parent,
 * and a trailing tab after each child.
 *
 * If the file cannot be opened the stack trace is printed and nothing is
 * written. The writer is always closed, including when an unexpected runtime
 * exception interrupts the loop, so the file handle is never leaked.
 *
 * @param similarityFile path of the output file to create or overwrite
 */
protected void discoverSpecificComments(String similarityFile) {
    System.out.println("topic similarity");
    PrintWriter pw = null;
    try {
        pw = new PrintWriter(new File(similarityFile));
        for (_Doc doc : m_trainSet) {
            // Only parent documents start a line; everything else is skipped.
            if (!(doc instanceof _ParentDoc)) {
                continue;
            }
            _ParentDoc pDoc = (_ParentDoc) doc;
            pw.print(doc.getName() + "\t");
            for (_ChildDoc cDoc : pDoc.m_childDocs) {
                pw.print(cDoc.getName() + ":");
                double docTopicSimilarity = computeSimilarity(pDoc.m_topics, cDoc.m_topics);
                pw.print(docTopicSimilarity);
                for (_Stn stnObj : doc.getSentences()) {
                    double stnTopicSimilarity = computeSimilarity(stnObj.m_topics, cDoc.m_topics);
                    // Sentence indices are reported 1-based in the output.
                    pw.print(":" + (stnObj.getIndex() + 1) + ":" + stnTopicSimilarity);
                }
                pw.print("\t");
            }
            pw.println();
        }
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } finally {
        // Close on every path so buffered output is flushed and the handle released.
        if (pw != null) {
            pw.close();
        }
    }
}
Also used : structures._ChildDoc(structures._ChildDoc) structures._Stn(structures._Stn) structures._Doc(structures._Doc) structures._ParentDoc(structures._ParentDoc) FileNotFoundException(java.io.FileNotFoundException) File(java.io.File) PrintWriter(java.io.PrintWriter)

Example 8 with structures._ParentDoc

use of structures._ParentDoc in project IR_Base by Linda-sunshine.

the class LDAGibbs4AC_test method separateTrainTest4Spam.

/**
 * Rebuilds {@code m_trainSet} and {@code m_testSet} from the corpus using a
 * fixed list of parent-document names.
 *
 * Parent documents whose name appears in the hard-coded list go to the test
 * set (parents only). All other parent documents go to the training set, each
 * one immediately followed by its child documents. Documents in the corpus
 * that are not parent documents are ignored by the first pass.
 *
 * The listed names, the average number of child documents per test parent,
 * and the final sizes of both sets are printed to standard output.
 */
public void separateTrainTest4Spam() {
    String parentFakeString = "448 348 294 329 317 212 327 127 262 148 307 139 40 325 224 234 233 430 357 78 191 150 424 206 125 484 293 73 456 111 141 68 106 183 215 402 209 159 34 156 280 265 458 65 32 118 352 105 404 66";
    ArrayList<String> fakeParentNames = new ArrayList<String>();
    for (String name : parentFakeString.split(" ")) {
        fakeParentNames.add(name);
        System.out.println("parent Name\t" + name);
    }

    ArrayList<_Doc> trainParents = new ArrayList<_Doc>();
    double testCommentTotal = 0;
    m_trainSet = new ArrayList<_Doc>();
    m_testSet = new ArrayList<_Doc>();

    // First pass: route every parent document to the test set or to the
    // list of training parents.
    for (_Doc d : m_corpus.getCollection()) {
        if (!(d instanceof _ParentDoc)) {
            continue;
        }
        if (!fakeParentNames.contains(d.getName())) {
            trainParents.add(d);
            continue;
        }
        m_testSet.add(d);
        testCommentTotal += ((_ParentDoc) d).m_childDocs.size();
    }
    double avgComments = testCommentTotal / m_testSet.size();
    System.out.println("avg comments for parent doc in testSet\t" + avgComments);

    // Second pass: each training parent is followed by its own children.
    for (_Doc parent : trainParents) {
        m_trainSet.add(parent);
        m_trainSet.addAll(((_ParentDoc) parent).m_childDocs);
    }

    System.out.println("m_testSet size\t" + m_testSet.size());
    System.out.println("m_trainSet size\t" + m_trainSet.size());
}
Also used : structures._ChildDoc(structures._ChildDoc) structures._Doc(structures._Doc) structures._ParentDoc(structures._ParentDoc) ArrayList(java.util.ArrayList)

Example 9 with structures._ParentDoc

use of structures._ParentDoc in project IR_Base by Linda-sunshine.

the class LDAGibbs4AC_test method rankChild4StnByLanguageModel.

/**
 * Scores every child document of a parent against one sentence using a
 * smoothed unigram language model built from the child document.
 *
 * For each feature {@code w} of the sentence, with sentence value
 * {@code stnVal}, the smoothed probability is
 * {@code (1 - alpha) * docVal / cDocLen + alpha * m_LM.getReferenceProb(w)},
 * where {@code alpha = mu / (mu + cDocLen)}, {@code mu} is
 * {@code m_LM.m_smoothingMu}, and {@code docVal} is the feature's value in the
 * child document (0 when absent). The child's score is the sum of
 * {@code stnVal * log(probability)}.
 *
 * A child document of total length 0 contributes no document term: the
 * probability falls back to {@code alpha * referenceProb}. Without this guard
 * the document term would be {@code 0.0 / 0}, i.e. NaN, and the whole score
 * for that child would be NaN.
 *
 * @param stnObj sentence whose features are scored
 * @param pDoc   parent document whose child documents are ranked
 * @return map from child document name to its log-likelihood score
 */
protected HashMap<String, Double> rankChild4StnByLanguageModel(_Stn stnObj, _ParentDoc pDoc) {
    HashMap<String, Double> childLikelihoodMap = new HashMap<String, Double>();
    double smoothingMu = m_LM.m_smoothingMu;
    for (_ChildDoc cDoc : pDoc.m_childDocs) {
        int cDocLen = cDoc.getTotalDocLength();
        _SparseFeature[] fv = cDoc.getSparse();
        double alphaDoc = smoothingMu / (smoothingMu + cDocLen);
        double stnLogLikelihood = 0;
        for (_SparseFeature svWord : stnObj.getFv()) {
            int wid = svWord.getIndex();
            int featureIndex = Utils.indexOf(fv, wid);
            double docVal = (featureIndex != -1) ? fv[featureIndex].getValue() : 0;
            // Guard the empty-document case: 0.0 / 0 would be NaN.
            double docTerm = (cDocLen > 0) ? (1 - alphaDoc) * docVal / cDocLen : 0;
            double smoothingProb = docTerm + alphaDoc * m_LM.getReferenceProb(wid);
            stnLogLikelihood += svWord.getValue() * Math.log(smoothingProb);
        }
        childLikelihoodMap.put(cDoc.getName(), stnLogLikelihood);
    }
    return childLikelihoodMap;
}
Also used : structures._ChildDoc(structures._ChildDoc) HashMap(java.util.HashMap) structures._SparseFeature(structures._SparseFeature)

Example 10 with structures._ParentDoc

use of structures._ParentDoc in project IR_Base by Linda-sunshine.

the class LDAGibbs4AC_test method initTest4Spam.

/**
 * Prepares one parent document and all of its child documents for sampling
 * and appends them to the given list.
 *
 * The parent gets {@code setTopics4Gibbs(number_of_topics, 0)}, each of its
 * sentences gets {@code setTopicsVct(number_of_topics)}, and each child gets
 * {@code setTopics4Gibbs_LDA(number_of_topics, d_alpha)} and has its parent
 * link set. The parent is appended first, followed by its children in
 * iteration order.
 *
 * @param sampleTestSet list that receives the parent and then its children
 * @param d             document to initialise; it is cast to
 *                      {@code _ParentDoc}, so any other type fails with a
 *                      {@code ClassCastException}
 */
public void initTest4Spam(ArrayList<_Doc> sampleTestSet, _Doc d) {
    _ParentDoc parent = (_ParentDoc) d;

    // Parent-level state, then one topic vector per sentence.
    parent.setTopics4Gibbs(number_of_topics, 0);
    for (_Stn sentence : parent.getSentences()) {
        sentence.setTopicsVct(number_of_topics);
    }
    sampleTestSet.add(parent);

    // Children follow their parent in the sample list.
    for (_ChildDoc child : parent.m_childDocs) {
        child.setTopics4Gibbs_LDA(number_of_topics, d_alpha);
        sampleTestSet.add(child);
        child.setParentDoc(parent);
    }
}
Also used : structures._Stn(structures._Stn) structures._ChildDoc(structures._ChildDoc) structures._ParentDoc(structures._ParentDoc)

Aggregations

structures._ParentDoc (structures._ParentDoc)72 structures._ChildDoc (structures._ChildDoc)50 structures._Doc (structures._Doc)39 structures._Stn (structures._Stn)30 File (java.io.File)29 PrintWriter (java.io.PrintWriter)22 FileNotFoundException (java.io.FileNotFoundException)20 HashMap (java.util.HashMap)17 structures._Word (structures._Word)17 structures._SparseFeature (structures._SparseFeature)14 structures._ChildDoc4BaseWithPhi (structures._ChildDoc4BaseWithPhi)8 Map (java.util.Map)7 ArrayList (java.util.ArrayList)6 structures._ParentDoc4DCM (structures._ParentDoc4DCM)4 IOException (java.io.IOException)2 ParseException (java.text.ParseException)2 JSONObject (json.JSONObject)2 Feature (Classifier.supervised.liblinear.Feature)1 FeatureNode (Classifier.supervised.liblinear.FeatureNode)1 Model (Classifier.supervised.liblinear.Model)1