Search in sources :

Example 6 with structures._ChildDoc

use of structures._ChildDoc in project IR_Base by Linda-sunshine.

the class LDAGibbs4AC_test method separateTrainTest4Spam.

/**
 * Splits the corpus into a fixed test set and a training set.
 *
 * <p>Parent documents whose name appears in the hard-coded list below are added to
 * {@code m_testSet} (parents only; their child documents are not added here). Every
 * other parent document is added to {@code m_trainSet}, immediately followed by each
 * of its child documents. Corpus entries that are not {@code _ParentDoc} instances
 * are ignored by this scan.
 *
 * <p>Both sets are re-created on every call. The selected names, the average number
 * of child documents per test parent, and the final set sizes are printed to
 * standard output.
 */
public void separateTrainTest4Spam() {
    String parentFakeString = "448 348 294 329 317 212 327 127 262 148 307 139 40 325 224 234 233 430 357 78 191 150 424 206 125 484 293 73 456 111 141 68 106 183 215 402 209 159 34 156 280 265 458 65 32 118 352 105 404 66";
    ArrayList<String> parentFakeList = new ArrayList<String>();
    for (String parentName : parentFakeString.split(" ")) {
        parentFakeList.add(parentName);
        System.out.println("parent Name\t" + parentName);
    }

    m_trainSet = new ArrayList<_Doc>();
    m_testSet = new ArrayList<_Doc>();

    // Running total of child documents attached to the test parents.
    double totalCommentNum = 0;
    for (_Doc d : m_corpus.getCollection()) {
        if (!(d instanceof _ParentDoc)) {
            continue;
        }
        _ParentDoc pDoc = (_ParentDoc) d;
        if (parentFakeList.contains(d.getName())) {
            m_testSet.add(d);
            totalCommentNum += pDoc.m_childDocs.size();
        } else {
            // Training parents are stored together with all of their children,
            // parent first, in corpus order.
            m_trainSet.add(d);
            for (_ChildDoc cDoc : pDoc.m_childDocs) {
                m_trainSet.add(cDoc);
            }
        }
    }

    // Floating-point division: prints NaN when no parent matched the list.
    System.out.println("avg comments for parent doc in testSet\t" + totalCommentNum / m_testSet.size());
    System.out.println("m_testSet size\t" + m_testSet.size());
    System.out.println("m_trainSet size\t" + m_trainSet.size());
}
Also used : structures._ChildDoc(structures._ChildDoc) structures._Doc(structures._Doc) structures._ParentDoc(structures._ParentDoc) ArrayList(java.util.ArrayList)

Example 7 with structures._ChildDoc

use of structures._ChildDoc in project IR_Base by Linda-sunshine.

the class LDAGibbs4AC_test method rankChild4StnByLanguageModel.

/**
 * Scores each child document of {@code pDoc} by the log-likelihood of the sentence
 * {@code stnObj} under that child's smoothed unigram model.
 *
 * <p>For every feature {@code w} of the sentence the smoothed probability is
 * {@code (1 - a) * count(w, child) / len(child) + a * m_LM.getReferenceProb(w)} with
 * {@code a = mu / (mu + len(child))} and {@code mu = m_LM.m_smoothingMu}. The
 * sentence score is the sum over features of {@code value(w) * log(probability)}.
 *
 * <p>A child whose total length is zero has no counts to contribute; its probability
 * falls back to the reference probability alone, instead of evaluating
 * {@code 0.0 / 0} and turning the whole score into NaN.
 *
 * @param stnObj sentence whose sparse feature vector is being scored
 * @param pDoc   parent document whose {@code m_childDocs} are ranked
 * @return map from child document name to its log-likelihood for the sentence
 */
protected HashMap<String, Double> rankChild4StnByLanguageModel(_Stn stnObj, _ParentDoc pDoc) {
    HashMap<String, Double> childLikelihoodMap = new HashMap<String, Double>();
    double smoothingMu = m_LM.m_smoothingMu;
    // The sentence vector is the same for every child, so fetch it once.
    _SparseFeature[] sv = stnObj.getFv();
    for (_ChildDoc cDoc : pDoc.m_childDocs) {
        int cDocLen = cDoc.getTotalDocLength();
        _SparseFeature[] fv = cDoc.getSparse();
        boolean hasTokens = cDocLen > 0;
        // With no tokens all probability mass goes to the reference model.
        double alphaDoc = hasTokens ? smoothingMu / (smoothingMu + cDocLen) : 1.0;
        double stnLogLikelihood = 0;
        for (_SparseFeature svWord : sv) {
            int wid = svWord.getIndex();
            double stnVal = svWord.getValue();
            int featureIndex = Utils.indexOf(fv, wid);
            double docVal = 0;
            if (featureIndex != -1) {
                docVal = fv[featureIndex].getValue();
            }
            // Skip the document term for empty children to avoid 0.0 / 0 = NaN.
            double smoothingProb = hasTokens ? (1 - alphaDoc) * docVal / (cDocLen) : 0.0;
            smoothingProb += alphaDoc * m_LM.getReferenceProb(wid);
            stnLogLikelihood += stnVal * Math.log(smoothingProb);
        }
        childLikelihoodMap.put(cDoc.getName(), stnLogLikelihood);
    }
    return childLikelihoodMap;
}
Also used : structures._ChildDoc(structures._ChildDoc) HashMap(java.util.HashMap) structures._SparseFeature(structures._SparseFeature)

Example 8 with structures._ChildDoc

use of structures._ChildDoc in project IR_Base by Linda-sunshine.

the class LDAGibbs4AC_test method initTest4Spam.

/**
 * Initialises one parent document and all of its child documents for sampling and
 * appends them to {@code sampleTestSet}, parent first.
 *
 * <p>The parent receives its topic state via {@code setTopics4Gibbs} (second argument
 * {@code 0}) and each of its sentences gets a topic vector sized by
 * {@code number_of_topics}. Each child is initialised with
 * {@code setTopics4Gibbs_LDA(number_of_topics, d_alpha)}, appended to the sample set,
 * and then linked back to the parent.
 *
 * @param sampleTestSet list that receives the parent followed by its children
 * @param d             document to initialise; it is cast to {@code _ParentDoc}
 */
public void initTest4Spam(ArrayList<_Doc> sampleTestSet, _Doc d) {
    final _ParentDoc parent = (_ParentDoc) d;

    // Parent-level state first, then one topic vector per sentence.
    parent.setTopics4Gibbs(number_of_topics, 0);
    for (_Stn sentence : parent.getSentences()) {
        sentence.setTopicsVct(number_of_topics);
    }
    sampleTestSet.add(parent);

    // Children follow their parent in the sample set.
    for (_ChildDoc child : parent.m_childDocs) {
        child.setTopics4Gibbs_LDA(number_of_topics, d_alpha);
        sampleTestSet.add(child);
        child.setParentDoc(parent);
    }
}
Also used : structures._Stn(structures._Stn) structures._ChildDoc(structures._ChildDoc) structures._ParentDoc(structures._ParentDoc)

Example 9 with structures._ChildDoc

use of structures._ChildDoc in project IR_Base by Linda-sunshine.

the class LDAGibbs4AC_test method printTopKStn4Child.

/**
 * Writes, for every child document of every parent in {@code m_trainSet}, the
 * parent sentences ranked by {@code rankStn4ChildBySim}.
 *
 * <p>Output goes to {@code filePrefix + "topStn4Child.txt"}. Each parent contributes
 * a header line {@code name<TAB>childCount}, followed by one line per child:
 * {@code childName<TAB>key:value<TAB>key:value<TAB>...}, in the order returned by
 * {@code sortHashMap4Integer(stnSimMap, true)}.
 *
 * <p>Any exception is reported with {@code printStackTrace()} and not rethrown; the
 * writer is closed in all cases.
 *
 * @param filePrefix prefix prepended to the output file name
 * @param topK       currently not applied: every ranked sentence is written
 */
protected void printTopKStn4Child(String filePrefix, int topK) {
    String topKStn4ChildFile = filePrefix + "topStn4Child.txt";
    PrintWriter pw = null;
    try {
        pw = new PrintWriter(new File(topKStn4ChildFile));
        for (_Doc d : m_trainSet) {
            if (!(d instanceof _ParentDoc)) {
                continue;
            }
            _ParentDoc pDoc = (_ParentDoc) d;
            pw.println(pDoc.getName() + "\t" + pDoc.m_childDocs.size());
            for (_ChildDoc cDoc : pDoc.m_childDocs) {
                HashMap<Integer, Double> stnSimMap = rankStn4ChildBySim(pDoc, cDoc);
                pw.print(cDoc.getName() + "\t");
                for (Map.Entry<Integer, Double> e : sortHashMap4Integer(stnSimMap, true)) {
                    pw.print(e.getKey() + ":" + e.getValue() + "\t");
                }
                pw.println();
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // close() also flushes; doing it here releases the file even on failure.
        if (pw != null) {
            pw.close();
        }
    }
}
Also used : FileNotFoundException(java.io.FileNotFoundException) structures._ChildDoc(structures._ChildDoc) structures._Doc(structures._Doc) structures._ParentDoc(structures._ParentDoc) File(java.io.File) HashMap(java.util.HashMap) Map(java.util.Map) PrintWriter(java.io.PrintWriter)

Example 10 with structures._ChildDoc

use of structures._ChildDoc in project IR_Base by Linda-sunshine.

the class LDAGibbs4AC_test method initTest4Dynamical.

/**
 * Initialises a parent document for sampling and dynamically attaches at most
 * {@code commentNum} of its child documents.
 *
 * <p>The parent's {@code m_childDocs4Dynamic} list is reset, its topic state and
 * per-sentence topic vectors are initialised, and it is appended to
 * {@code sampleTestSet}. Then the first {@code commentNum} entries of
 * {@code m_childDocs} (in iteration order) are initialised, appended to the sample
 * set and registered through {@code addChildDoc4Dynamics}.
 *
 * @param sampleTestSet list that receives the parent followed by the chosen children
 * @param d             document to initialise; it is cast to {@code _ParentDoc}
 * @param commentNum    upper bound on the number of children to add; a value of
 *                      zero or less adds none
 */
public void initTest4Dynamical(ArrayList<_Doc> sampleTestSet, _Doc d, int commentNum) {
    final _ParentDoc parent = (_ParentDoc) d;
    parent.m_childDocs4Dynamic = new ArrayList<_ChildDoc>();
    parent.setTopics4Gibbs(number_of_topics, d_alpha);
    for (_Stn sentence : parent.getSentences()) {
        sentence.setTopicsVct(number_of_topics);
    }
    sampleTestSet.add(parent);

    // Count down the remaining quota instead of counting up to it.
    int remaining = commentNum;
    for (_ChildDoc child : parent.m_childDocs) {
        if (remaining <= 0) {
            break;
        }
        remaining--;
        child.setTopics4Gibbs_LDA(number_of_topics, d_alpha);
        sampleTestSet.add(child);
        parent.addChildDoc4Dynamics(child);
    }
}
Also used : structures._ChildDoc(structures._ChildDoc) structures._Stn(structures._Stn) structures._ParentDoc(structures._ParentDoc)

Aggregations

structures._ChildDoc (structures._ChildDoc): 77; structures._ParentDoc (structures._ParentDoc): 47; structures._Doc (structures._Doc): 35; structures._Stn (structures._Stn): 25; structures._Word (structures._Word): 22; File (java.io.File): 18; structures._ParentDoc4DCM (structures._ParentDoc4DCM): 16; structures._SparseFeature (structures._SparseFeature): 16; HashMap (java.util.HashMap): 14; PrintWriter (java.io.PrintWriter): 12; FileNotFoundException (java.io.FileNotFoundException): 11; structures._ChildDoc4BaseWithPhi (structures._ChildDoc4BaseWithPhi): 6; ArrayList (java.util.ArrayList): 5; Map (java.util.Map): 2; Feature (Classifier.supervised.liblinear.Feature): 1; FeatureNode (Classifier.supervised.liblinear.FeatureNode): 1; Model (Classifier.supervised.liblinear.Model): 1; Parameter (Classifier.supervised.liblinear.Parameter): 1; Problem (Classifier.supervised.liblinear.Problem): 1; SolverType (Classifier.supervised.liblinear.SolverType): 1