Use of structures._Stn in project IR_Base by Linda-sunshine — class DocAnalyzer, method AnalyzeDocByStn.
/**
 * Analyzes a document sentence by sentence: tokenizes each sentence, builds a
 * per-sentence sparse feature vector, accumulates a document-level
 * bag-of-words vector, and registers the document with the corpus when it
 * meets the configured length and sentence-count thresholds.
 *
 * @param doc       the document to analyze; populated in place on success
 * @param sentences the document's raw text, pre-split into sentences
 * @return true if the document was accepted and added to the corpus;
 *         false if it was rejected and its feature statistics rolled back
 */
protected boolean AnalyzeDocByStn(_Doc doc, String[] sentences) {
    TokenizeResult result;
    int y = doc.getYLabel(), index = 0;
    // Document-level bag-of-words: feature index -> accumulated count.
    HashMap<Integer, Double> spVct = new HashMap<Integer, Double>();
    // sparse sentence feature vectors
    ArrayList<_Stn> stnList = new ArrayList<_Stn>();
    double stopwordCnt = 0, rawCnt = 0;
    for (String sentence : sentences) {
        // Three-step analysis: tokenize, normalize, stem.
        result = TokenizerNormalizeStemmer(sentence);
        // construct bag-of-word vector based on normalized tokens
        HashMap<Integer, Double> sentence_vector = constructSpVct(result.getTokens(), y, spVct);
        if (sentence_vector.size() > 2) {
            // Keep only sentences with more than two distinct features
            // (avoids empty/near-empty sentences).
            String[] posTags;
            if (m_tagger == null)
                posTags = null; // POS tagging is optional
            else
                posTags = m_tagger.tag(result.getRawTokens());
            stnList.add(new _Stn(index, Utils.createSpVct(sentence_vector), result.getRawTokens(), posTags, sentence));
            // Fold this sentence's features into the document-level vector.
            Utils.mergeVectors(sentence_vector, spVct);
            stopwordCnt += result.getStopwordCnt();
            rawCnt += result.getRawCnt();
        }
        index++; // sentence index advances even when a sentence is skipped
    }
    // The document must be long enough: enough distinct features AND enough
    // accepted sentences.
    if (spVct.size() >= m_lengthThreshold && stnList.size() >= m_stnSizeThreshold) {
        doc.createSpVct(spVct);
        doc.setStopwordProportion(stopwordCnt / rawCnt);
        doc.setSentences(stnList);
        m_corpus.addDoc(doc);
        m_classMemberNo[y]++;
        if (m_releaseContent)
            doc.clearSource(); // free the raw text once features are extracted
        return true;
    } else {
        // Document rejected: undo the feature statistics that constructSpVct
        // accumulated for this document/class label.
        rollBack(spVct, y);
        return false;
    }
}
Use of structures._Stn in project IR_Base by Linda-sunshine — class LDAGibbs4AC, method initTest.
/**
 * Prepares a parent document and all of its child documents for Gibbs-sampling
 * based testing: allocates per-sentence topic vectors on the parent,
 * initializes test-time topic assignments, and builds the sparse vectors used
 * during inference.
 *
 * @param sampleTestSet collector that receives every document prepared here
 * @param d             the document under test; must be a _ParentDoc
 */
protected void initTest(ArrayList<_Doc> sampleTestSet, _Doc d) {
    _ParentDoc parent = (_ParentDoc) d;

    // Every sentence of the parent needs its own topic-count vector.
    for (_Stn sentence : parent.getSentences())
        sentence.setTopicsVct(number_of_topics);

    // The parent itself holds out no words for perplexity (test length 0).
    parent.setTopics4GibbsTest(number_of_topics, d_alpha, 0);
    sampleTestSet.add(parent);
    parent.createSparseVct4Infer();

    // Each child holds out a fixed proportion of its words for perplexity.
    for (_ChildDoc child : parent.m_childDocs) {
        int heldOutLength = (int) (m_testWord4PerplexityProportion * child.getTotalDocLength());
        child.setTopics4GibbsTest(number_of_topics, d_alpha, heldOutLength);
        sampleTestSet.add(child);
        child.createSparseVct4Infer();
    }
}
Use of structures._Stn in project IR_Base by Linda-sunshine — class LDAGibbs4AC_test, method printTopKChild4StnWithHybridPro.
/**
 * Writes, for every sentence of every parent document in the training set,
 * the child documents scored by the hybrid (language-model + topic-model)
 * ranking, one line per sentence in the form
 * {@code sentenceIndex \t name:score \t name:score ...}.
 *
 * Fix: the writer is now managed by try-with-resources, so it is closed even
 * when ranking or writing throws; the original leaked the PrintWriter on any
 * exception because close() was only reached on the success path.
 *
 * @param filePrefix prefix for the output file ("topChild4Stn_hybridPro.txt"
 *                   is appended)
 * @param topK       kept for interface compatibility; not consulted here —
 *                   all children are printed in ranked order
 */
protected void printTopKChild4StnWithHybridPro(String filePrefix, int topK) {
    String topKChild4StnFile = filePrefix + "topChild4Stn_hybridPro.txt";
    try (PrintWriter pw = new PrintWriter(new File(topKChild4StnFile))) {
        m_LM.generateReferenceModel();
        for (_Doc d : m_trainSet) {
            if (d instanceof _ParentDoc) {
                _ParentDoc pDoc = (_ParentDoc) d;
                pw.println(pDoc.getName() + "\t" + pDoc.getSenetenceSize());
                for (_Stn stnObj : pDoc.getSentences()) {
                    HashMap<String, Double> likelihoodMap = rankChild4StnByHybridPro(stnObj, pDoc);
                    // Sentence indices are printed 1-based.
                    pw.print((stnObj.getIndex() + 1) + "\t");
                    for (Map.Entry<String, Double> e : sortHashMap4String(likelihoodMap, true)) {
                        pw.print(e.getKey());
                        pw.print(":" + e.getValue());
                        pw.print("\t");
                    }
                    pw.println();
                }
            }
        }
        pw.flush();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
Use of structures._Stn in project IR_Base by Linda-sunshine — class LDAGibbs4AC_test, method rankChild4StnByHybridPro.
/**
 * Scores every child document of the given parent against one sentence using
 * a hybrid of a smoothed language model and the learned topic model, mixed by
 * m_tau, then blends the resulting sentence log-likelihood with the cosine
 * similarity between the sentence's and the child's topic distributions
 * (again weighted by m_tau).
 *
 * @param stnObj the sentence to score children against
 * @param pDoc   the parent document whose children are ranked
 * @return map from child-document name to its hybrid score (higher = better)
 */
protected HashMap<String, Double> rankChild4StnByHybridPro(_Stn stnObj, _ParentDoc pDoc) {
    HashMap<String, Double> childLikelihoodMap = new HashMap<String, Double>();
    double smoothingMu = m_LM.m_smoothingMu;
    for (_ChildDoc cDoc : pDoc.m_childDocs) {
        double cDocLen = cDoc.getTotalDocLength();
        double stnLogLikelihood = 0;
        // Dirichlet-smoothing weight: shifts mass toward the reference model
        // for shorter documents.
        double alphaDoc = smoothingMu / (smoothingMu + cDocLen);
        _SparseFeature[] fv = cDoc.getSparse();
        _SparseFeature[] sv = stnObj.getFv();
        for (_SparseFeature svWord : sv) {
            double wordLikelihood = 0;
            int wid = svWord.getIndex();
            double stnVal = svWord.getValue();
            int featureIndex = Utils.indexOf(fv, wid);
            double docVal = 0;
            if (featureIndex != -1) {
                // Word occurs in the child document; otherwise count stays 0.
                docVal = fv[featureIndex].getValue();
            }
            // Language-model likelihood: smoothed mixture of the child's
            // empirical word frequency and the corpus reference probability.
            double LMLikelihood = (1 - alphaDoc) * docVal / cDocLen;
            LMLikelihood += alphaDoc * m_LM.getReferenceProb(wid);
            // Topic-model likelihood: sum over topics of
            // P(word | topic) * P(topic | child doc).
            double TMLikelihood = 0;
            for (int k = 0; k < number_of_topics; k++) {
                double wordPerTopicLikelihood = (word_topic_sstat[k][wid] / m_sstat[k]) * (topicInDocProb(k, cDoc) / (d_alpha * number_of_topics + cDocLen));
                TMLikelihood += wordPerTopicLikelihood;
            }
            // Interpolate the two models, then accumulate in log space
            // weighted by the word's count in the sentence.
            wordLikelihood = m_tau * LMLikelihood + (1 - m_tau) * TMLikelihood;
            wordLikelihood = Math.log(wordLikelihood);
            stnLogLikelihood += stnVal * wordLikelihood;
        }
        // Blend the generative score with topic-distribution similarity.
        double cosineSim = computeSimilarity(stnObj.m_topics, cDoc.m_topics);
        stnLogLikelihood = m_tau * stnLogLikelihood + (1 - m_tau) * cosineSim;
        childLikelihoodMap.put(cDoc.getName(), stnLogLikelihood);
    }
    return childLikelihoodMap;
}
Use of structures._Stn in project IR_Base by Linda-sunshine — class LDAGibbs4AC_test, method printParentTopicAssignment.
/**
 * Writes the per-word topic assignments of one parent document to
 * {@code <docName>.txt} inside the given folder, one line per sentence in
 * the form {@code sentenceIndex \t feature:topic \t feature:topic ...}.
 *
 * Fix: the writer is now managed by try-with-resources, so it is closed even
 * when writing throws mid-document; the original only closed it on the
 * success path. The auto-generated TODO in the catch block is removed.
 *
 * @param d           the (parent) document whose assignments are printed
 * @param topicFolder destination directory for the per-document file
 */
protected void printParentTopicAssignment(_Doc d, File topicFolder) {
    String topicAssignmentFile = d.getName() + ".txt";
    try (PrintWriter pw = new PrintWriter(new File(topicFolder, topicAssignmentFile))) {
        for (_Stn stnObj : d.getSentences()) {
            pw.print(stnObj.getIndex() + "\t");
            for (_Word w : stnObj.getWords()) {
                int index = w.getIndex();
                int topic = w.getTopic();
                // Map the feature index back to its vocabulary string.
                String featureName = m_corpus.getFeature(index);
                pw.print(featureName + ":" + topic + "\t");
            }
            pw.println();
        }
        pw.flush();
    } catch (FileNotFoundException e) {
        // Destination folder missing or file not writable; report and skip.
        e.printStackTrace();
    }
}
Aggregations